In [4]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH
from itertools import zip_longest
from time import time
from collections import Counter
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

import numpy as np
import pandas as pd
import time

def euclidean_distance(x, y):
    """Compute the squared Euclidean distance between two points."""
    return (x - y) ** 2

def dtw_distance(seq_a, seq_b):
    """Compute DTW distance using Euclidean distance as cost function."""
    n, m = len(seq_a), len(seq_b)
    dtw_matrix = np.full((n + 1, m + 1), np.inf)
    dtw_matrix[0, 0] = 0

    # Compute DTW cost matrix
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = euclidean_distance(seq_a[i - 1], seq_b[j - 1])
            dtw_matrix[i, j] = cost + min(
                dtw_matrix[i - 1, j],    # Insertion
                dtw_matrix[i, j - 1],    # Deletion
                dtw_matrix[i - 1, j - 1] # Match
            )

    return dtw_matrix[n, m]  # Return the final DTW distance

# Load dataset
data = pd.read_csv("dtw_test.csv")  # Replace with actual filename

# Process each pair of time series
results = []
start_time = time.time()

for idx, row in data.iterrows():
    seq_a = np.array(eval(row['series_a']))  # Convert string to list
    seq_b = np.array(eval(row['series_b']))
    distance = dtw_distance(seq_a, seq_b)
    results.append([idx, distance])  # Ensure ID starts at 1

# Measure execution time
end_time = time.time()
total_time = end_time - start_time

# Save results to CSV
df_results = pd.DataFrame(results, columns=["id", "DTW distance"])
df_results.to_csv("dtw.csv", index=False)

print(f"Total time taken: {total_time:.4f} seconds")


Total time taken: 1559.7498 seconds
