In [100]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from tslearn.metrics import cdist_dtw, dtw
from scipy.signal import correlate

In [111]:
len(pd.read_csv("data/polygon_data/NYBlocks_Period1.csv").date.unique())

186

In [102]:
def dtw_similarity(multi_ts, r):
    assert multi_ts.ndim == 2
    k = multi_ts.shape[1]
    out = np.zeros((k, k))
    for i in range(k):
        for j in range(i, k):
            d = dtw(multi_ts[:, i], multi_ts[:, j] , global_constraint="sakoe_chiba", sakoe_chiba_radius=r)
            out[i, j] = d
            out[j, i] = d
    return out

In [106]:
def save_dtw_dist_matrix(poly_division, time_interval, r):
    # load and normalize signals

    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = df.columns[2:]
    signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
    for i in range(signal_multi.shape[2]):
        mu = np.mean(signal_multi[:, :, i])
        sigma = np.std(signal_multi[:, :, i])
        m = np.max(signal_multi[:, :, i])

        if sigma == 0:
            sigma = 1

        if m == 0:
            m = 1

        #signal_multi[:, :, i] = (signal_multi[:, :, i] - mu) / sigma
        signal_multi[:, :, i] = signal_multi[:, :, i] / m

    # calculate dtw distance matrix
    geral_dist = np.zeros((signal_multi.shape[2], signal_multi.shape[2]))
    for i in tqdm(range(signal_multi.shape[0])):
        sample_dist = dtw_similarity(signal_multi[i], r)
        geral_dist += sample_dist   

    geral_dist /= signal_multi.shape[0] 

    df = []
    for i in range(geral_dist.shape[0]):
        for j in range(i, geral_dist.shape[1]):
            df.append({
                "row": features[i],
                "column": features[j],
                "value": geral_dist[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [97]:
def save_dtw_dist_matrix(poly_division, time_interval):
    # load and normalize signals

    coeffs = pd.read_csv(f"data/coeffs/{poly_division}_{time_interval}.csv")

    features = coeffs["type"].unique()

    ts = []
    for v in coeffs.type.unique():
        ts.append(coeffs[coeffs.type == v].pivot(index = "id_poly", columns = "date", values = ["mean_freq_3"]).values)
    ts = np.array(ts)
    ts = ts.transpose(1, 2, 0)

    print(ts.shape)
    signal_multi = ts



    # calculate dtw distance matrix
    geral_dist = np.zeros((signal_multi.shape[2], signal_multi.shape[2]))
    for i in tqdm(range(signal_multi.shape[0])):
        sample_dist = dtw_similarity(signal_multi[i]) #, r = 5)
        geral_dist += sample_dist   

    geral_dist /= signal_multi.shape[0] 

    df = []
    for i in range(geral_dist.shape[0]):
        for j in range(i, geral_dist.shape[1]):
            df.append({
                "row": features[i],
                "column": features[j],
                "value": geral_dist[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [105]:
save_dtw_dist_matrix("SpCenterCensus5k", "Period1", r = 6)
save_dtw_dist_matrix("SpCenterCensus5k", "Period2", r = 6)

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 5000/5000 [00:14<00:00, 341.66it/s]
  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 5000/5000 [00:08<00:00, 604.21it/s]


In [107]:
save_dtw_dist_matrix("SpCenterCensus5k", "Period2", r = 6)

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 5000/5000 [00:08<00:00, 605.84it/s]


In [4]:
save_dtw_dist_matrix("SpCenterCensus2k", "Period1")
save_dtw_dist_matrix("SpCenterCensus2k", "Period2")

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 2000/2000 [00:06<00:00, 296.64it/s]
  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 2000/2000 [00:03<00:00, 598.08it/s]


In [4]:
save_dtw_dist_matrix("NYBlocks", "Period1")

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 1168/1168 [00:02<00:00, 405.08it/s]


In [4]:
save_dtw_dist_matrix("BLACities", "Year")

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
100%|██████████| 773/773 [00:03<00:00, 245.36it/s]
