In [1]:
import pandas as pd
import numpy as np
from scipy.signal import correlate
from tqdm import tqdm

from scipy.stats import pearsonr

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [61]:
def calculate_correlation(poly_division, time_interval, max_lags = 5):
    """Average peak normalized cross-correlation between every pair of features.

    Reads ``data/polygon_data/{poly_division}_{time_interval}.csv``, which must
    contain an ``id_poly`` column, a ``date`` column and one numeric column per
    feature. For each polygon the feature series are ordered by ``date`` and
    standardized; for each ordered pair of distinct features the full
    cross-correlation is computed, the value of largest magnitude over all
    lags is kept and divided by the square root of the product of the two
    series' energies. The per-polygon values are averaged over polygons and
    written in long format to
    ``data/similarity_matrix/{poly_division}_{time_interval}.csv`` with the
    columns ``row``, ``column``, ``value`` and ``lag``.

    Note: this notebook defines ``calculate_correlation`` in more than one
    cell; whichever cell was executed last is the active definition.

    Parameters
    ----------
    poly_division : str
        First component of the input/output file names.
    time_interval : str
        Second component of the input/output file names.
    max_lags : int, default 5
        Unused by this variant (the peak is searched over every lag); kept so
        the signature matches the other definitions in the notebook.

    Returns
    -------
    None
        The result is written to disk.

    Raises
    ------
    ValueError
        From ``np.stack`` if the polygons do not all have the same number of
        rows (or if there are no polygons at all).
    """
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")

    # Select feature columns by name so labels and signal columns always agree.
    features = [col for col in df.columns if col not in ("id_poly", "date")]
    n_features = len(features)

    # Iterating the groupby (instead of groupby.apply) keeps the grouping
    # column handling independent of the pandas version. The float cast lets
    # integer-only inputs be standardized without a casting error.
    signal_multi = np.stack([
        group.sort_values("date")[features].to_numpy(dtype=float)
        for _, group in df.groupby("id_poly")
    ])

    dist_matrix = np.zeros((n_features, n_features))

    for sample in tqdm(signal_multi):
        # Standardize every column once, on a copy; 1e-6 guards constant columns.
        standardized = sample - sample.mean(axis=0)
        standardized /= standardized.std(axis=0) + 1e-6
        energy = np.sum(standardized ** 2, axis=0)

        for m in range(n_features):
            for n in range(n_features):
                if m == n:
                    continue
                xcorr = correlate(standardized[:, m], standardized[:, n], mode='full')
                # First value of maximum magnitude, sign preserved.
                peak = xcorr[np.argmax(np.abs(xcorr))]
                div = np.sqrt(energy[m] * energy[n])
                dist_matrix[m, n] += peak / (div if div != 0 else 1)

    dist_matrix /= len(signal_multi)

    result = pd.DataFrame(
        [
            {
                "row": feat_i,
                "column": feat_j,
                "value": dist_matrix[i, j],
                # Placeholder: this variant does not track which lag gave the peak.
                "lag": -777,
            }
            for i, feat_i in enumerate(features)
            for j, feat_j in enumerate(features)
            if i != j
        ],
        columns=["row", "column", "value", "lag"],
    )
    result.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [97]:
def calculate_correlation(poly_division, time_interval, max_lags = 5):
    """Average Pearson correlation between feature pairs via the Fisher z-transform.

    Reads ``data/polygon_data/{poly_division}_{time_interval}.csv``, which must
    contain an ``id_poly`` column, a ``date`` column and one numeric column per
    feature. For each polygon the feature series are ordered by ``date`` and
    the Pearson correlation of every pair of distinct features is computed.
    Correlations are mapped to z-space with ``arctanh``, averaged over all
    polygons and mapped back with ``tanh``. The result is written in long
    format to ``data/similarity_matrix/{poly_division}_{time_interval}.csv``
    with the columns ``row``, ``column``, ``value`` and ``lag``.

    A pair is skipped for a polygon (contributing 0 to the z-sum while the
    polygon still counts in the denominator) when either series is constant,
    contains NaN, or has fewer than two observations, because the Pearson
    correlation is undefined there.

    Note: this notebook defines ``calculate_correlation`` in more than one
    cell; whichever cell was executed last is the active definition.

    Parameters
    ----------
    poly_division : str
        First component of the input/output file names.
    time_interval : str
        Second component of the input/output file names.
    max_lags : int, default 5
        Unused by this variant (only the lag-0 correlation is computed); kept
        so the signature matches the other definitions in the notebook.

    Returns
    -------
    None
        The result is written to disk.

    Raises
    ------
    ValueError
        From ``np.stack`` if the polygons do not all have the same number of
        rows (or if there are no polygons at all).
    """
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")

    # Select feature columns by name so labels and signal columns always agree.
    features = [col for col in df.columns if col not in ("id_poly", "date")]
    n_features = len(features)

    # Iterating the groupby (instead of groupby.apply) keeps the grouping
    # column handling independent of the pandas version.
    signal_multi = np.stack([
        group.sort_values("date")[features].to_numpy(dtype=float)
        for _, group in df.groupby("id_poly")
    ])

    # |r| is clipped just below 1 so arctanh stays finite for perfectly
    # (anti-)correlated series instead of yielding +/-inf.
    r_limit = 1.0 - 1e-9
    z_sum = np.zeros((n_features, n_features))

    for sample in tqdm(signal_multi):
        if sample.shape[0] < 2:
            continue
        # A column is usable only if it actually varies; NaN spreads compare
        # False and are therefore excluded as well.
        spread = sample.max(axis=0) - sample.min(axis=0)
        usable = spread > 0

        for m in range(n_features):
            if not usable[m]:
                continue
            # Pearson r is symmetric, so each unordered pair is computed once.
            for n in range(m + 1, n_features):
                if not usable[n]:
                    continue
                r = pearsonr(sample[:, m], sample[:, n])[0]
                if not np.isfinite(r):
                    continue
                z = np.arctanh(np.clip(r, -r_limit, r_limit))
                z_sum[m, n] += z
                z_sum[n, m] += z

    # Back-transform every entry (negative ones included) to correlation space.
    dist_matrix = np.tanh(z_sum / len(signal_multi))

    result = pd.DataFrame(
        [
            {
                "row": feat_i,
                "column": feat_j,
                "value": dist_matrix[i, j],
                # Placeholder: this variant does not estimate a lag.
                "lag": -777,
            }
            for i, feat_i in enumerate(features)
            for j, feat_j in enumerate(features)
            if i != j
        ],
        columns=["row", "column", "value", "lag"],
    )
    result.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [2]:
def calculate_correlation(poly_division, time_interval, max_lags = 5):
    """Strongest lagged Pearson correlation between every pair of features.

    Reads ``data/polygon_data/{poly_division}_{time_interval}.csv``, which must
    contain an ``id_poly`` column, a ``date`` column and one numeric column per
    feature. For every lag ``k`` in ``[-max_lags, max_lags]`` each feature is
    shifted by ``k`` rows within its polygon (rows ordered by ``date``) and
    stored as ``<feature>_lag_<k>``. The widened table is written to
    ``data/polygon_data/{poly_division}_{time_interval}_lagged.csv``.

    For each ordered pair of distinct features ``(row, column)`` the Pearson
    correlations between ``row`` and every ``column_lag_k`` (pooled over all
    polygons) are compared, and the one with the largest absolute value is
    written, together with its lag, to
    ``data/similarity_matrix/{poly_division}_{time_interval}.csv`` with the
    columns ``row``, ``column``, ``value`` and ``lag``.

    Lags whose correlation is undefined (NaN, e.g. when the shift leaves no
    overlapping observations) are ignored. If every lag is undefined, the value
    is NaN and the reported lag is ``-max_lags``.

    Parameters
    ----------
    poly_division : str
        First component of the input/output file names.
    time_interval : str
        Second component of the input/output file names.
    max_lags : int, default 5
        Largest shift, in rows, applied in either direction. ``0`` evaluates
        only the unshifted correlation.

    Returns
    -------
    None
        Both results are written to disk.

    Raises
    ------
    ValueError
        If ``max_lags`` is negative.
    """
    if max_lags < 0:
        raise ValueError(f"max_lags must be non-negative, got {max_lags}")

    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = [col for col in df.columns if col not in ("id_poly", "date")]
    lags = list(range(-max_lags, max_lags + 1))

    # Shift in date order within each polygon, then realign to the file's row
    # order so the lagged table keeps the same row layout as the input.
    by_polygon = (
        df.sort_values("date", kind="stable")
        .groupby("id_poly", sort=False)[features]
    )
    lagged_frames = [
        by_polygon.shift(lag).add_suffix(f"_lag_{lag}").reindex(df.index)
        for lag in lags
    ]
    df_new = pd.concat([df, *lagged_frames], axis=1)

    df_new.to_csv(f"data/polygon_data/{poly_division}_{time_interval}_lagged.csv", index=False)
    corr_matrix = df_new.drop(columns=["id_poly", "date"]).corr()

    result = []
    for feat_i in features:
        corr_with_i = corr_matrix[feat_i]
        for feat_j in features:
            if feat_i == feat_j:
                continue

            values = np.array(
                [corr_with_i[f"{feat_j}_lag_{lag}"] for lag in lags],
                dtype=float,
            )

            # Pick the largest |r| while ignoring undefined lags; a plain
            # max(..., key=abs) returns NaN whenever the first entry is NaN.
            if np.isnan(values).all():
                best = 0
            else:
                best = int(np.nanargmax(np.abs(values)))

            result.append({
                "row": feat_i,
                "column": feat_j,
                "value": float(values[best]),
                "lag": lags[best],
            })

    result = pd.DataFrame(result, columns=["row", "column", "value", "lag"])
    result.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [6]:
# Build the similarity table for the SpCenterCensus5k polygons, Period2, with max_lags=0.
calculate_correlation(
    poly_division="SpCenterCensus5k",
    time_interval="Period2",
    max_lags=0,
)

In [7]:
# Build the similarity table for the BLACities polygons, Year2, with max_lags=0.
calculate_correlation(
    poly_division="BLACities",
    time_interval="Year2",
    max_lags=0,
)

In [8]:
# Build the similarity table for the NYBlocks polygons, Period1, with max_lags=0.
calculate_correlation(
    poly_division="NYBlocks",
    time_interval="Period1",
    max_lags=0,
)