In [1]:
import pandas as pd
from scipy.signal import correlate

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import time

In [3]:
def calculate_correlation(poly_division, time_interval, max_lags = 5):
    """Find, for every ordered pair of features, the lag with the strongest correlation.

    Reads ``data/polygon_data/{poly_division}_{time_interval}.csv``. The file
    must contain the key columns ``id_poly`` and ``date``; every other column
    is treated as a feature and must be numeric for ``DataFrame.corr``.

    For each lag in ``[-max_lags, max_lags]`` every feature is shifted by
    ``lag`` rows inside its own polygon and stored as ``<feature>_lag_<lag>``.
    A positive lag therefore pairs the "row" feature with the value the
    "column" feature had ``lag`` rows earlier in the same polygon.

    Shifting is positional: rows are assumed to already be in chronological
    order within each polygon, with one row per ``(id_poly, date)`` pair.

    Side effects (the function returns ``None``):

    * ``data/polygon_data/{poly_division}_{time_interval}_lagged.csv`` gets the
      input table plus all lagged columns.
    * ``data/similarity_matrix/{poly_division}_{time_interval}.csv`` gets one
      line per ordered feature pair with columns ``row``, ``column``,
      ``value`` (the correlation with the largest absolute value) and ``lag``
      (the lag where it occurs). When no lag yields a defined correlation,
      both ``value`` and ``lag`` are left empty.

    Args:
        poly_division: Name of the polygon division; first part of the file names.
        time_interval: Name of the time aggregation; second part of the file names.
        max_lags: Largest absolute lag (in rows) to evaluate; must be >= 0.

    Raises:
        ValueError: If ``max_lags`` is negative.
    """
    if max_lags < 0:
        raise ValueError("max_lags must be non-negative")

    key_cols = ["id_poly", "date"]
    lags = range(-max_lags, max_lags + 1)

    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = [col for col in df.columns if col not in key_cols]

    # Create the lagged columns. A group-wise shift keeps the original row
    # index, so all lagged frames line up with `df` in a single concat instead
    # of one groupby.apply + merge per lag.
    per_polygon = df.groupby("id_poly")[features]
    lagged_frames = [per_polygon.shift(lag).add_suffix(f"_lag_{lag}") for lag in lags]
    df_new = pd.concat([df[key_cols + features], *lagged_frames], axis=1)

    df_new.to_csv(f"data/polygon_data/{poly_division}_{time_interval}_lagged.csv", index=False)
    corr_matrix = df_new.drop(columns=key_cols).corr()

    records = []
    for feat_i in features:
        correlations = corr_matrix[feat_i].to_dict()
        for feat_j in features:
            if feat_i == feat_j:
                continue

            # Drop undefined correlations (too few overlapping rows, constant
            # column). A NaN in first position would otherwise win
            # max(..., key=abs), because every comparison against NaN is False.
            defined = {}
            for lag in lags:
                corr_value = correlations[f"{feat_j}_lag_{lag}"]
                if pd.notna(corr_value):
                    defined[lag] = corr_value

            if defined:
                # Ties resolve to the smallest lag (dict keeps insertion order).
                best_lag = max(defined, key=lambda lag: abs(defined[lag]))
                best_value = defined[best_lag]
            else:
                best_lag, best_value = None, float("nan")

            records.append({
                "row": feat_i,
                "column": feat_j,
                "value": best_value,
                "lag": best_lag
            })

    result = pd.DataFrame(records, columns=["row", "column", "value", "lag"])
    # Nullable integers keep lags written as "3" rather than "3.0" even when
    # some pairs have no defined lag.
    result["lag"] = result["lag"].astype("Int64")
    result.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [12]:
# Reads data/polygon_data/SpCenterCensus5k_Period2.csv and writes the lagged
# table and similarity matrix for it, scanning lags from -10 to 10.
calculate_correlation("SpCenterCensus5k", "Period2", 10)

Time taken: 0.34967970848083496


In [9]:
# Reads data/polygon_data/BLACities_Year.csv and writes the lagged table and
# similarity matrix for it, scanning lags from -10 to 10.
calculate_correlation("BLACities", "Year", 10)

Time taken: 1.4154365062713623


In [5]:
# Reads data/polygon_data/BLACities_Year2.csv and writes the lagged table and
# similarity matrix for it, scanning lags from -10 to 10.
calculate_correlation("BLACities", "Year2", 10)

In [4]:
# Reads data/polygon_data/NYBlocks_Period1.csv and writes the lagged table and
# similarity matrix for it, scanning lags from -10 to 10.
calculate_correlation("NYBlocks", "Period1", 10)