In [1]:
import pandas as pd
from scipy.signal import correlate

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
def calculate_correlation(poly_division, time_interval, max_lags = 5):
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    
    # create lagged columns
    df_new = df.copy()
    for lag in range(-max_lags, max_lags + 1):
        df_new = pd.merge(
            df_new,
            df.groupby("id_poly").apply(lambda x : x.set_index("date").shift(lag), include_groups=False).reset_index(),
            on=["id_poly", "date"],
            how = "inner",
            suffixes = ("", f"_lag_{lag}")
        )

    corr_matrix = df_new.iloc[:, 2:].corr()

    features = df.columns.tolist()[2:]

    result = []

    for i, feat_i in enumerate(features):
        correlations = corr_matrix[feat_i].to_dict()
        for j, feat_j in enumerate(features):
            

            correlations_values = [correlations[feat_j + f"_lag_{lag}"] for lag in range(-max_lags, max_lags + 1)]
            # get lag and value of maximum absolute correlation
            max_corr = max(correlations_values, key=abs)
            max_corr_lag = correlations_values.index(max_corr) - max_lags

            result.append({
                "row": feat_i,
                "column": feat_j,
                "value": max_corr,
                "lag": max_corr_lag
            })

            

    result = pd.DataFrame(result)
    result.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [17]:
calculate_correlation("SpCenterCensus5k", "Period2", 10)

In [18]:
calculate_correlation("BLACities", "Year", 10)

In [19]:
calculate_correlation("BLACities", "Year2", 10)

In [20]:
calculate_correlation("NYBlocks", "Period1", 10)

In [75]:
def calculate_correlation(poly_division, time_interval):
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = df.columns.tolist()[2:]
    df = df.iloc[:, 2:].copy()
    correlation = df.corr().values

    df = []
    for i in range(len(features)):
        for j in range(i, len(features)):
            
            df.append({
                "row": features[i],
                "column": features[j],
                "value": correlation[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [66]:
def create_lag_columns(df, lags, columns):
    df = df.sort_values("date")
    for col in columns:
        for lag in lags:
            df[f"{col}_lag_{lag}"] = df[col].shift(lag)

    df = df.dropna()
    return df

def calculate_correlation(poly_division, time_interval):
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = df.columns.tolist()[2:]
    new_df = []
    for id_poly in df["id_poly"].unique():
        df_poly = df[df["id_poly"] == id_poly]
        new_df.append(create_lag_columns(df_poly, range(-6, 6, 1), features))

    df = pd.concat(new_df)
    df = df.iloc[:, 2:].copy()
    correlation = df.corr()

    df = []
    for i in range(features):
        row = correlation[i]
        for j in range(features):
            
            df.append({
                "row": features[i],
                "column": features[j],
                "value": correlation[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [76]:
calculate_correlation("SpCenterCensus5k", "Period2")

In [77]:
calculate_correlation("BLACities", "Year")

In [78]:
calculate_correlation("BLACities", "Year2")

In [67]:
df = calculate_correlation("BLACities", "Year")

  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col].shift(lag)
  df[f"{col}_lag_{lag}"] = df[col]

In [4]:
from tqdm import tqdm
import numpy as np

def calculate_correlation(poly_division, time_interval):
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = df.columns.tolist()[2:]
    correlation_matrix = np.zeros((len(features), len(features)))
    n_poly = len(df.id_poly.unique())

    df = df.groupby(["date"]).mean().reset_index()
    for i in range(len(features)):
        for j in range(i + 1, len(features)):
            xcorr =  correlate(df[features[i]], df[features[j]], mode="same") 
            norm = (np.sqrt(np.sum(df[features[i]] ** 2) * (np.sum(df[features[j]] ** 2))))
            xcorr /= norm if norm != 0 else 1
            correlation_matrix[i, j] += np.max(xcorr)


    # for id_poly in tqdm(range(n_poly)):
    #     data = df[df.id_poly == id_poly].iloc[:, 2:].values

    #     for i in range(len(features)):
    #         for j in range(i + 1, len(features)):
    #             xcorr =  correlate(data[:, i], data[:, j], mode="same") 
    #             norm = (np.sqrt(np.sum(data[:, i] ** 2) * (np.sum(data[:, j] ** 2))))
    #             xcorr /= norm if norm != 0 else 1
    #             correlation_matrix[i, j] += np.max(xcorr)
        
    
    correlation_matrix /= n_poly

    df = []
    for i in range(correlation_matrix.shape[0]):
        for j in range(i, correlation_matrix.shape[1]):
            df.append({
                "row": features[i],
                "column": features[j],
                "value": correlation_matrix[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [86]:
from tqdm import tqdm
import numpy as np

def calculate_correlation(poly_division, time_interval):
    threshold = 0.9
    coeffs = pd.read_csv(f"data/coeffs/{poly_division}_{time_interval}.csv")

    features = coeffs["type"].unique()

    ts = []
    for v in coeffs.type.unique():
        ts.append(coeffs[coeffs.type == v].pivot(index = "id_poly", columns = "date", values = ["mean_freq_3"]).values)
    ts = np.array(ts)

    ts_ = ts.copy()
    ts_ = ts_.reshape(ts_.shape[0], -1)
    q = np.quantile(ts_, threshold, axis = 1)

    #ts = (ts > threshold).astype(int)
    ts = ts.transpose(1, 2, 0)
    ts = (ts > q[None, None, :]).astype(float)


    correlation_matrix = np.zeros((len(features), len(features)))

    for id_poly in tqdm(range(ts.shape[0])):
        data = ts[id_poly]

        for i in range(len(features)):
            row_i = data[:, i]
            if row_i.std() == 0:
                continue

            row_i = (row_i - row_i.mean()) / (row_i.std() * len(row_i))
            for j in range(len(features)):
                row_j = data[:, j]
                if row_j.std() == 0:
                    continue
                row_j = (row_j - row_j.mean()) / (row_j.std())
                xcorr =  correlate(row_i, row_j, mode="same")
                
                correlation_matrix[i, j] += np.abs(np.max(xcorr))
        
    
    correlation_matrix /= ts.shape[0]

    correlation_matrix = pd.DataFrame(correlation_matrix, columns = features, index = features)
    return correlation_matrix

    df = []
    for i in range(correlation_matrix.shape[0]):
        for j in range(i, correlation_matrix.shape[1]):
            df.append({
                "row": features[i],
                "column": features[j],
                "value": correlation_matrix[i, j]
            })
    df = pd.DataFrame(df)

    df.to_csv(f"data/similarity_matrix/{poly_division}_{time_interval}.csv", index=False)

In [87]:
d = calculate_correlation("BLACities", "Year")

100%|██████████| 773/773 [00:00<00:00, 1962.05it/s]


In [5]:
calculate_correlation("SpCenterCensus5k", "Period2")

In [20]:
calculate_correlation("SpCenterCensus5k", "Period1")
calculate_correlation("SpCenterCensus5k", "Period2")

100%|██████████| 5000/5000 [00:08<00:00, 573.35it/s]
100%|██████████| 5000/5000 [00:05<00:00, 907.65it/s]


In [21]:
calculate_correlation("SpCenterCensus2k", "Period1")
calculate_correlation("SpCenterCensus2k", "Period2")

100%|██████████| 2000/2000 [00:02<00:00, 802.43it/s]
100%|██████████| 2000/2000 [00:02<00:00, 977.86it/s] 


In [22]:
calculate_correlation("NYBlocks", "Period1")

100%|██████████| 1168/1168 [00:01<00:00, 867.89it/s]


In [23]:
calculate_correlation("BLACities", "Year")

100%|██████████| 773/773 [00:01<00:00, 750.54it/s]


In [24]:
calculate_correlation("BLACities", "Year2")

100%|██████████| 773/773 [00:00<00:00, 776.72it/s]
