In [1]:
from PIL import Image
from matplotlib.colors import rgb2hex
import pandas as pd
import geopandas as gpd
from sklearn.manifold import TSNE
from tslearn.metrics import dtw, cdist_dtw, cdist_soft_dtw, cdist_soft_dtw_normalized, soft_dtw
import glob
from tqdm import tqdm
import numpy as np

import torch
from torch.utils.data import DataLoader
import argparse

from ae import *

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
def get_colors_polygons(poly_division):
    """Give every polygon of a division a hex color sampled from a 2D colormap image.

    The polygons are read from ``data/shapefiles/{poly_division}.geojson`` and
    reprojected to EPSG:6933. Centroid coordinates are rescaled, independently
    per axis, onto the pixel grid of ``data/misc/bremm.png``; the pixel found
    under each centroid becomes that polygon's color.

    Parameters
    ----------
    poly_division : str
        File stem of the GeoJSON file describing the polygon division.

    Returns
    -------
    list of str
        One ``#rrggbb`` string per polygon, in the row order of the GeoJSON
        file. Empty when the file holds no polygons.
    """
    polygons = gpd.read_file(f"data/shapefiles/{poly_division}.geojson").to_crs("EPSG:6933")
    if len(polygons) == 0:
        return []

    points = polygons.centroid
    centroids = np.column_stack([points.x.to_numpy(), points.y.to_numpy()]).astype(float)

    # Load the colormap eagerly and release the file handle; force RGB so the
    # lookup always yields three channels regardless of the PNG's mode.
    with Image.open("data/misc/bremm.png") as img:
        colormap = np.array(img.convert("RGB"))
    height, width = colormap.shape[:2]

    # x is used as the image column and y as the image row, so each axis is
    # scaled to the last valid index along the matching image dimension.
    last_index = (width - 1, height - 1)
    for axis in range(2):
        column = centroids[:, axis]  # view: in-place ops update `centroids`
        column -= column.min()
        span = column.max()
        if span > 0:  # all centroids share this coordinate -> keep them at 0
            column /= span
        column *= last_index[axis]
    pixels = centroids.astype(int)

    return [rgb2hex(colormap[row, col] / 255) for col, row in pixels]

In [3]:
def get_dtw_matrix(ts, dtw_args=None):
    """Build the symmetric matrix of pairwise DTW distances between series.

    Parameters
    ----------
    ts : sequence
        Indexable collection of time series; ``ts[i]`` is passed to
        ``tslearn.metrics.dtw``.
    dtw_args : dict, optional
        Extra keyword arguments forwarded to ``dtw`` on every call.
        Defaults to no extra arguments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(ts), len(ts))`` where entry ``[i, j]`` is the DTW
        distance between ``ts[i]`` and ``ts[j]``.
    """
    # A fresh dict per call avoids sharing a mutable default between calls.
    if dtw_args is None:
        dtw_args = {}

    n = len(ts)
    dtw_matrix = np.zeros((n, n))
    for i in tqdm(range(n)):
        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i, n):
            distance = dtw(ts[i], ts[j], **dtw_args)
            dtw_matrix[i, j] = distance
            dtw_matrix[j, i] = distance
    return dtw_matrix

In [4]:
# Hyper-parameters shared by the autoencoder pipeline below. argparse serves
# only as an attribute container here: an empty argument list is parsed and
# every setting is attached to the resulting namespace afterwards.
parser = argparse.ArgumentParser()
args = parser.parse_args([])
vars(args).update(
    batch_size=32,
    embedding_dim=32,
    n_layers=1,
    device="cuda" if torch.cuda.is_available() else "cpu",
    dropout=False,
    use_bn=False,
    lr=0.001,
    epoch=200,
)


def projection_coeffs_deep(poly_division, time_interval, spatial = False):
    """Project polygons to 2D from autoencoder embeddings of their coefficient series.

    Steps:
      1. Read the coefficient CSV for ``poly_division`` / ``time_interval``.
      2. Build one multi-channel series per polygon from the ``mean_freq_3``
         column, with one channel per distinct value of the ``type`` column.
      3. Train an ``AutoencoderConv`` on a random 80/20 train/validation split.
      4. Encode every polygon, reduce the encodings to 2D with t-SNE and
         rescale the result into [0.025, 0.975] on both axes.
      5. Write the projection, polygon colors and per-feature temporal means
         to CSV.

    Parameters
    ----------
    poly_division : str
        Name of the polygon division (used in input and output file names).
    time_interval : str
        Name of the temporal aggregation (used in input and output file names).
    spatial : bool, default False
        When true, read from ``data/coeffs_spatial`` and write to
        ``data/projections_spatial`` instead of the non-spatial folders.

    Returns
    -------
    None
        The result is written to disk only.
    """
    coeffs_path = (
        f"data/coeffs_spatial/{poly_division}_{time_interval}.csv"
        if spatial
        else f"data/coeffs/{poly_division}_{time_interval}.csv"
    )
    coeffs = pd.read_csv(coeffs_path)

    colors = get_colors_polygons(poly_division)
    feature_names = coeffs["type"].unique()

    # One (n_polygons, n_dates) matrix per feature, stacked and reordered to
    # (n_polygons, n_features, n_dates).
    ts = np.array([
        coeffs[coeffs["type"] == feat].pivot(index = "id_poly", columns = "date", values = ["mean_freq_3"]).values
        for feat in feature_names
    ]).transpose(1, 0, 2)
    n_samples = len(ts)

    # NOTE(review): no seed is set in this notebook for the split, so the
    # trained model may differ between runs -- confirm whether that is intended.
    train_idx = np.random.choice(n_samples, int(0.8 * n_samples), replace = False)
    # Sorted complement of the training indices.
    val_idx = np.setdiff1d(np.arange(n_samples), train_idx)

    train_loader = DataLoader(
        [torch.tensor(ts[i], dtype = torch.float32) for i in train_idx],
        batch_size = args.batch_size,
        shuffle = False,
    )
    val_loader = DataLoader(
        [torch.tensor(ts[i], dtype = torch.float32) for i in val_idx],
        batch_size = args.batch_size,
        shuffle = False,
    )

    model = AutoencoderConv(
        input_dim = ts[0].shape[0],
        encoding_dim = args.embedding_dim,
        seq_len = ts[0].shape[1],
        h_dims = [16, 64],
        h_activ = nn.ReLU(),
        out_activ = nn.Identity()
    )

    train_model(
        model,
        train_loader,
        val_loader,
        args,
        verbose = False
    )

    # Encode every polygon (train and validation alike), in original order.
    full_loader = DataLoader(
        [torch.tensor(t, dtype = torch.float32) for t in ts],
        batch_size = args.batch_size,
        shuffle = False,
    )
    encodings = get_encodings(model, full_loader, args)

    tsne = TSNE(n_components=2, random_state=0)
    proj = tsne.fit_transform(encodings)
    # Rescale each axis to [0, 1], then shrink into [0.025, 0.975].
    proj = (proj - proj.min(axis=0)) / (proj.max(axis=0) - proj.min(axis=0))
    proj = proj * 0.95 + 0.025
    projections = pd.DataFrame(proj, columns = ["x", "y"])
    # NOTE(review): ids are positional; this presumes the pivot's sorted
    # id_poly values are 0..n-1 and match the GeoJSON row order used for
    # `colors` -- confirm against the data.
    projections["id_poly"] = np.arange(n_samples)
    projections["color"] = colors

    # Temporal mean of every feature for each polygon, one column per feature.
    mean_ts = ts.mean(axis = 2)
    for i, feat in enumerate(feature_names):
        projections[feat] = mean_ts[:, i]

    output_path = (
        f"data/projections_spatial/{poly_division}_{time_interval}.csv"
        if spatial
        else f"data/projections/{poly_division}_{time_interval}.csv"
    )
    projections.to_csv(output_path, index=False)

In [5]:
def projection_coeffs(poly_division, time_interval, spatial = False):
    """Write a per-feature 2D t-SNE projection of the polygons' coefficient series.

    For every distinct value of the ``type`` column, the ``mean_freq_3`` values
    are pivoted into one row per polygon and one column per date, embedded in
    2D with cosine-metric t-SNE and rescaled into [0.025, 0.975]. All feature
    projections are concatenated side by side and written to CSV together with
    a positional ``id_poly`` and the polygon colors.

    Parameters
    ----------
    poly_division : str
        Name of the polygon division (used in input and output file names).
    time_interval : str
        Name of the temporal aggregation (used in input and output file names).
    spatial : bool, default False
        When true, read from ``data/coeffs_spatial`` and write to
        ``data/projections_spatial`` instead of the non-spatial folders.

    Returns
    -------
    None
        The result is written to disk only.
    """
    coeffs_path = (
        f"data/coeffs_spatial/{poly_division}_{time_interval}.csv"
        if spatial
        else f"data/coeffs/{poly_division}_{time_interval}.csv"
    )
    table = pd.read_csv(coeffs_path)

    kept_columns = ["id_poly", "date", "type"]
    kept_columns += [name for name in table.columns if "3" in name]
    feature_names = table["type"].unique()
    table = table[kept_columns]

    colors = get_colors_polygons(poly_division)

    frames = []
    for feat in feature_names:
        series = (
            table[table["type"] == feat]
            .pivot(index = "id_poly", columns = "date", values = ["mean_freq_3"])
            .to_numpy()
        )

        # Cosine distance is the active metric; DTW / soft-DTW based variants
        # were tried earlier and are disabled.
        embedding = TSNE(n_components=2, random_state=0, metric="cosine").fit_transform(series)

        # Map each axis to [0, 1], then squeeze into [0.025, 0.975].
        lower = embedding.min(axis=0)
        upper = embedding.max(axis=0)
        embedding = (embedding - lower) / (upper - lower)
        embedding = embedding * 0.95 + 0.025

        frame = pd.DataFrame(embedding, columns=[f"{feat}_x", f"{feat}_y"])
        # NOTE(review): every feature adds a column with this same name, so the
        # concatenated frame ends up with duplicate "mean_coeff" columns.
        frame["mean_coeff"] = series.mean(axis=1)
        frames.append(frame)

    projections = pd.concat(frames, axis=1)
    projections["id_poly"] = np.arange(len(projections))
    projections["color"] = colors

    output_path = (
        f"data/projections_spatial/{poly_division}_{time_interval}.csv"
        if spatial
        else f"data/projections/{poly_division}_{time_interval}.csv"
    )
    projections.to_csv(output_path, index=False)

In [5]:
# Regenerate the t-SNE projection CSVs for every Sp* division at each time
# interval: first from the plain coefficients, then from the spatial ones.
for time_interval in ("Month", "Day", "3days", "5days"):
    for poly_division in ("SpGrid", "SpDistricts", "SpCenterCensus2k", "SpCenterCensus5k"):
        for spatial in (False, True):
            projection_coeffs(poly_division, time_interval, spatial)

In [5]:
# Same regeneration for the NYBlocks division: plain coefficients first, then
# the spatial ones, at each time interval.
for time_interval in ("Day", "3days", "5days"):
    for poly_division in ("NYBlocks",):
        for spatial in (False, True):
            projection_coeffs(poly_division, time_interval, spatial)

In [13]:
# Yearly t-SNE projections for BLACities: plain coefficients, then spatial.
for spatial in (False, True):
    projection_coeffs("BLACities", "Year", spatial)

In [28]:
# Autoencoder-based projection for SpCenterCensus5k / Period1 (plain
# coefficients only).
projection_coeffs_deep(poly_division="SpCenterCensus5k", time_interval="Period1")

In [6]:
# Autoencoder-based projections for SpCenterCensus5k / Period2: plain
# coefficients first, then the spatial ones.
for spatial in (False, True):
    projection_coeffs_deep("SpCenterCensus5k", "Period2", spatial)

In [None]:
# Autoencoder-based projections for BLACities / Year: plain coefficients, then
# spatial. These write to the same CSV paths as the projection_coeffs calls
# for BLACities / Year, so whichever runs last determines the files' contents.
for spatial in (False, True):
    projection_coeffs_deep("BLACities", "Year", spatial)