In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import wavelet_transform as wt
import signal_processing as sp
from create_graph import create_graph
import scipy.sparse


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
%matplotlib inline

In [4]:
def precompute_wavelet(poly_division, time_interval):
    df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
    features = df.columns[2:]
    try:
        adj_matrix = np.load(f"data/adj_matrix/{poly_division}.npy")
    except:
        adj_matrix = scipy.sparse.load_npz(f"data/adj_matrix/{poly_division}.npz").toarray()
    signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
    df_res = df[["id_poly", "date"]].copy()

    for k, feature in enumerate(features):
        signal = signal_multi[:, :, k]

        n_timestamps = signal_multi.shape[1]

        wav =  wt.WaveletTransform(
            adj_matrix,
            n_timestamps,
            graph_product="strong",
            n_filters=32,
            kernel="abspline",
            scaling_function=False,
            method="chebyshev",
            order_chebyshev=30,
        )
        coeffs = wav.transform(signal)
        coeffs = sp.get_scaled_coefficients(coeffs)
        # create column for coeffs in the df_res
        for i in range(coeffs.shape[-1]):
            df_res[f"{feature}_coeff_{i}"] = coeffs[:, :, i].flatten()
        df_res = df_res.copy()
        df_res[df_res.columns[2:]] = df_res[df_res.columns[2:]].astype(np.float32)
        df_res.to_csv(f"data/coeffs/{feature}_{poly_division}_{time_interval}.csv", index=False)
        df_res = df[["id_poly", "date"]].copy()

In [3]:
poly_division = "SpDistricts"
time_interval = "Month"
df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
features = df.columns[2:]
adj_matrix = np.load(f"data/adj_matrix/{poly_division}.npy")
signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
df_res = df[["id_poly", "date"]].copy()

for k, feature in enumerate(features):
    signal = signal_multi[:, :, k]

    n_timestamps = signal_multi.shape[1]

    wav =  wt.WaveletTransform(
        adj_matrix,
        n_timestamps,
        graph_product="strong",
        n_filters=32,
        kernel="abspline",
        scaling_function=False,
        method="chebyshev",
        order_chebyshev=30,
    )
    coeffs = wav.transform(signal)
    coeffs = sp.get_scaled_coefficients(coeffs)
    # create column for coeffs in the df_res
    for i in range(coeffs.shape[-1]):
        df_res[f"{feature}_coeff_{i}"] = coeffs[:, :, i].flatten()
    df_res = df_res.copy()
    df_res[df_res.columns[2:]] = df_res[df_res.columns[2:]].astype(np.float32)
    df_res.to_csv(f"data/coeffs/{feature}_{poly_division}_{time_interval}.csv", index=False)
    df_res = df[["id_poly", "date"]].copy()

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
  scaled_coefficients[:, i] = np.log(scaled_coefficients[:, i] + 1) / np.log(


In [4]:
poly_division = "SpCenterCensus5k"
time_interval = "Month"
df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
features = df.columns[2:]
adj_matrix = scipy.sparse.load_npz(f"data/adj_matrix/{poly_division}.npz").toarray()
# transform dataframe into a numpy array with "id_poly" as the first dimension and "date" as the second, with the rest of the columns as the third dimension
signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
df_res = df[["id_poly", "date"]].copy()

for k, feature in enumerate(features):
    signal = signal_multi[:, :, k]

    n_timestamps = signal_multi.shape[1]

    wav =  wt.WaveletTransform(
        adj_matrix,
        n_timestamps,
        graph_product="strong",
        n_filters=32,
        kernel="abspline",
        scaling_function=False,
        method="chebyshev",
        order_chebyshev=30,
    )
    coeffs = wav.transform(signal)
    coeffs = sp.get_scaled_coefficients(coeffs)
    # create column for coeffs in the df_res
    for i in range(coeffs.shape[-1]):
        df_res[f"{feature}_coeff_{i}"] = coeffs[:, :, i].flatten()
    df_res = df_res.copy()
    df_res[df_res.columns[2:]] = df_res[df_res.columns[2:]].astype(np.float32)
    df_res.to_csv(f"data/coeffs/{feature}_{poly_division}_{time_interval}.csv", index=False)
    df_res = df[["id_poly", "date"]].copy()


  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
  scaled_coefficients[:, i] = np.log(scaled_coefficients[:, i] + 1) / np.log(


In [5]:
poly_division = "SpCenterCensus10k"
time_interval = "Month"
df = pd.read_csv(f"data/polygon_data/{poly_division}_{time_interval}.csv")
features = df.columns[2:]
adj_matrix = scipy.sparse.load_npz(f"data/adj_matrix/{poly_division}.npz").toarray()
# transform dataframe into a numpy array with "id_poly" as the first dimension and "date" as the second, with the rest of the columns as the third dimension
signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
df_res = df[["id_poly", "date"]].copy()

for k, feature in enumerate(features):
    signal = signal_multi[:, :, k]
    print(feature, signal.min(), signal.mean(), signal.max())

    n_timestamps = signal_multi.shape[1]

    wav =  wt.WaveletTransform(
        adj_matrix,
        n_timestamps,
        graph_product="strong",
        n_filters=32,
        kernel="abspline",
        scaling_function=False,
        method="chebyshev",
        order_chebyshev=30,
    )
    coeffs = wav.transform(signal)
    coeffs = sp.get_scaled_coefficients(coeffs)
    # create column for coeffs in the df_res
    for i in range(coeffs.shape[-1]):
        df_res[f"{feature}_coeff_{i}"] = coeffs[:, :, i].flatten()
    df_res = df_res.copy()
    df_res[df_res.columns[2:]] = df_res[df_res.columns[2:]].astype(np.float32)
    df_res.to_csv(f"data/coeffs/{feature}_{poly_division}_{time_interval}.csv", index=False)
    df_res = df[["id_poly", "date"]].copy()

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)


WazeACCIDENT 0.0 0.49378560970667823 265.55482336217807
WazeHAZARD 0.0 0.0 0.0


  scaled_coefficients[:, i] = np.log(scaled_coefficients[:, i] + 1) / np.log(


WazeJAM 0.0 19.25316077316099 2631.8492545794334
WazeROADCLOSED 0.0 0.24126237374413284 1132.7395717650693
WazeWEATHERHAZARD 0.0 4.844267670127236 2520.3430561922164
FurtoCelular 0.0 1.734176990545231 1997.5697979730808
RouboCelular 0.0 1.257654187235618 5100.038004857221
temperature 15.58845897863108 20.418512996871296 24.429501333759912
precipitation 4.711483200245173 104.72131975036574 490.9125718429855


In [6]:
precompute_wavelet("SpGrid", "Month")

  signal_multi = np.stack(df.groupby("id_poly").apply(lambda x: x.sort_values("date").drop(columns=["id_poly", "date"]).values).values)
  scaled_coefficients[:, i] = np.log(scaled_coefficients[:, i] + 1) / np.log(
