In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, chisquare
from scipy.stats.contingency import expected_freq
from scipy.stats import kstest
from scipy.stats import nbinom
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import geopandas as gpd
import folium
from shapely.geometry import Point
import itertools
from google.colab import drive
import os  # used only to probe which candidate data folder exists

# Mount Google Drive so the project data files become reachable under /content/drive.
drive.mount('/content/drive')

# Candidate locations of the shared project folder
# (presumably one per collaborator's Drive layout - TODO confirm).
TOM_FILES_DIR = "/content/drive/My Drive/Colab Notebooks/data science- transportation research/"
MAX_FILES_DIR = "/content/drive/My Drive/data science- transportation research/"

# Pick the first candidate that actually exists in the mounted Drive, so the
# notebook runs for either layout without editing this cell. If neither exists
# we keep the previous default (TOM_FILES_DIR) and let the first read fail loudly.
FILES_DIR = next(
    (candidate for candidate in (TOM_FILES_DIR, MAX_FILES_DIR) if os.path.isdir(candidate)),
    TOM_FILES_DIR,
)

Mounted at /content/drive


In [None]:
# Read the large CSV in fixed-size pieces and stitch them together afterwards,
# printing a progress line every 1000 chunks.
chunk_size = 3000
chunks = []

csv_path = f"{FILES_DIR}final_df.csv"
for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size)):
    if i % 1000 == 0:
        print(f"iteration number {i}")
    chunks.append(chunk)

df = pd.concat(chunks)

# IS_SIGNED mixes the string codes '1' / '0' / 'NO'; collapse them to a compact 0/1 integer flag.
is_signed_codes = {'1': 1, '0': 0, 'NO': 0}
df["IS_SIGNED"] = df["IS_SIGNED"].replace(is_signed_codes).astype(np.int8)

iteration number 0
iteration number 1000
iteration number 2000
iteration number 3000
iteration number 4000
iteration number 5000


In [None]:
# Drop the leftover CSV index column and the raw geometry column; neither is kept in the merged frame.
small_df = df.drop(["Unnamed: 0", "the_geom"], axis="columns")

In [None]:
# Release the full frame and its source chunks; only small_df is needed from here on.
del df, chunks

In [None]:
# Load the street centerline GeoJSON from the project data folder.
STREET_FILE_NAME = "NYC Street Centerline (CSCL).geojson"
streets_path = f"{FILES_DIR}{STREET_FILE_NAME}"
streets_df = gpd.read_file(streets_path)

In [None]:
# Build a per-street location table: centroid coordinates plus borough code.
# NOTE(review): the geometries appear to be in geographic (lon/lat degree) coordinates,
# for which geopandas warns that centroids are only approximate - confirm this is acceptable.
centroid = streets_df["geometry"].centroid

# A point's x is the longitude and its y is the latitude. The previous version
# stored x under "lat" and y under "long", i.e. the two columns were swapped
# (values near -74 ended up in "lat" and values near 40.7 in "long").
loc_data = pd.DataFrame({
    "lat": centroid.y,
    "long": centroid.x,
    "borough": streets_df["borocode"],
})

# Expose the row position of each street as ST_INDEX so it can serve as the merge key.
loc_data = loc_data.reset_index(names="ST_INDEX")

In [None]:
# The raw street geometries are no longer needed once loc_data exists; free the memory.
del centroid, streets_df

In [None]:
# Attach centroid coordinates and borough to every row via the shared ST_INDEX key
# (inner join: rows without a matching street are dropped).
final_df = small_df.merge(loc_data, how="inner", on='ST_INDEX')

In [None]:
# Preview the merged frame (the notebook display truncates the rows and columns shown).
final_df

Unnamed: 0,ST_INDEX,MONTH,YEAR,COLLISIONS,Residential,Education,Cultural,Recreational,Social Services,Transportation,...,PathTrail,StepStreet,Driveway,Ramp,Alley,U-Turn,FerryRoute,lat,long,borough
0,0,1,2012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.017773,40.706518,1
1,0,2,2012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.017773,40.706518,1
2,0,3,2012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.017773,40.706518,1
3,0,4,2012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.017773,40.706518,1
4,0,5,2012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.017773,40.706518,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16229405,121114,10,2022,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.254846,40.507997,5
16229406,121114,11,2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.254846,40.507997,5
16229407,121114,12,2022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.254846,40.507997,5
16229408,121114,1,2023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,-74.254846,40.507997,5


In [None]:
# Borough code -> borough name, used to label the one-hot columns.
BOROUGH_MAPPING = {
    1: "Manhattan",
    2: "Bronx",
    3: "Brooklyn",
    4: "Queens",
    5: "Staten Island",
}

# One indicator column per borough code present in the data.
one_hot_df = pd.get_dummies(final_df["borough"])

# Rename each column through the mapping by its own code instead of assigning
# the names positionally: positional assignment silently mislabels columns if
# the set/order of codes differs from 1..5 and crashes if a code is absent.
# int(...) accepts codes stored as integers or as digit strings; an unknown
# code raises KeyError rather than being given a wrong name.
one_hot_df = one_hot_df.rename(columns=lambda code: BOROUGH_MAPPING[int(code)])

# Swap the numeric borough column for its named indicator columns.
final_df = pd.concat([final_df, one_hot_df], axis=1)
final_df = final_df.drop(columns="borough")

In [None]:
# Persist the modelling table. The row index is written too (pandas default),
# so it comes back as an unnamed first column when the file is re-read.
final_df.to_csv(f"{FILES_DIR}data_to_models.csv", index=True)