# NYPD-Complaint-Data to DB

<span style="color: red;">**WARNING:** Only execute this notebook if your machine has sufficient memory (RAM >16 GB preferred)</span>

### Loading raw data

* [NYPD Complaint Data Historic](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i/about_data) was downloaded as a `.csv` file and stored locally in the `data` directory.

In [1]:
import sys
import os

# Add the parent directory to the import path and use the same location as the
# base for data files, instead of reading it back through sys.path[-1].
parent_dir = f"..{os.path.sep}"
sys.path.append(parent_dir)

data_dir = 'data'
fname = 'NYPD_Complaint_Data_Historic_20241027.csv'
fpath = os.path.join(parent_dir, data_dir, fname)

# Fail fast, reporting the resolved absolute path, when the raw export is missing.
assert os.path.exists(fpath), f'{os.path.abspath(fpath)} does not exist!'

In [2]:
import polars as pl

# Lazily scan the CSV (nothing is read into memory yet) and let polars try to
# infer temporal columns; then show the inferred schema.
complaint_lf = pl.scan_csv(
    fpath,
    try_parse_dates=True,
)
complaint_lf.collect_schema()

Schema([('CMPLNT_NUM', String),
        ('CMPLNT_FR_DT', String),
        ('CMPLNT_FR_TM', String),
        ('CMPLNT_TO_DT', String),
        ('CMPLNT_TO_TM', String),
        ('ADDR_PCT_CD', Int64),
        ('RPT_DT', String),
        ('KY_CD', Int64),
        ('OFNS_DESC', String),
        ('PD_CD', Int64),
        ('PD_DESC', String),
        ('CRM_ATPT_CPTD_CD', String),
        ('LAW_CAT_CD', String),
        ('BORO_NM', String),
        ('LOC_OF_OCCUR_DESC', String),
        ('PREM_TYP_DESC', String),
        ('JURIS_DESC', String),
        ('JURISDICTION_CODE', Int64),
        ('PARKS_NM', String),
        ('HADEVELOPT', String),
        ('HOUSING_PSA', String),
        ('X_COORD_CD', Int64),
        ('Y_COORD_CD', Int64),
        ('SUSP_AGE_GROUP', String),
        ('SUSP_RACE', String),
        ('SUSP_SEX', String),
        ('TRANSIT_DISTRICT', String),
        ('Latitude', Float64),
        ('Longitude', Float64),
        ('Lat_Lon', String),
        ('PATROL_BORO', String),


### Handling datetime variables

In [3]:
# Step 1: parse the date strings and the end-time string into temporal dtypes.
# The "(null)" placeholder is matched with literal=True: by default
# str.contains treats its pattern as a regex, where the parentheses would form
# a capture group instead of being matched verbatim.
complaint_lf = complaint_lf.with_columns(
    pl.col('CMPLNT_FR_DT').str.to_date("%m/%d/%Y"),
    pl.col('CMPLNT_TO_DT').str.to_date("%m/%d/%Y"),
    pl.when(pl.col("CMPLNT_TO_TM").str.contains('(null)', literal=True))
    .then(None)
    .otherwise(pl.col('CMPLNT_TO_TM'))
    .str.to_time("%H:%M:%S")
    .name.keep(),  # keep the original column name (CMPLNT_TO_TM)
)

# Step 2: merge each date/time pair into a single datetime column and drop the
# four component columns.
complaint_lf = complaint_lf.with_columns(
    pl.col('CMPLNT_FR_DT').dt.combine(pl.col('CMPLNT_FR_TM')).alias('CMPLNT_FR_DT_TM'),
    pl.col('CMPLNT_TO_DT').dt.combine(pl.col('CMPLNT_TO_TM')).alias('CMPLNT_TO_DT_TM'),
).drop('CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM')

### Handling duplicate values in `CMPLNT_NUM`

In [4]:
# Total number of rows before de-duplication.
(
    complaint_lf
    .select(pl.len())
    .collect()
    .item()
)

8914838

In [5]:
# Keep a single row per complaint number. With the default keep="any", polars
# retains an arbitrary row from each group of duplicates.
complaint_lf = complaint_lf.unique(subset='CMPLNT_NUM')

# Row count after de-duplication.
(
    complaint_lf
    .select(pl.len())
    .collect()
    .item()
)

8913734

### Handling Latitude and Longitude variables

In [6]:
# Build a lookup of distinct "(lat, lon)" strings with their parsed components.
# The column is de-duplicated once and lat/lon are derived from that same
# column, so each row's fields are guaranteed to come from its own Lat_Lon
# value (two separate unique() calls carry no ordering guarantee).
unique_lat_lon = (
    complaint_lf
    .select(pl.col('Lat_Lon').unique())
    .with_columns(
        pl.col('Lat_Lon')
        .str.strip_chars('()')            # "(40.7, -73.9)" -> "40.7, -73.9"
        .str.split_exact(",", 1)          # struct with two string fields
        .struct.rename_fields(['lat', 'lon'])
        .alias("fields")
    )
    .unnest("fields")
    .with_columns(
        # Drop the space left after the comma before the numeric cast.
        pl.col('lat').str.strip_chars(' ').cast(pl.Decimal),
        pl.col('lon').str.strip_chars(' ').cast(pl.Decimal),
    )
)

In [7]:
# Execute the lazy query for the distinct coordinates and convert the result
# to pandas for use with geopandas.
unique_lat_lon_df = (
    unique_lat_lon
    .collect()
    .to_pandas()
)
unique_lat_lon_df.head()

Unnamed: 0,Lat_Lon,lat,lon
0,"(40.60851339, -74.12501925)",40.60851339,-74.12501925
1,"(40.775588, -73.947373)",40.775588,-73.947373
2,"(40.601699, -74.072624)",40.601699,-74.072624
3,"(40.80860352, -73.90976381)",40.80860352,-73.90976381
4,"(40.78372249, -73.80862838)",40.78372249,-73.80862838


### Loading NYC Zip Code Data

* [Modified Zip Code Tabulation Areas (MODZCTA)](https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk/about_data) was downloaded as a `.geojson` file and stored locally in the `data` directory.

In [8]:
import geopandas

# Turn every distinct coordinate pair into a point geometry
# (x = longitude, y = latitude) using the EPSG:4326 coordinate reference system.
unique_lat_lon_gdf = geopandas.GeoDataFrame(
    unique_lat_lon_df,
    geometry=geopandas.points_from_xy(unique_lat_lon_df.lon, unique_lat_lon_df.lat),
    crs="EPSG:4326",
)

fname = 'MODZCTA.geojson'
# Resolve against the parent directory explicitly; sys.path[-1] is only the
# parent directory as long as nothing else has appended to sys.path since.
fpath = os.path.join(os.pardir, data_dir, fname)

# Fail fast, reporting the resolved absolute path, when the GeoJSON is missing.
assert os.path.exists(fpath), f'{os.path.abspath(fpath)} does not exist!'
geo_df = geopandas.read_file(fpath)

Extracting the zip code for each unique coordinate pair

In [9]:
import numpy as np

# Label every distinct coordinate with the zip code of the MODZCTA polygon that
# contains it. Points that fall inside no polygon keep the empty-string default.
zips = np.full(unique_lat_lon_gdf.shape[0], '', dtype=object)

# Pair each polygon with its own zip code directly: `geo_df.modzcta[i]` with an
# enumerate counter is a label lookup and is only correct for a default
# 0..n-1 index.
for geom, modzcta in zip(geo_df.geometry, geo_df.modzcta):
    inside = unique_lat_lon_gdf.within(geom).to_numpy()  # boolean mask, one entry per point
    zips[inside] = modzcta

unique_lat_lon_gdf['zipcode'] = zips

In [10]:
# Only the join key and the derived zip code are carried back into polars;
# the geometry and parsed lat/lon columns are left behind.
lat_lon_zip_lf = pl.from_pandas(
    unique_lat_lon_gdf[['Lat_Lon', 'zipcode']]
).lazy()
lat_lon_zip_lf.collect_schema()

Schema([('Lat_Lon', String), ('zipcode', String)])

Joining the complaint data with the extracted zip code data

In [11]:
# Attach the zip code to every complaint via its "Lat_Lon" string.
# NOTE(review): this is an inner join (the polars default), so complaints whose
# Lat_Lon has no match in the lookup are dropped; presumably that includes rows
# with a null Lat_Lon, since null keys do not match by default. Confirm this is
# intended; how='left' would keep them.
complaint_lf = complaint_lf.join(
    lat_lon_zip_lf,
    on='Lat_Lon',
    how='inner',
)
complaint_lf.collect_schema()

Schema([('CMPLNT_NUM', String),
        ('ADDR_PCT_CD', Int64),
        ('RPT_DT', String),
        ('KY_CD', Int64),
        ('OFNS_DESC', String),
        ('PD_CD', Int64),
        ('PD_DESC', String),
        ('CRM_ATPT_CPTD_CD', String),
        ('LAW_CAT_CD', String),
        ('BORO_NM', String),
        ('LOC_OF_OCCUR_DESC', String),
        ('PREM_TYP_DESC', String),
        ('JURIS_DESC', String),
        ('JURISDICTION_CODE', Int64),
        ('PARKS_NM', String),
        ('HADEVELOPT', String),
        ('HOUSING_PSA', String),
        ('X_COORD_CD', Int64),
        ('Y_COORD_CD', Int64),
        ('SUSP_AGE_GROUP', String),
        ('SUSP_RACE', String),
        ('SUSP_SEX', String),
        ('TRANSIT_DISTRICT', String),
        ('Latitude', Float64),
        ('Longitude', Float64),
        ('Lat_Lon', String),
        ('PATROL_BORO', String),
        ('STATION_NAME', String),
        ('VIC_AGE_GROUP', String),
        ('VIC_RACE', String),
        ('VIC_SEX', String),
        

### Cleaned data

Calling `.collect()` on the `LazyFrame` executes all of the deferred calculations and joins in sequence.

In [12]:
# Render the query plan as a tree before executing it.
query_plan = complaint_lf.explain(format='tree')
print(query_plan)

              0                                                 1                                                                    2                                                        3                                                      4                                                             5                                                                                                                                                                                  6                                                                                                                                                          7                         8
   ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [13]:
# Execute the full lazy query plan and display the resulting DataFrame.
# The result is not assigned to a variable here.
complaint_lf.collect()

CMPLNT_NUM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,CMPLNT_FR_DT_TM,CMPLNT_TO_DT_TM,zipcode
str,i64,str,i64,str,i64,str,str,str,str,str,str,str,i64,str,str,str,i64,i64,str,str,str,str,f64,f64,str,str,str,str,str,str,datetime[μs],datetime[μs],str
"""73828158""",112,"""07/24/2010""",351,"""CRIMINAL MISCHIEF & RELATED OF""",254,"""MISCHIEF, CRIMINAL 4, OF MOTOR""","""COMPLETED""","""MISDEMEANOR""","""QUEENS""","""FRONT OF""","""STREET""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",1025005,203505,"""(null)""","""(null)""","""(null)""",,40.725159,-73.852961,"""(40.725159, -73.852961)""","""PATROL BORO QUEENS NORTH""","""(null)""","""25-44""","""WHITE""","""F""",2010-07-23 21:00:00,2010-07-24 18:00:00,"""11375"""
"""60765391""",84,"""04/14/2009""",351,"""CRIMINAL MISCHIEF & RELATED OF""",254,"""MISCHIEF, CRIMINAL 4, OF MOTOR""","""COMPLETED""","""MISDEMEANOR""","""BROOKLYN""","""FRONT OF""","""RESIDENCE-HOUSE""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",988161,188234,"""(null)""","""(null)""","""(null)""",,40.683337,-73.985897,"""(40.683337, -73.985897)""","""PATROL BORO BKLYN NORTH""","""(null)""","""25-44""","""BLACK HISPANIC""","""M""",2009-04-14 03:30:00,2009-04-14 03:45:00,"""11217"""
"""78699913""",25,"""05/11/2011""",344,"""ASSAULT 3 & RELATED OFFENSES""",113,"""MENACING,UNCLASSIFIED""","""COMPLETED""","""MISDEMEANOR""","""MANHATTAN""","""INSIDE""","""RESIDENCE - PUBLIC HOUSING""","""N.Y. HOUSING POLICE""",2,"""(null)""","""(null)""","""670""",1002844,230571,"""(null)""","""WHITE""","""M""",,40.799522,-73.932838,"""(40.799522, -73.932838)""","""PATROL BORO MAN NORTH""","""(null)""","""<18""","""WHITE HISPANIC""","""F""",2011-05-11 16:00:00,2011-05-11 16:05:00,"""10035"""
"""78992198""",73,"""05/27/2011""",106,"""FELONY ASSAULT""",109,"""ASSAULT 2,1,UNCLASSIFIED""","""COMPLETED""","""FELONY""","""BROOKLYN""","""INSIDE""","""PUBLIC BUILDING""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",1011219,183704,"""18-24""","""BLACK""","""M""",,40.670863,-73.902779,"""(40.670863, -73.902779)""","""PATROL BORO BKLYN NORTH""","""(null)""","""45-64""","""WHITE""","""M""",2011-05-27 15:00:00,2011-05-27 15:10:00,"""11212"""
"""80393901""",75,"""08/20/2011""",125,"""NYS LAWS-UNCLASSIFIED FELONY""",847,"""NY STATE LAWS,UNCLASSIFIED FEL""","""COMPLETED""","""FELONY""","""BROOKLYN""","""(null)""","""STREET""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",1020115,186383,"""(null)""","""(null)""","""(null)""",,40.678182,-73.870697,"""(40.67818234, -73.87069686)""","""PATROL BORO BKLYN NORTH""","""(null)""","""(null)""","""UNKNOWN""","""E""",2011-08-20 19:40:00,2011-08-20 19:49:00,"""11208"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""191464017""",23,"""12/23/2018""",341,"""PETIT LARCENY""",333,"""LARCENY,PETIT FROM STORE-SHOPL""","""COMPLETED""","""MISDEMEANOR""","""MANHATTAN""","""INSIDE""","""DRUG STORE""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",999501,227548,"""45-64""","""WHITE HISPANIC""","""M""",,40.791232,-73.944922,"""(40.791232, -73.944922)""","""PATROL BORO MAN NORTH""","""(null)""","""UNKNOWN""","""UNKNOWN""","""D""",2018-12-23 11:10:00,2018-12-23 11:22:00,"""10029"""
"""78501505""",14,"""04/29/2011""",361,"""OFF. AGNST PUB ORD SENSBLTY &""",639,"""AGGRAVATED HARASSMENT 2""","""COMPLETED""","""MISDEMEANOR""","""MANHATTAN""","""INSIDE""","""COMMERCIAL BUILDING""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",987694,213173,"""45-64""","""WHITE""","""F""",,40.751789,-73.987568,"""(40.751789, -73.987568)""","""PATROL BORO MAN SOUTH""","""(null)""","""45-64""","""WHITE""","""M""",2011-02-01 12:00:00,2011-04-19 16:42:00,"""10018"""
"""75523759""",114,"""11/11/2010""",351,"""CRIMINAL MISCHIEF & RELATED OF""",254,"""MISCHIEF, CRIMINAL 4, OF MOTOR""","""COMPLETED""","""MISDEMEANOR""","""QUEENS""","""OPPOSITE OF""","""STREET""","""N.Y. POLICE DEPT""",0,"""(null)""","""(null)""","""(null)""",1001479,214744,"""(null)""","""(null)""","""(null)""",,40.756084,-73.937811,"""(40.756084, -73.937811)""","""PATROL BORO QUEENS NORTH""","""(null)""","""(null)""","""UNKNOWN""","""D""",2010-11-10 23:55:00,2010-11-11 00:05:00,"""11101"""
"""94308292""",43,"""01/04/2014""",361,"""OFF. AGNST PUB ORD SENSBLTY &""",639,"""AGGRAVATED HARASSMENT 2""","""COMPLETED""","""MISDEMEANOR""","""BRONX""","""INSIDE""","""RESIDENCE - PUBLIC HOUSING""","""N.Y. HOUSING POLICE""",2,"""(null)""","""(null)""","""911""",1021514,239266,"""(null)""","""(null)""","""(null)""",,40.823327,-73.865357,"""(40.823327, -73.865357)""","""PATROL BORO BRONX""","""(null)""","""18-24""","""BLACK""","""F""",2014-01-04 08:45:00,,"""10473"""


### Exporting to `.csv`