# 02 EDA

In [235]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression

import warnings
# NOTE(review): with no category given, this silences every warning for the rest of the
# session -- including pandas' SettingWithCopyWarning and deprecation notices.
warnings.simplefilter(action="ignore")

In [236]:
# Load the inspection table and the sightings table (same relative paths, in order).
rats, rat_sightings = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/rats.csv', '../datasets/rat_sightings.csv')
)

In [237]:
# Load the two per-zip, per-year lookup tables that are joined onto the monthly data later.
zip_pop_yr, zip_res_yr = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/zip_pop_yr.csv', '../datasets/zip_res_yr.csv')
)

In [238]:
# Convert inspection_date to datetime64 so it can be sorted and split into date parts.
inspection_timestamps = pd.to_datetime(rats['inspection_date'])
rats['inspection_date'] = inspection_timestamps

In [239]:
# Preview the first five rows after the datetime conversion.
rats.head()

Unnamed: 0,inspection_type,zip_code,inspection_date
0,INITIAL,10065,2018-12-31 14:00:53
1,INITIAL,10065,2018-12-31 13:57:24
2,INITIAL,10021,2018-12-31 13:56:06
3,INITIAL,10065,2018-12-31 13:54:49
4,INITIAL,10022,2018-12-31 13:53:20


In [240]:
# (rows, columns) of the inspection table.
rats.shape

(510172, 3)

In [241]:
# Check column dtypes; inspection_date should now be datetime64[ns].
rats.dtypes

inspection_type            object
zip_code                    int64
inspection_date    datetime64[ns]
dtype: object

In [242]:
# Order inspections chronologically (oldest first); the original row labels are kept.
rats = rats.sort_values('inspection_date')

In [243]:
# Split the inspection timestamp into separate calendar year and month columns.
inspection_dt = rats['inspection_date'].dt
rats['year'] = inspection_dt.year
rats['month'] = inspection_dt.month
rats.head()

Unnamed: 0,inspection_type,zip_code,inspection_date,year,month
510171,BAIT,10027,2010-01-04 09:25:14,2010,1
510170,BAIT,10031,2010-01-04 10:25:59,2010,1
510169,BAIT,10032,2010-01-04 11:05:30,2010,1
510168,BAIT,10032,2010-01-04 11:45:59,2010,1
510167,INITIAL,10034,2010-01-04 12:56:03,2010,1


In [244]:
# Monthly period (e.g. 2010-01); later used as the time half of the zip/month join key.
# inspection_date was already converted to datetime64 above, so .dt is used directly.
monthly_period = rats['inspection_date'].dt.to_period('M')
rats['year_month'] = monthly_period
rats.head()

Unnamed: 0,inspection_type,zip_code,inspection_date,year,month,year_month
510171,BAIT,10027,2010-01-04 09:25:14,2010,1,2010-01
510170,BAIT,10031,2010-01-04 10:25:59,2010,1,2010-01
510169,BAIT,10032,2010-01-04 11:05:30,2010,1,2010-01
510168,BAIT,10032,2010-01-04 11:45:59,2010,1,2010-01
510167,INITIAL,10034,2010-01-04 12:56:03,2010,1,2010-01


In [245]:
# One indicator column per inspection_type value, appended to the right of the table.
inspection_dummies = pd.get_dummies(rats['inspection_type'])
rats = pd.concat([rats, inspection_dummies], axis=1)
rats.head()

Unnamed: 0,inspection_type,zip_code,inspection_date,year,month,year_month,BAIT,CLEAN_UPS,COMPLIANCE,INITIAL
510171,BAIT,10027,2010-01-04 09:25:14,2010,1,2010-01,1,0,0,0
510170,BAIT,10031,2010-01-04 10:25:59,2010,1,2010-01,1,0,0,0
510169,BAIT,10032,2010-01-04 11:05:30,2010,1,2010-01,1,0,0,0
510168,BAIT,10032,2010-01-04 11:45:59,2010,1,2010-01,1,0,0,0
510167,INITIAL,10034,2010-01-04 12:56:03,2010,1,2010-01,0,0,0,1


In [246]:
# Parse created_date once, then derive year, month and monthly period from the parsed values.
created = pd.to_datetime(rat_sightings['created_date'])
rat_sightings['created_date'] = created
rat_sightings['year'] = created.dt.year
rat_sightings['month'] = created.dt.month
rat_sightings['year_month'] = created.dt.to_period('M')
rat_sightings.head()

Unnamed: 0,incident_zip,created_date,year,month,year_month
0,10023,2020-03-01 00:32:41,2020,3,2020-03
1,10022,2020-02-29 18:54:40,2020,2,2020-02
2,10024,2020-02-29 16:32:06,2020,2,2020-02
3,10016,2020-02-29 14:31:04,2020,2,2020-02
4,10026,2020-02-29 12:10:43,2020,2,2020-02


In [247]:
# Same preview as the previous cell; rat_sightings has not been modified in between.
rat_sightings.head()

Unnamed: 0,incident_zip,created_date,year,month,year_month
0,10023,2020-03-01 00:32:41,2020,3,2020-03
1,10022,2020-02-29 18:54:40,2020,2,2020-02
2,10024,2020-02-29 16:32:06,2020,2,2020-02
3,10016,2020-02-29 14:31:04,2020,2,2020-02
4,10026,2020-02-29 12:10:43,2020,2,2020-02


In [248]:
# Tag every row with 1 so that summing per (zip, month) counts the rows in each group.
rat_sightings['sighting'] = 1
sighting_group_keys = ['incident_zip', 'year_month']
sightings = (
    rat_sightings
    .groupby(by=sighting_group_keys)[['sighting']]
    .sum()
    .reset_index()
)
sightings.head()

Unnamed: 0,incident_zip,year_month,sighting
0,10001,2010-01,3
1,10001,2010-03,1
2,10001,2010-05,2
3,10001,2010-07,3
4,10001,2010-09,3


In [249]:
# Text join key of the form "<zip> <YYYY-MM>", matching the key built for rats_year_month.
sighting_zip_text = sightings['incident_zip'].astype(str)
sighting_month_text = sightings['year_month'].astype(str)
sightings['zip_year_month'] = sighting_zip_text.str.cat(sighting_month_text, sep=' ')
sightings.head()

Unnamed: 0,incident_zip,year_month,sighting,zip_year_month
0,10001,2010-01,3,10001 2010-01
1,10001,2010-03,1,10001 2010-03
2,10001,2010-05,2,10001 2010-05
3,10001,2010-07,3,10001 2010-07
4,10001,2010-09,3,10001 2010-09


In [250]:
# Sum the inspection-type indicator columns per (zip, month).
inspection_type_cols = ['BAIT', 'CLEAN_UPS', 'COMPLIANCE', 'INITIAL']
rats_year_month = (
    rats
    .groupby(by=['zip_code', 'year_month'])[inspection_type_cols]
    .sum()
    .reset_index()
)
rats_year_month.head()

Unnamed: 0,zip_code,year_month,BAIT,CLEAN_UPS,COMPLIANCE,INITIAL
0,10001,2010-01,0.0,0.0,15.0,3.0
1,10001,2010-02,1.0,0.0,0.0,3.0
2,10001,2010-03,1.0,0.0,4.0,5.0
3,10001,2010-04,0.0,0.0,2.0,1.0
4,10001,2010-05,3.0,0.0,2.0,62.0


In [251]:
# Text join key of the form "<zip> <YYYY-MM>", matching the key built for sightings.
inspection_zip_text = rats_year_month['zip_code'].astype(str)
inspection_month_text = rats_year_month['year_month'].astype(str)
rats_year_month['zip_year_month'] = inspection_zip_text.str.cat(inspection_month_text, sep=' ')
rats_year_month.head()

Unnamed: 0,zip_code,year_month,BAIT,CLEAN_UPS,COMPLIANCE,INITIAL,zip_year_month
0,10001,2010-01,0.0,0.0,15.0,3.0,10001 2010-01
1,10001,2010-02,1.0,0.0,0.0,3.0,10001 2010-02
2,10001,2010-03,1.0,0.0,4.0,5.0,10001 2010-03
3,10001,2010-04,0.0,0.0,2.0,1.0,10001 2010-04
4,10001,2010-05,3.0,0.0,2.0,62.0,10001 2010-05


In [252]:
# Left join: keep every inspection zip-month and attach the sighting count where one exists.
# The keys are passed as Series (not column names), so the result carries a `key_0` column
# and `_x` / `_y` suffixes on the overlapping names; the next cell selects from those.
inspection_keys = rats_year_month['zip_year_month']
sighting_keys = sightings['zip_year_month']
zip_year_month = rats_year_month.merge(
    sightings,
    how='left',
    left_on=inspection_keys,
    right_on=sighting_keys,
)
zip_year_month.head()

Unnamed: 0,key_0,zip_code,year_month_x,BAIT,CLEAN_UPS,COMPLIANCE,INITIAL,zip_year_month_x,incident_zip,year_month_y,sighting,zip_year_month_y
0,10001 2010-01,10001,2010-01,0.0,0.0,15.0,3.0,10001 2010-01,10001.0,2010-01,3.0,10001 2010-01
1,10001 2010-02,10001,2010-02,1.0,0.0,0.0,3.0,10001 2010-02,,NaT,,
2,10001 2010-03,10001,2010-03,1.0,0.0,4.0,5.0,10001 2010-03,10001.0,2010-03,1.0,10001 2010-03
3,10001 2010-04,10001,2010-04,0.0,0.0,2.0,1.0,10001 2010-04,,NaT,,
4,10001 2010-05,10001,2010-05,3.0,0.0,2.0,62.0,10001 2010-05,10001.0,2010-05,2.0,10001 2010-05


In [253]:
# Keep the inspection-side columns plus the joined sighting count, lower-case the column
# names, and drop the merge suffixes. Written as one chain so that nothing is mutated in
# place on a column-subset of the merge result.
zip_year_month = (
    zip_year_month[['zip_code', 'year_month_x', 'BAIT', 'CLEAN_UPS', 'COMPLIANCE', 'INITIAL',
                    'sighting', 'zip_year_month_x']]
    .rename(columns=str.lower)
    .rename(columns={'year_month_x': 'year_month', 'zip_year_month_x': 'zip_year_month'})
    # `sighting` is the only retained column the left join can leave empty (zip-months with
    # no match). Filling just that column gives the same values as a frame-wide fill, without
    # offering an integer fill value to the Period-typed `year_month` column.
    .fillna({'sighting': 0})
)
zip_year_month.head()

Unnamed: 0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,zip_year_month
0,10001,2010-01,0.0,0.0,15.0,3.0,3.0,10001 2010-01
1,10001,2010-02,1.0,0.0,0.0,3.0,0.0,10001 2010-02
2,10001,2010-03,1.0,0.0,4.0,5.0,1.0,10001 2010-03
3,10001,2010-04,0.0,0.0,2.0,1.0,0.0,10001 2010-04
4,10001,2010-05,3.0,0.0,2.0,62.0,2.0,10001 2010-05


In [254]:
# (rows, columns) of the merged zip-month table.
zip_year_month.shape

(4605, 8)

In [255]:
# Persist the merged zip-month table without the row index. This runs before `zip_year`
# is added below, so that column is not part of the saved file.
zip_year_month.to_csv('../datasets/zip_year_month.csv', index=False)

In [256]:
# Yearly join key "<zip> <YYYY>", obtained by dropping the trailing "-MM" from
# "<zip> <YYYY-MM>". Splitting on the last '-' instead of slicing a fixed 10 characters
# keeps the key correct even when the zip part is not exactly five characters long.
zip_year_month['zip_year'] = zip_year_month['zip_year_month'].str.rsplit('-', n=1).str[0]
zip_year_month.head()

Unnamed: 0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,zip_year_month,zip_year
0,10001,2010-01,0.0,0.0,15.0,3.0,3.0,10001 2010-01,10001 2010
1,10001,2010-02,1.0,0.0,0.0,3.0,0.0,10001 2010-02,10001 2010
2,10001,2010-03,1.0,0.0,4.0,5.0,1.0,10001 2010-03,10001 2010
3,10001,2010-04,0.0,0.0,2.0,1.0,0.0,10001 2010-04,10001 2010
4,10001,2010-05,3.0,0.0,2.0,62.0,2.0,10001 2010-05,10001 2010


In [257]:
# Left join the yearly zip_res_yr table onto the monthly rows via the "<zip> <YYYY>" key.
# Series-valued keys are kept so the result has the `key_0`, `zip_year_x`, `zip_year_y`
# layout that the following cell selects from.
monthly_zip_year = zip_year_month['zip_year']
restaurant_zip_year = zip_res_yr['zip_year']
exogenous = zip_year_month.merge(
    zip_res_yr,
    how='left',
    left_on=monthly_zip_year,
    right_on=restaurant_zip_year,
)
exogenous.head()

Unnamed: 0,key_0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,zip_year_month,zip_year_x,zip_year_y,count
0,10001 2010,10001,2010-01,0.0,0.0,15.0,3.0,3.0,10001 2010-01,10001 2010,10001 2010,410.0
1,10001 2010,10001,2010-02,1.0,0.0,0.0,3.0,0.0,10001 2010-02,10001 2010,10001 2010,410.0
2,10001 2010,10001,2010-03,1.0,0.0,4.0,5.0,1.0,10001 2010-03,10001 2010,10001 2010,410.0
3,10001 2010,10001,2010-04,0.0,0.0,2.0,1.0,0.0,10001 2010-04,10001 2010,10001 2010,410.0
4,10001 2010,10001,2010-05,3.0,0.0,2.0,62.0,2.0,10001 2010-05,10001 2010,10001 2010,410.0


In [258]:
# Drop the merge helper columns; keep `zip_year_x` because the population join still needs it.
cols_after_restaurant_merge = [
    'zip_code', 'year_month',
    'bait', 'clean_ups', 'compliance', 'initial',
    'sighting', 'zip_year_x', 'count',
]
exogenous = exogenous[cols_after_restaurant_merge]

In [259]:
# Build the "<zip> <YYYY>" key on the population table so it can be joined by zip and year.
pop_zip_text = zip_pop_yr['zipcode'].astype(str)
pop_year_text = zip_pop_yr['year'].astype(str)
zip_pop_yr['zip_year'] = pop_zip_text.str.cat(pop_year_text, sep=' ')
zip_pop_yr.head()

Unnamed: 0,zipcode,population,year,zip_year
0,10278,0,2010,10278 2010
1,10280,7853,2010,10280 2010
2,10282,4783,2010,10282 2010
3,10463,65665,2010,10463 2010
4,10001,18949,2010,10001 2010


In [260]:
# Left join the yearly population table onto the monthly rows via the "<zip> <YYYY>" key.
exogenous_zip_year = exogenous['zip_year_x']
population_zip_year = zip_pop_yr['zip_year']
exogenous = exogenous.merge(
    zip_pop_yr,
    how='left',
    left_on=exogenous_zip_year,
    right_on=population_zip_year,
)
exogenous.head()

Unnamed: 0,key_0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,zip_year_x,count,zipcode,population,year,zip_year
0,10001 2010,10001,2010-01,0.0,0.0,15.0,3.0,3.0,10001 2010,410.0,10001.0,18949.0,2010.0,10001 2010
1,10001 2010,10001,2010-02,1.0,0.0,0.0,3.0,0.0,10001 2010,410.0,10001.0,18949.0,2010.0,10001 2010
2,10001 2010,10001,2010-03,1.0,0.0,4.0,5.0,1.0,10001 2010,410.0,10001.0,18949.0,2010.0,10001 2010
3,10001 2010,10001,2010-04,0.0,0.0,2.0,1.0,0.0,10001 2010,410.0,10001.0,18949.0,2010.0,10001 2010
4,10001 2010,10001,2010-05,3.0,0.0,2.0,62.0,2.0,10001 2010,410.0,10001.0,18949.0,2010.0,10001 2010


In [261]:
# Final column set: monthly inspection counts, sightings, and the two yearly lookups.
final_exogenous_cols = [
    'zip_code', 'year_month',
    'bait', 'clean_ups', 'compliance', 'initial',
    'sighting', 'count', 'population',
]
exogenous = exogenous[final_exogenous_cols]
exogenous.head()

Unnamed: 0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,count,population
0,10001,2010-01,0.0,0.0,15.0,3.0,3.0,410.0,18949.0
1,10001,2010-02,1.0,0.0,0.0,3.0,0.0,410.0,18949.0
2,10001,2010-03,1.0,0.0,4.0,5.0,1.0,410.0,18949.0
3,10001,2010-04,0.0,0.0,2.0,1.0,0.0,410.0,18949.0
4,10001,2010-05,3.0,0.0,2.0,62.0,2.0,410.0,18949.0


In [262]:
# `count` came from zip_res_yr; give it a descriptive name in the combined table.
exogenous = exogenous.rename(columns={'count': 'restaurant'})

In [263]:
# Only the two left-joined lookup columns can be missing at this point (the monthly columns
# were already filled before the joins), so the fill is restricted to them rather than
# applied frame-wide; this also keeps the integer fill value away from the Period-typed
# `year_month` column.
# NOTE(review): a 0 here means "no matching zip/year row in the lookup table", not a
# measured zero -- confirm that is the intended encoding.
exogenous = exogenous.fillna({'restaurant': 0, 'population': 0})

In [264]:
# Preview the final table after the rename and missing-value fill.
exogenous.head()

Unnamed: 0,zip_code,year_month,bait,clean_ups,compliance,initial,sighting,restaurant,population
0,10001,2010-01,0.0,0.0,15.0,3.0,3.0,410.0,18949.0
1,10001,2010-02,1.0,0.0,0.0,3.0,0.0,410.0,18949.0
2,10001,2010-03,1.0,0.0,4.0,5.0,1.0,410.0,18949.0
3,10001,2010-04,0.0,0.0,2.0,1.0,0.0,410.0,18949.0
4,10001,2010-05,3.0,0.0,2.0,62.0,2.0,410.0,18949.0


In [265]:
# Persist the combined table (inspections, sightings, restaurant, population) without the row index.
exogenous.to_csv('../datasets/exogenous.csv', index=False)