In [180]:
# Importing dependencies
import csv
import numpy as np
import pandas as pd
from datetime import datetime,date

import warnings
warnings.filterwarnings('ignore')

In [160]:
# Initial exploration of dataset
rat_data = pd.read_csv('resources/Rat_Sightings.csv')

## 1. Initial exploration of dataset

In [161]:
rat_data.head()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
0,31464015,09/04/2015 12:00:00 AM,09/18/2015 12:00:00 AM,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Mixed Use Building,10006.0,,...,,,,,,,,40.707772,-74.012963,"(40.70777155363643, -74.01296309970473)"
1,31464024,09/04/2015 12:00:00 AM,10/28/2015 12:00:00 AM,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,Commercial Building,10306.0,2270 HYLAN BOULEVARD,...,,,,,,,,40.575209,-74.104547,"(40.575209242947444, -74.1045465185469)"
2,31464025,09/04/2015 12:00:00 AM,,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,1-2 Family Dwelling,10310.0,758 POST AVENUE,...,,,,,,,,40.631236,-74.126878,"(40.63123555151668, -74.12687759748677)"
3,31464026,09/04/2015 12:00:00 AM,09/14/2015 12:00:00 AM,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Apt. Building,11206.0,198 SCHOLES STREET,...,,,,,,,,40.708987,-73.941207,"(40.70898692345805, -73.94120690238431)"
4,31464027,09/04/2015 12:00:00 AM,09/22/2015 12:00:00 AM,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,3+ Family Mixed Use Building,10462.0,2138 WALLACE AVENUE,...,,,,,,,,40.85413,-73.864813,"(40.85413014360452, -73.86481331044513)"


In [162]:
# Finding range of dates for all incidents
max_date = rat_data['Created Date'].max()
min_date = rat_data['Created Date'].min()
print(f'Dataset starts from {min_date} to {max_date}')

Dataset starts from 01/01/2010 02:15:27 PM to 12/31/2016 12:00:00 AM


In [163]:
# Determing the number of rows and columns in dataset
row_col = rat_data.shape
print(f'Dataset contains {row_col[0]} rows and {row_col[1]} columns')

Dataset contains 101914 rows and 52 columns


In [164]:
# Determing labels for each column
rat_data.columns

Index(['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
       'Complaint Type', 'Descriptor', 'Location Type', 'Incident Zip',
       'Incident Address', 'Street Name', 'Cross Street 1', 'Cross Street 2',
       'Intersection Street 1', 'Intersection Street 2', 'Address Type',
       'City', 'Landmark', 'Facility Type', 'Status', 'Due Date',
       'Resolution Action Updated Date', 'Community Board', 'Borough',
       'X Coordinate (State Plane)', 'Y Coordinate (State Plane)',
       'Park Facility Name', 'Park Borough', 'School Name', 'School Number',
       'School Region', 'School Code', 'School Phone Number', 'School Address',
       'School City', 'School State', 'School Zip', 'School Not Found',
       'School or Citywide Complaint', 'Vehicle Type', 'Taxi Company Borough',
       'Taxi Pick Up Location', 'Bridge Highway Name',
       'Bridge Highway Direction', 'Road Ramp', 'Bridge Highway Segment',
       'Garage Lot Name', 'Ferry Direction', 'Ferry Termina

In [165]:
rat_data.describe()

Unnamed: 0,Unique Key,Incident Zip,Facility Type,X Coordinate (State Plane),Y Coordinate (State Plane),School or Citywide Complaint,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude
count,101914.0,101578.0,0.0,101208.0,101208.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101208.0,101208.0
mean,28158640.0,10728.680895,,1002473.0,208639.988953,,,,,,,,,,,,40.739304,-73.934206
std,6015376.0,631.206648,,19580.43,29956.193531,,,,,,,,,,,,0.082224,0.070617
min,11464390.0,83.0,,913495.0,121350.0,,,,,,,,,,,,40.499502,-74.254437
25%,23414520.0,10086.25,,993610.0,186934.0,,,,,,,,,,,,40.67974,-73.966216
50%,28836800.0,10472.0,,1001398.0,203425.5,,,,,,,,,,,,40.725009,-73.938112
75%,33460140.0,11222.0,,1011951.0,236364.0,,,,,,,,,,,,40.815428,-73.899963
max,37197000.0,100354.0,,1066922.0,271876.0,,,,,,,,,,,,40.912869,-73.701632


##### For our analysis we're interested in the following columns:
   ###### - "Unique Key": entry for each incident of rodent sighting
   ###### - "Created Date": date of each incident
   ###### - "Location Type": dwelling where incident occurs
   ###### - "'Incident Zip'": zip code of incident
   ###### - "'Borough'": borough where incident occurs
   ###### - "X Coordinate (State Plane)": x coordinate of state map of each incident
   ###### - "Y Coordinate (State Plane)": y coordinate of sate map of each incident
   ###### - "Latitude": latitude of each incident
   ###### - "Longitude": longitude of each incident

##### Also we'll remove all rows with missing or incorrectly formatted values

## 2. Data cleaning

In [166]:
# Removing time entry of each incident and keeping only the date
rat_data['Created Date'] = pd.to_datetime(rat_data['Created Date']).dt.date

# Removing unecessary columns
rat_data = rat_data.drop(['Closed Date', 
                         'Agency', 
                         'Incident Address', 
                         'Street Name', 
                         'Cross Street 1', 
                         'Cross Street 2', 
                         'Intersection Street 1', 
                         'Intersection Street 2', 
                         'Address Type', 
                         'Landmark', 
                         'Facility Type', 
                         'Status', 
                         'Due Date', 
                         'Resolution Action Updated Date', 
                         'Community Board', 
                         'Park Facility Name', 
                         'Park Borough', 
                         'School Name', 
                         'School Number', 
                         'School Region', 
                         'School Code', 
                         'School Phone Number', 
                         'School Address', 
                         'School City', 
                         'School State', 
                         'School Zip', 
                         'School Not Found', 
                         'School or Citywide Complaint', 
                         'Vehicle Type', 
                         'Taxi Company Borough', 
                         'Taxi Pick Up Location', 
                         'Bridge Highway Name',
                         'Bridge Highway Direction', 
                         'Road Ramp', 
                         'Bridge Highway Segment', 
                         'Garage Lot Name', 
                         'Ferry Direction', 
                         'Ferry Terminal Name',
                         'City', 
                         'Location',
                         'Agency Name'], 
                         axis=1)

# Renaming columns for clarity
rat_data.rename(columns={'Unique Key': 'Incident ID', 
                         'Created Date': 'Sighting Date', 
                         'Descriptor': 'Incident'}, inplace=True)

rat_data.head()

Unnamed: 0,Incident ID,Sighting Date,Complaint Type,Incident,Location Type,Incident Zip,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Latitude,Longitude
0,31464015,2015-09-04,Rodent,Rat Sighting,3+ Family Mixed Use Building,10006.0,MANHATTAN,980656.0,197137.0,40.707772,-74.012963
1,31464024,2015-09-04,Rodent,Rat Sighting,Commercial Building,10306.0,STATEN ISLAND,955207.0,148858.0,40.575209,-74.104547
2,31464025,2015-09-04,Rodent,Rat Sighting,1-2 Family Dwelling,10310.0,STATEN ISLAND,949033.0,169278.0,40.631236,-74.126878
3,31464026,2015-09-04,Rodent,Rat Sighting,3+ Family Apt. Building,11206.0,BROOKLYN,1000550.0,197585.0,40.708987,-73.941207
4,31464027,2015-09-04,Rodent,Rat Sighting,3+ Family Mixed Use Building,10462.0,BRONX,1021648.0,250489.0,40.85413,-73.864813


In [167]:
# Finding missing values in columns of interest

missing_date = rat_data['Sighting Date'].isna().sum()
missing_zip = rat_data['Incident Zip'].isna().sum()
missing_borough = rat_data['Borough'].isna().sum()
missing_latitude = rat_data['Latitude'].isna().sum()
missing_longitude = rat_data['Longitude'].isna().sum()

print('Missing values from the dataset')
print('-------------------------------')
print(f'Date: {missing_date}')
print(f'Zip: {missing_zip}')
print(f'Borough: {missing_borough}')
print(f'Latitude: {missing_latitude}')
print(f'Longitude: {missing_longitude}')

Missing values from the dataset
-------------------------------
Date: 0
Zip: 336
Borough: 0
Latitude: 706
Longitude: 706


In [168]:
# Since we're interested in location data, we'll drop rows with missing values
clean_rat_data = rat_data.dropna()

In [169]:
# Confirming dropping of rows with missing values
missing_date = clean_rat_data['Sighting Date'].isna().sum()
missing_zip = clean_rat_data['Incident Zip'].isna().sum()
missing_borough = clean_rat_data['Borough'].isna().sum()
missing_latitude = clean_rat_data['Latitude'].isna().sum()
missing_longitude = clean_rat_data['Longitude'].isna().sum()

print('Missing values from the dataset')
print('-------------------------------')
print(f'Date: {missing_date}')
print(f'Zip: {missing_zip}')
print(f'Borough: {missing_borough}')
print(f'Latitude: {missing_latitude}')
print(f'Longitude: {missing_longitude}')

Missing values from the dataset
-------------------------------
Date: 0
Zip: 0
Borough: 0
Latitude: 0
Longitude: 0


In [189]:
# Reformatting borough names
# Reformattiing zip column

clean_rat_data['Borough'] = clean_rat_data['Borough'].str.lower()
clean_rat_data['Borough'] = clean_rat_data['Borough'].str.title()

clean_rat_data['Incident Zip'] = clean_rat_data['Incident Zip'].round(0).astype(int)

clean_rat_data.head()

Unnamed: 0,Incident ID,Sighting Date,Complaint Type,Incident,Location Type,Incident Zip,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Latitude,Longitude
0,31464015,2015-09-04,Rodent,Rat Sighting,3+ Family Mixed Use Building,10006,Manhattan,980656.0,197137.0,40.707772,-74.012963
1,31464024,2015-09-04,Rodent,Rat Sighting,Commercial Building,10306,Staten Island,955207.0,148858.0,40.575209,-74.104547
2,31464025,2015-09-04,Rodent,Rat Sighting,1-2 Family Dwelling,10310,Staten Island,949033.0,169278.0,40.631236,-74.126878
3,31464026,2015-09-04,Rodent,Rat Sighting,3+ Family Apt. Building,11206,Brooklyn,1000550.0,197585.0,40.708987,-73.941207
4,31464027,2015-09-04,Rodent,Rat Sighting,3+ Family Mixed Use Building,10462,Bronx,1021648.0,250489.0,40.85413,-73.864813


In [190]:
# Analyzing cleaned dataset
clean_rat_data.describe()

Unnamed: 0,Incident ID,Incident Zip,X Coordinate (State Plane),Y Coordinate (State Plane),Latitude,Longitude
count,101186.0,101186.0,101186.0,101186.0,101186.0,101186.0
mean,28156080.0,10728.847123,1002474.0,208639.833851,40.739304,-73.934202
std,6013490.0,564.145932,19580.94,29955.586608,0.082223,0.070619
min,11464390.0,83.0,913495.0,121350.0,40.499502,-74.254437
25%,23415530.0,10128.0,993611.2,186936.0,40.679746,-73.966216
50%,28833440.0,10472.0,1001399.0,203424.0,40.724987,-73.938112
75%,33454030.0,11222.0,1011951.0,236364.0,40.815427,-73.899963
max,37197000.0,11694.0,1066922.0,271876.0,40.912869,-73.701632


In [191]:
# We can see that the zip column has one incorrectly formatted entry with a value of 83
# Since we don't know which zip code it refers to, we'll drop those rows
final_rat_data = clean_rat_data[clean_rat_data['Incident Zip'] != 83.0]

# Confirming removal of incorrectly formatted zip code entry
final_rat_data['Incident Zip'].describe()

count    101165.000000
mean      10731.057006
std         542.950742
min       10000.000000
25%       10128.000000
50%       10472.000000
75%       11222.000000
max       11694.000000
Name: Incident Zip, dtype: float64

In [192]:
max_date = final_rat_data['Sighting Date'].max()
min_date = final_rat_data['Sighting Date'].min()

row_col = final_rat_data.shape

# Description of cleaned dataset we'll use for our analysis

print('Cleaned NYC Rat Dataset')
print('-----------------------')
print(f'Table contains {row_col[0]} rows and {row_col[1]} columns')
print(f'Dates range from {min_date} to {max_date}')

Cleaned NYC Rat Dataset
-----------------------
Table contains 101165 rows and 11 columns
Dates range from 2010-01-01 to 2017-09-16


## 3. Data analysis