# DBRS Technical Assessment

## Import Pandas

In [3]:
import pandas as pd

## Task 1 

1) Load the data (2017_SR_Subset) into a dataframe df1

2) Fetch the 10 most common 'complaint_type' values and store them as a Series

3) Keep only the dataframe rows whose 'complaint_type' value is present in that Series

4) Group the filtered rows by 'borough' and, for each 'borough', count the occurrences of each of the top 10 complaint types

In [4]:
# Load the 2017 service-request subset from CSV into df1.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the file name is spelled 'subet' — confirm it matches the file on disk.
df1 = pd.read_csv(filepath_or_buffer='2017_SR_subet_vs.csv', low_memory=False)

In [194]:
# Inspect dataframe: print the row count, column names, dtypes and memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2439273 entries, 0 to 2439272
Data columns (total 6 columns):
Unique Key        int64
Created Date      object
Complaint Type    object
Incident Zip      object
City              object
Borough           object
dtypes: int64(1), object(5)
memory usage: 111.7+ MB


In [5]:
# Rename the columns used by later cells to snake_case.
# df1 is rebound to the renamed frame instead of using inplace=True: the result
# is the same, but the cell stays chain-friendly and safe to re-run (labels that
# are no longer present are ignored by rename). Columns not listed here keep
# their original names.
df1 = df1.rename(columns={
    'Complaint Type': 'complaint_type',
    'Borough': 'borough',
    'Incident Zip': 'incident_zip',
})

In [145]:
# Count how often each complaint type occurs (value_counts orders the result by
# count, highest first) and keep the first ten entries.
# The result is a Series indexed by complaint type, with the counts as values.
top_10_complaint_types = df1['complaint_type'].value_counts().iloc[:10]

In [144]:
# Preview the first five rows of df1
df1.head(n=5)

Unnamed: 0,Unique Key,Created Date,complaint_type,incident_zip,City,borough
0,35148177,01/01/2017 12:00:00 AM,Rodent,11435,JAMAICA,QUEENS
1,35148176,01/01/2017 12:00:00 AM,Rodent,10009,NEW YORK,MANHATTAN
2,35148085,01/01/2017 12:00:00 AM,Rodent,11418,RICHMOND HILL,QUEENS
3,35147948,01/01/2017 12:00:00 AM,Rodent,11694,Rockaway Park,QUEENS
4,35147881,01/01/2017 12:00:00 AM,Rodent,11210,BROOKLYN,BROOKLYN


In [149]:
# Show the ten most common complaint types together with their counts
top_10_complaint_types

Noise - Residential        229395
HEAT/HOT WATER             210750
Illegal Parking            145883
Blocked Driveway           135677
Street Condition            93231
Street Light Condition      84174
UNSANITARY CONDITION        79183
Noise - Street/Sidewalk     73053
Water System                64953
Noise                       60122
Name: complaint_type, dtype: int64

In [8]:
# Keep only the rows whose complaint type is one of the ten most common, then
# count the rows for every (borough, complaint type) pair.
# The result is a Series with a two-level index: borough, complaint_type.
result1 = (
    df1.loc[
        df1['complaint_type'].isin(top_10_complaint_types.index),
        ['borough', 'complaint_type'],
    ]
    .groupby(['borough', 'complaint_type'])['complaint_type']
    .count()
)

In [265]:
# Show the top-10 complaint counts broken down by borough and complaint type
result1

borough        complaint_type         
BRONX          Blocked Driveway           24480
               HEAT/HOT WATER             67828
               Illegal Parking            16099
               Noise                       3131
               Noise - Residential        57420
               Noise - Street/Sidewalk    14024
               Street Condition           11759
               Street Light Condition     18408
               UNSANITARY CONDITION       24530
               Water System               10193
BROOKLYN       Blocked Driveway           49154
               HEAT/HOT WATER             66079
               Illegal Parking            55306
               Noise                      15401
               Noise - Residential        67482
               Noise - Street/Sidewalk    21306
               Street Condition           25413
               Street Light Condition     22447
               UNSANITARY CONDITION       26636
               Water System               19769
M

## Task 2

1) Load the '2010_census.csv' dataset into a dataframe df2

2) Sort by the "2010 Census Population" column in descending order and keep the first 10 rows. Save the result into a Pandas DataFrame 'top_10_populous'

3) Filter df1 down to the rows whose 'incident_zip' is present in 'top_10_populous'

4) Group the filtered df1 rows by 'incident_zip' and, for each zip, count 'complaint_type'

In [9]:
# Load the 2010 census table (population per zip code) from CSV into df2.
df2 = pd.read_csv(filepath_or_buffer='2010_census.csv')

In [205]:
# Preview the first five rows of df2
df2.head(n=5)

Unnamed: 0,Zip Code ZCTA,2010 Census Population
0,1001,16769
1,1002,29049
2,1003,10372
3,1005,5079
4,1007,14649


In [125]:
# Collect the distinct zip codes that occur in the complaints data.
# 'incident_zip' is an object column, while 'Zip Code ZCTA' appears to hold
# integers (TODO confirm dtype). The zips are therefore converted to integers
# explicitly instead of relying on implicit type coercion inside isin();
# entries that are not plain numbers are discarded. De-duplicating also avoids
# handing isin() one list entry per complaint row.
df1_list = (
    pd.to_numeric(df1['incident_zip'], errors='coerce')
    .dropna()
    .astype('int64')
    .unique()
    .tolist()
)
# Keep only the census rows whose zip code has at least one complaint.
population_by_zip = df2.loc[df2['Zip Code ZCTA'].isin(df1_list)]
population_by_zip.head()

Unnamed: 0,Zip Code ZCTA,2010 Census Population
200,1583,7591
234,1757,28061
323,2062,28602
332,2109,3771
528,2767,13566


In [117]:
# Order the matched zip codes by population, largest first, and keep the first
# ten rows; the last line displays the result.
top_10_populous = (
    population_by_zip
    .sort_values(by='2010 Census Population', ascending=False)
    .iloc[:10]
)
top_10_populous

Unnamed: 0,Zip Code ZCTA,2010 Census Population
2748,11368,109931
2720,11226,101572
2753,11373,100820
2714,11220,99598
2759,11385,98592
2527,10467,97060
2451,10025,94600
2702,11208,94469
2729,11236,93877
2701,11207,93386


In [150]:
# Task 2, steps 3-4: restrict the complaints to the ten most populous zip codes
# and count the complaints per zip code.
# 'incident_zip' is an object column, so it is converted to numbers first to
# compare reliably with the numeric-looking 'Zip Code ZCTA' values (TODO confirm
# dtype); entries that are not plain numbers become NaN and never match.
incident_zips = pd.to_numeric(df1['incident_zip'], errors='coerce')
top_10_zips = top_10_populous['Zip Code ZCTA']
in_top_10 = incident_zips.isin(top_10_zips)

# Series indexed by zip code holding the number of complaints filed there.
result2 = (
    df1.loc[in_top_10, ['complaint_type']]
    .assign(incident_zip=incident_zips[in_top_10].astype('int64'))
    .groupby('incident_zip')['complaint_type']
    .count()
)

# Sanity check: every one of the ten zip codes should also occur in the
# complaints data, so this is expected to display all ten rows.
df1_zip_list = incident_zips.dropna().unique().tolist()
top_10_populous[top_10_populous['Zip Code ZCTA'].isin(df1_zip_list)]


Unnamed: 0,Zip Code ZCTA,2010 Census Population
2748,11368,109931
2720,11226,101572
2753,11373,100820
2714,11220,99598
2759,11385,98592
2527,10467,97060
2451,10025,94600
2702,11208,94469
2729,11236,93877
2701,11207,93386


## Task 3

1) Get total complaints for each borough

2) Get total population for each borough

3) Add a new column "diff" that stores difference between total complaints and total population for each borough

4) Sort dataframe on "diff" in descending order

In [156]:
# Total complaints
# Discard every row that has a missing value in any column, then add up the
# per-(borough, complaint type) counts to get the number of remaining complaints.
# NOTE(review): this also drops complaints that only lack a zip code or city —
# confirm that is intended for the per-borough totals.
df3 = df1.dropna(how='any')
(
    df3
    .groupby(['borough', 'complaint_type'])['complaint_type']
    .count()
    .sum()
)

2336716

In [157]:
# Number of complaints per borough: group the cleaned rows by borough and count
# the complaint_type entries in each group.
total_complaints = df3.groupby(by='borough')['complaint_type'].count()


In [158]:
# Validate that df3 covers all boroughs by displaying the per-borough totals
total_complaints

borough
BRONX            431366
BROOKLYN         744327
MANHATTAN        460975
QUEENS           570341
STATEN ISLAND    125304
Unspecified        4403
Name: complaint_type, dtype: int64

In [159]:
# Turn the per-borough Series into a two-column dataframe: the 'borough' index
# becomes a regular column and the counts are stored as 'total_complaints'.
df4 = total_complaints.reset_index(name='total_complaints')

In [160]:
# Show the per-borough complaint totals
df4

Unnamed: 0,borough,total_complaints
0,BRONX,431366
1,BROOKLYN,744327
2,MANHATTAN,460975
3,QUEENS,570341
4,STATEN ISLAND,125304
5,Unspecified,4403


In [161]:
# Add a 'population' column to df4.
# Every borough receives the placeholder value 0 here, not a real population figure.
# TODO: replace the placeholder with actual per-borough population before
# interpreting any value derived from this column.
df4 = df4.assign(population=0)

In [162]:
# Show df4 with the newly added population column
df4

Unnamed: 0,borough,total_complaints,population
0,BRONX,431366,0
1,BROOKLYN,744327,0
2,MANHATTAN,460975,0
3,QUEENS,570341,0
4,STATEN ISLAND,125304,0
5,Unspecified,4403,0


In [163]:
# Store, for each borough, total complaints minus population in a new 'diff' column.
df4['diff'] = df4['total_complaints'].sub(df4['population'])

In [164]:
# Display the boroughs ordered by 'diff', largest first (df4 itself is not modified).
df4.sort_values(by='diff', ascending=False)

Unnamed: 0,borough,total_complaints,population,diff
1,BROOKLYN,744327,0,744327
3,QUEENS,570341,0,570341
2,MANHATTAN,460975,0,460975
0,BRONX,431366,0,431366
4,STATEN ISLAND,125304,0,125304
5,Unspecified,4403,0,4403
