In [1]:
import pandas as pd
import numpy as np
import os
# Load the raw collision records (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="Data/vehicle_collisions.csv")

In [2]:
# Keep only the columns needed in this part: the borough and the five vehicle-type columns.
# The columns are selected by name rather than dropped by position in place, so the cell
# does not depend on the CSV's column order and can be re-run without removing more columns.
columns_needed = ['BOROUGH', 'VEHICLE 1 TYPE', 'VEHICLE 2 TYPE', 'VEHICLE 3 TYPE',
                  'VEHICLE 4 TYPE', 'VEHICLE 5 TYPE']
df = df[columns_needed].copy()
df.head()

Unnamed: 0,BOROUGH,VEHICLE 1 TYPE,VEHICLE 2 TYPE,VEHICLE 3 TYPE,VEHICLE 4 TYPE,VEHICLE 5 TYPE
0,QUEENS,SPORT UTILITY/STATION WAGON,,,,
1,,PASSENGER VEHICLE,,,,
2,BROOKLYN,PASSENGER VEHICLE,UNKNOWN,,,
3,BROOKLYN,BUS,PASSENGER VEHICLE,,,
4,,UNKNOWN,PASSENGER VEHICLE,,,


In [3]:
# Turn the vehicle-type columns into 0/1 presence flags: a missing value (NaN) becomes 0
# and any recorded type becomes 1. Missing boroughs are labelled 'UNKNOWN'.
# The flags are summed later into the single Vehicles_Involved column.
df = df.fillna({
    'VEHICLE 1 TYPE': 0,
    'VEHICLE 2 TYPE': 0,
    'VEHICLE 3 TYPE': 0,
    'VEHICLE 4 TYPE': 0,
    'VEHICLE 5 TYPE': 0,
    'BOROUGH': 'UNKNOWN',
})
# After the fill, "differs from 0" is exactly "a vehicle type was recorded".
df = df.assign(**{
    type_column: df[type_column].ne(0).astype('int64')
    for type_column in ['VEHICLE 1 TYPE', 'VEHICLE 2 TYPE', 'VEHICLE 3 TYPE',
                        'VEHICLE 4 TYPE', 'VEHICLE 5 TYPE']
})

In [4]:
# Number of vehicles involved in each collision: the row-wise total of the five 0/1 flags.
df['Vehicles_Involved'] = df[['VEHICLE 1 TYPE', 'VEHICLE 2 TYPE', 'VEHICLE 3 TYPE',
                              'VEHICLE 4 TYPE', 'VEHICLE 5 TYPE']].sum(axis=1)

In [5]:
# Preview the first five rows to check the flags and the Vehicles_Involved total.
df.head(n=5)

Unnamed: 0,BOROUGH,VEHICLE 1 TYPE,VEHICLE 2 TYPE,VEHICLE 3 TYPE,VEHICLE 4 TYPE,VEHICLE 5 TYPE,Vehicles_Involved
0,QUEENS,1,0,0,0,0,1
1,UNKNOWN,1,0,0,0,0,1
2,BROOKLYN,1,1,0,0,0,2
3,BROOKLYN,1,1,0,0,0,2
4,UNKNOWN,1,1,0,0,0,2


In [6]:
# Count collisions per borough, broken down by the number of vehicles involved.
# Built with a single cross-tabulation; no per-borough DataFrame.append is needed
# (that method was removed in pandas 2.0).
count_columns = ['UNKNOWN_VEHICLES_INVOLVED', 'ONE_VEHICLE_INVOLVED', 'TWO_VEHICLES_INVOLVED',
                 'THREE_VEHICLES_INVOLVED', 'FOUR_VEHICLES_INVOLVED', 'FIVE_VEHICLES_INVOLVED']

counts = pd.crosstab(df['BOROUGH'], df['Vehicles_Involved'])

# Force all six count columns (0 through 5) to exist, so a borough or data set that lacks
# one of the counts gets a 0 instead of breaking the labelling below.
# Boroughs are kept in order of first appearance in df.
counts = counts.reindex(index=df['BOROUGH'].unique(),
                        columns=list(range(len(count_columns))),
                        fill_value=0)
counts.columns = count_columns  # position i = collisions with i vehicles (0 = no type recorded)

# Collisions with four and five vehicles are reported together.
counts['MORE_VEHICLES_INVOLVED'] = counts['FOUR_VEHICLES_INVOLVED'] + counts['FIVE_VEHICLES_INVOLVED']

# Expose the borough as a regular first column and give the rows a plain 0..n-1 index.
counts.insert(0, 'BOROUGH', counts.index)
wholedf = counts[['BOROUGH', 'ONE_VEHICLE_INVOLVED', 'TWO_VEHICLES_INVOLVED',
                  'THREE_VEHICLES_INVOLVED', 'MORE_VEHICLES_INVOLVED',
                  'UNKNOWN_VEHICLES_INVOLVED']].reset_index(drop=True)
wholedf.head()

Unnamed: 0,BOROUGH,ONE_VEHICLE_INVOLVED,TWO_VEHICLES_INVOLVED,THREE_VEHICLES_INVOLVED,MORE_VEHICLES_INVOLVED,UNKNOWN_VEHICLES_INVOLVED
0,QUEENS,12962,70260,4498,1420,515
0,UNKNOWN,22357,104045,9210,2508,1222
0,BROOKLYN,17375,80207,4980,1612,787
0,MANHATTAN,13318,66958,2024,501,801
0,BRONX,8627,34385,1962,625,525


In [7]:
# Write the per-borough summary to a CSV file.
# exist_ok=True creates the directory only when it is missing, without a separate
# exists() check that could race with another process creating it.
os.makedirs("Output", exist_ok=True)
# index=False: the row index carries no information, so it is left out of the file.
wholedf.to_csv('Output/Vehicle_collisions_Part2.csv', index=False)