In [1]:
#imports
import os
import pandas as pd

In [2]:
#get city names
cities_csv_file = os.path.join('historical-hourly-weather-data/city_attributes.csv')

In [3]:
#read file
cities_df = pd.read_csv(cities_csv_file)

In [4]:
#only interested in city and country names
cities_df = cities_df[['City','Country']]

In [5]:
#only need US cities
cities_df = cities_df.loc[cities_df['Country'] == 'United States']

In [6]:
#add states to these cities
cities_df['State'] = ['Oregon', 'California', 'Washington', 'California', 'California', 'Nevada', 'Arizona', 'New Mexico', 'Colorado', 'Texas', 'Texas', 'Texas', 'Missouri', 'Minnesota', 'Missouri', 'Illinois', 'Tennessee', 'Indiana', 'Georgia', 'Michigan', 'Florida', 'North Carolina', 'Florida', 'Pennsylvania', 'Pennsylvania', 'New York', 'Massachusetts']

In [7]:
#get codes
#read file
codes_xl = os.path.join('FRPP GLC United States.xlsx')
codes_df = pd.read_excel(codes_xl, header=1, usecols='B,D,E')

In [8]:
#get ready to merge
cities_df['City'] = cities_df['City'].str.upper()
cities_df['State'] = cities_df['State'].str.upper()
cities_df = cities_df[['City', 'State']]
cities_df

Unnamed: 0,City,State
1,PORTLAND,OREGON
2,SAN FRANCISCO,CALIFORNIA
3,SEATTLE,WASHINGTON
4,LOS ANGELES,CALIFORNIA
5,SAN DIEGO,CALIFORNIA
6,LAS VEGAS,NEVADA
7,PHOENIX,ARIZONA
8,ALBUQUERQUE,NEW MEXICO
9,DENVER,COLORADO
10,SAN ANTONIO,TEXAS


In [9]:
#merge
merged_cities = pd.merge(codes_df, cities_df, left_on=['City Name', 'State Name'], right_on=['City', 'State'])
merged_cities = merged_cities.drop_duplicates()
merged_cities.reset_index(inplace=True)

#only need these columns
merged_cities = merged_cities[['City Code', 'City', 'State']]
merged_cities

Unnamed: 0,City Code,City,State
0,370,PHOENIX,ARIZONA
1,1980,LOS ANGELES,CALIFORNIA
2,3260,SAN DIEGO,CALIFORNIA
3,3290,SAN FRANCISCO,CALIFORNIA
4,600,DENVER,COLORADO
5,1510,JACKSONVILLE,FLORIDA
6,2010,MIAMI,FLORIDA
7,280,ATLANTA,GEORGIA
8,1670,CHICAGO,ILLINOIS
9,2210,INDIANAPOLIS,INDIANA


In [10]:
#get the accident data
fatal_2016_csv = os.path.join('fatal_2016.csv')
fatal_2015_csv = os.path.join('fatal_2015.csv')
fatal_2015_df = pd.read_csv(fatal_2015_csv)
fatal_2016_df = pd.read_csv(fatal_2016_csv)

In [11]:
#get a list of the codes where accidents occurred
fatal_2015_codes = [x for x in fatal_2015_df['city']]

In [12]:
#remove the duplicates from 2015
fatal_2015_codes_trimmed = list(set(fatal_2015_codes))

In [13]:
#remove these duplicates from 2016, too 
fatal_2016_codes = [x for x in fatal_2016_df['city']]
fatal_2016_codes_trimmed = list(set(fatal_2016_codes))

In [14]:
#flatten the list of lists
all_codes = []
all_codes.append(fatal_2015_codes_trimmed)
all_codes.append(fatal_2016_codes_trimmed)
flat_codes = [y for x in all_codes for y in x]

In [15]:
#remove the rest of the dupliactes
flat_codes = list(set(flat_codes))

In [16]:
#remove the codes in codes_trimmed_df that didn't have a fatal accident occur in them
merged_cities = merged_cities.loc[merged_cities['City Code'].isin(flat_codes)]

In [17]:
#df with city, state, and code
merged_cities

Unnamed: 0,City Code,City,State
0,370,PHOENIX,ARIZONA
1,1980,LOS ANGELES,CALIFORNIA
2,3260,SAN DIEGO,CALIFORNIA
3,3290,SAN FRANCISCO,CALIFORNIA
4,600,DENVER,COLORADO
5,1510,JACKSONVILLE,FLORIDA
6,2010,MIAMI,FLORIDA
7,280,ATLANTA,GEORGIA
8,1670,CHICAGO,ILLINOIS
9,2210,INDIANAPOLIS,INDIANA


In [18]:
#get list of codes
codes_list = list(merged_cities['City Code'])

In [19]:
#select occurrences that occurred in the cities listed above
selected_2015 = fatal_2015_df.loc[fatal_2015_df['city'].isin(codes_list)]
selected_2016 = fatal_2016_df.loc[fatal_2016_df['city'].isin(codes_list)]
selected_fatal = selected_2015.append(selected_2016)

In [20]:
#df with all traffic fatalities that occurred in cities during 2015-2016, for which we have weather data
selected_fatal.reset_index(drop=True, inplace=True)
selected_fatal

Unnamed: 0,consecutive_number,state_name,city,day_of_crash,month_of_crash,year_of_crash,day_of_week,hour_of_crash,minute_of_crash,atmospheric_conditions_1,atmospheric_conditions_1_name,atmospheric_conditions_2,atmospheric_conditions_2_name,atmospheric_conditions,atmospheric_conditions_name,number_of_fatalities
0,410415,Oregon,1510,31,12,2015,5,1,35,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1
1,410384,Oregon,1650,9,8,2015,1,2,41,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1
2,40190,Arizona,370,14,3,2015,7,20,18,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1
3,130800,Georgia,280,19,8,2015,4,22,48,8,Other,0,No Additional Atmospheric Conditions,8,Other,1
4,180278,Indiana,2210,22,5,2015,6,22,17,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5803,370994,North Carolina,1730,11,10,2016,3,16,59,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1
5804,40784,Arizona,370,7,11,2016,2,14,23,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1
5805,350158,New Mexico,30,11,5,2016,4,1,2,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1
5806,510505,Virginia,1760,16,9,2016,6,7,37,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1


In [21]:
#row zero above is for Dallas, Oregon. The city code is wrong, because that city with that code isn't in that state
#we'll have to fix this like so:
merged_cities['State'] = merged_cities['State'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
trimmed_fatal = pd.merge(selected_fatal, merged_cities, left_on=['city', 'state_name'], right_on=['City Code', 'State'])
trimmed_fatal

Unnamed: 0,consecutive_number,state_name,city,day_of_crash,month_of_crash,year_of_crash,day_of_week,hour_of_crash,minute_of_crash,atmospheric_conditions_1,atmospheric_conditions_1_name,atmospheric_conditions_2,atmospheric_conditions_2_name,atmospheric_conditions,atmospheric_conditions_name,number_of_fatalities,City Code,City,State
0,410384,Oregon,1650,9,8,2015,1,2,41,98,Not Reported,0,No Additional Atmospheric Conditions,98,Not Reported,1,1650,PORTLAND,Oregon
1,410118,Oregon,1650,17,5,2015,1,19,53,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,2,1650,PORTLAND,Oregon
2,410003,Oregon,1650,1,1,2015,5,18,7,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1,1650,PORTLAND,Oregon
3,410209,Oregon,1650,4,7,2015,7,21,56,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1,1650,PORTLAND,Oregon
4,410243,Oregon,1650,10,8,2015,2,8,57,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1,1650,PORTLAND,Oregon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4986,371174,North Carolina,870,19,11,2016,7,2,18,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,3,870,CHARLOTTE,North Carolina
4987,371082,North Carolina,870,27,10,2016,5,20,8,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1,870,CHARLOTTE,North Carolina
4988,370367,North Carolina,870,29,2,2016,2,16,14,1,Clear,0,No Additional Atmospheric Conditions,1,Clear,1,870,CHARLOTTE,North Carolina
4989,370303,North Carolina,870,2,4,2016,7,4,5,10,Cloudy,0,No Additional Atmospheric Conditions,10,Cloudy,1,870,CHARLOTTE,North Carolina


In [23]:
trimmed_fatal = trimmed_fatal[['City Code', 'year_of_crash', 'month_of_crash', 'day_of_crash', 'hour_of_crash', 'minute_of_crash', 'atmospheric_conditions_name', 'atmospheric_conditions_1_name', 'atmospheric_conditions_2_name', 'number_of_fatalities']]

In [24]:
trimmed_fatal

Unnamed: 0,City Code,year_of_crash,month_of_crash,day_of_crash,hour_of_crash,minute_of_crash,atmospheric_conditions_name,atmospheric_conditions_1_name,atmospheric_conditions_2_name,number_of_fatalities
0,1650,2015,8,9,2,41,Not Reported,Not Reported,No Additional Atmospheric Conditions,1
1,1650,2015,5,17,19,53,Clear,Clear,No Additional Atmospheric Conditions,2
2,1650,2015,1,1,18,7,Clear,Clear,No Additional Atmospheric Conditions,1
3,1650,2015,7,4,21,56,Clear,Clear,No Additional Atmospheric Conditions,1
4,1650,2015,8,10,8,57,Clear,Clear,No Additional Atmospheric Conditions,1
...,...,...,...,...,...,...,...,...,...,...
4986,870,2016,11,19,2,18,Clear,Clear,No Additional Atmospheric Conditions,3
4987,870,2016,10,27,20,8,Clear,Clear,No Additional Atmospheric Conditions,1
4988,870,2016,2,29,16,14,Clear,Clear,No Additional Atmospheric Conditions,1
4989,870,2016,4,2,4,5,Cloudy,Cloudy,No Additional Atmospheric Conditions,1


In [25]:
#commented to avoid recreation of file
#trimmed_fatal.to_csv(path_or_buf=os.path.join('trimmed_fatal.csv')

In [26]:
##commented to avoid recreation of file
#merged_cities.to_csv(path_or_buf=os.path.join('location.csv'))