In [1]:
#imports
import os
import pandas as pd

In [2]:
#path to the city attributes file inside the weather-data folder
#pass the components separately so os.path.join supplies the platform's separator
cities_csv_file = os.path.join('historical-hourly-weather-data', 'city_attributes.csv')

In [3]:
#load the city attributes CSV into a DataFrame
cities_df = pd.read_csv(filepath_or_buffer=cities_csv_file)

In [4]:
#keep just the city and country name columns; nothing else is used below
cities_df = cities_df.loc[:, ['City', 'Country']]

In [5]:
#only need US cities
#.copy() makes the filtered frame independent of the original, so the column
#assignments in later cells do not raise SettingWithCopyWarning
cities_df = cities_df.loc[cities_df['Country'] == 'United States'].copy()

In [6]:
#add states to these cities
#the list is assigned positionally: one entry per remaining row, in row order
#NOTE(review): assumes the row order of city_attributes.csv is unchanged - confirm if the file is updated
cities_df['State'] = [
    'Oregon', 'California', 'Washington', 'California', 'California',
    'Nevada', 'Arizona', 'New Mexico', 'Colorado',
    'Texas', 'Texas', 'Texas',
    'Missouri', 'Minnesota', 'Missouri', 'Illinois', 'Tennessee',
    'Indiana', 'Georgia', 'Michigan', 'Florida', 'North Carolina',
    'Florida', 'Pennsylvania', 'Pennsylvania', 'New York', 'Massachusetts',
]

In [7]:
#load the location-code workbook
codes_xl = os.path.join('FRPP GLC United States.xlsx')
#header=1: column names are taken from the second row of the sheet
#usecols='B,D,E': only spreadsheet columns B, D and E are read
#(presumably the state name, city code and city name columns used by the merge below - confirm)
codes_df = pd.read_excel(
    codes_xl,
    header=1,
    usecols='B,D,E',
)

In [8]:
#upper-case both merge keys before joining against the code table
#(presumably the code table stores names in upper case - confirm)
for merge_key in ('City', 'State'):
    cities_df[merge_key] = cities_df[merge_key].str.upper()
#the country column is no longer needed
cities_df = cities_df.loc[:, ['City', 'State']]

In [9]:
#merge the code table with our cities on the (city name, state name) pair,
#then drop rows that are exact duplicates across all merged columns
merged_cities = (
    pd.merge(codes_df, cities_df, left_on=['City Name', 'State Name'], right_on=['City', 'State'])
    .drop_duplicates()
    #drop=True discards the old row labels instead of inserting them as a throwaway 'index' column
    .reset_index(drop=True)
)

#only need these columns
merged_cities = merged_cities[['City Code', 'City', 'State']]

In [10]:
#load the fatal accident records, one CSV per year
fatal_2016_csv = os.path.join('fatal_2016.csv')
fatal_2015_csv = os.path.join('fatal_2015.csv')
#read 2015 first, then 2016
fatal_2015_df, fatal_2016_df = (
    pd.read_csv(yearly_csv) for yearly_csv in (fatal_2015_csv, fatal_2016_csv)
)

In [11]:
#city code of every recorded accident, one plain list per year (duplicates included)
fatal_2015_codes = fatal_2015_df['city'].tolist()
fatal_2016_codes = fatal_2016_df['city'].tolist()

In [12]:
#keep the per-year lists together, and also build one flat list with 2015 codes followed by 2016 codes
all_codes = [fatal_2015_codes, fatal_2016_codes]
flat_codes = fatal_2015_codes + fatal_2016_codes

In [13]:
#collapse to unique codes; going through a set does not keep the original order
flat_codes = [*set(flat_codes)]

In [14]:
#keep only the cities whose code shows up in at least one fatal accident record
has_fatal_crash = merged_cities['City Code'].isin(flat_codes)
merged_cities = merged_cities.loc[has_fatal_crash]

In [15]:
#city codes of the remaining cities, as a plain list
codes_list = merged_cities['City Code'].tolist()

In [16]:
#select occurrences that occurred in the cities listed above
selected_2015 = fatal_2015_df.loc[fatal_2015_df['city'].isin(codes_list)]
selected_2016 = fatal_2016_df.loc[fatal_2016_df['city'].isin(codes_list)]
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the 2016 rows under the
#2015 rows the same way (each frame's original index labels are kept)
selected_fatal = pd.concat([selected_2015, selected_2016])

In [17]:
#all traffic fatalities from 2015-2016 whose city code matches a city we have weather data for
#renumber the rows 0..n-1 and discard the old index labels
selected_fatal = selected_fatal.reset_index(drop=True)

In [18]:
#row zero above is for North Bend, Oregon. City codes are re-used across states, but we only have the 26 cities of interest.
#we'll have to fix this by matching on the (city code, state) pair instead of the code alone.
#the state names were upper-cased for the earlier merge; title-case them for the join on 'state_name'
#(presumably title-cased in the accident data - confirm).
#assign() returns a new frame rather than writing into the filtered slice, which avoids SettingWithCopyWarning
merged_cities = merged_cities.assign(State=merged_cities['State'].str.title())
trimmed_fatal = pd.merge(selected_fatal, merged_cities, left_on=['city', 'state_name'], right_on=['City Code', 'State'])
#.copy() so that adding columns to trimmed_fatal later writes to an independent frame
trimmed_fatal = trimmed_fatal[['City Code', 'year_of_crash', 'month_of_crash', 'day_of_crash', 'hour_of_crash', 'atmospheric_conditions_name', 'atmospheric_conditions_1_name', 'atmospheric_conditions_2_name', 'number_of_fatalities']].copy()

In [19]:
def datetime_maker(row):
    """Build a crash timestamp string from one accident record.

    Target format: '10/1/2012 12:00' (month/day/year hour:00).

    Parameters
    ----------
    row : mapping-like
        Must support lookup of 'month_of_crash', 'day_of_crash',
        'year_of_crash' and 'hour_of_crash'.

    Returns
    -------
    str
        The four values interpolated as-is (no zero padding), with the
        minutes fixed at '00'.
    """
    return "{}/{}/{} {}:00".format(
        row['month_of_crash'],
        row['day_of_crash'],
        row['year_of_crash'],
        row['hour_of_crash'],
    )

In [20]:
# build one datetime string per accident by passing each row to datetime_maker
trimmed_fatal['crash_time'] = trimmed_fatal.apply(func=datetime_maker, axis='columns')

In [21]:
#keep the output columns; the separate year/month/day/hour fields are replaced by crash_time
trimmed_fatal = trimmed_fatal.loc[
    :,
    [
        'City Code',
        'crash_time',
        'atmospheric_conditions_name',
        'atmospheric_conditions_1_name',
        'atmospheric_conditions_2_name',
        'number_of_fatalities',
    ],
]

In [22]:
#left commented out to avoid recreating the file; uncomment to regenerate fatalities.csv
#trimmed_fatal.to_csv(path_or_buf=os.path.join('fatalities.csv'), index=False)

In [23]:
#title-case the city names (they were upper-cased for the merge) before export
merged_cities = merged_cities.assign(City=merged_cities['City'].str.title())
#export left commented out to avoid recreating the file; uncomment to regenerate location.csv
#merged_cities.to_csv(path_or_buf=os.path.join('location.csv'), index=False)