# PROJECT 5 
Kehinde Ajayi

## CREATE MULTI-YEAR DATAFRAMES FROM FARS DATA

In [2]:
import numpy as np
import pandas as pd

import os



In [15]:
# function for merging data

def get_merged_dataframes(main_df, main_df_columns, csv_files, year, merge_columns):
    """Load one year's main table and outer-merge supplementary tables onto it.

    All files are read from ``./data/{year}_fars/``. Column names are
    lower-cased before any selection or merging.

    Parameters
    ----------
    main_df : str
        Base name (without ``.csv``) of the main file for the year.
    main_df_columns : list of str
        Lower-case column names to keep from the main file; names that are
        absent from the file are silently skipped.
    csv_files : iterable of str
        Base names of the supplementary files to merge in, in order. A file
        that does not exist for the year is reported with ``print`` and
        skipped.
    year : int
        Data year; selects the directory and is stored in ``data_year``.
    merge_columns : str or list of str
        Key column(s) used for every merge.

    Returns
    -------
    pandas.DataFrame
        The merged table with an added ``data_year`` column.

    Raises
    ------
    FileNotFoundError
        If the main file itself is missing.
    KeyError
        If a supplementary file lacks one of the merge keys.
    """
    # Read every file of the year with the same decoding. Previously only the
    # main file used 'unicode_escape', so a supplementary file with non-UTF-8
    # bytes could fail to load even though the main file did not.
    read_options = {'encoding': 'unicode_escape'}
    year_dir = f'./data/{year}_fars'

    df1 = pd.read_csv(f'{year_dir}/{main_df}.csv', **read_options)
    df1.columns = df1.columns.str.lower()
    # Take an explicit copy so the 'data_year' assignment below never writes
    # to a view of df1 (avoids SettingWithCopyWarning when nothing is merged).
    merged_dataframe = df1[df1.columns.intersection(main_df_columns)].copy()

    join_keys = [merge_columns] if isinstance(merge_columns, str) else list(merge_columns)

    for csv_name in csv_files:
        # Keep the try body minimal: only the read can raise FileNotFoundError.
        try:
            df2 = pd.read_csv(f'{year_dir}/{csv_name}.csv', **read_options)
        except FileNotFoundError:
            print(f'The {csv_name} file does not exist for the year {year}.')
            continue

        df2.columns = df2.columns.str.lower()
        # Columns already present on the left side win. Drop the right-hand
        # duplicates up front instead of suffixing them and then deleting by
        # substring match, which also removed unrelated columns whose name
        # happened to contain 'remove'.
        already_present = [
            column for column in df2.columns
            if column in merged_dataframe.columns and column not in join_keys
        ]
        df2 = df2.drop(columns=already_present)
        merged_dataframe = pd.merge(merged_dataframe, df2, how='outer', on=join_keys)

    merged_dataframe['data_year'] = year

    return merged_dataframe

#### Merge all person-level data

In [7]:
def combine_person_data(start_year, end_year=None):
    """Stack the merged person-level tables for a range of data years.

    For each year from ``start_year`` through ``end_year`` (inclusive) the
    'person' file is loaded and merged with the supplementary files via
    ``get_merged_dataframes``; the yearly results are then concatenated with
    a fresh integer index. When ``end_year`` is omitted only ``start_year``
    is processed.
    """
    # Columns to keep from the main 'person' file.
    wanted_fields = [
        'state', 'st_case', 'veh_no', 'per_no', 'county', 'age', 'sex',
        'per_typ', 'inj_sev', 'seat_pos', 'rest_use', 'rest_mis', 'air_bag',
        'ejection', 'ej_path', 'extricat', 'drinking', 'alc_det',
        'alc_status', 'atst_typ', 'alc_res', 'drugs', 'drug_det', 'dstatus',
        'hospital', 'doa', 'death_da', 'death_mo', 'death_yr', 'death_hr',
        'death_mn', 'death_tm', 'n_mot_no', 'location', 'helm_use',
        'helm_mis', 'str_veh',
    ]
    # Supplementary files merged onto the person table, in this order.
    supplementary_files = [
        'nmprior', 'nmcrash', 'nmimpair', 'safetyeq', 'nmdistract', 'drugs',
    ]
    join_keys = ['state', 'st_case', 'veh_no', 'per_no']

    final_year = start_year if end_year is None else end_year

    yearly_frames = []
    for data_year in range(start_year, final_year + 1):
        yearly_frames.append(
            get_merged_dataframes(
                'person', wanted_fields, supplementary_files, data_year, join_keys
            )
        )

    return pd.concat(yearly_frames, ignore_index=True)
        

In [None]:
      
# Build the combined person-level frame for data years 2010 through 2020 (inclusive).
full_person_df = combine_person_data(2010, 2020)


In [None]:
# Display the (rows, columns) dimensions of the combined person-level frame.
full_person_df.shape

In [None]:
# Preview the first rows of the combined person-level frame.
full_person_df.head()

#### Merge all driver-level data

In [18]:
def combine_driver_data(start_year, end_year=None):
    """Stack the merged driver-level tables for a range of data years.

    For each year from ``start_year`` through ``end_year`` (inclusive) the
    driver columns are taken from the 'vehicle' file and merged with the
    supplementary files via ``get_merged_dataframes``; the yearly results are
    then concatenated with a fresh integer index. When ``end_year`` is
    omitted only ``start_year`` is processed.
    """
    # Columns to keep from the main 'vehicle' file.
    wanted_fields = [
        'state', 'st_case', 'veh_no', 'dr_pres', 'l_state', 'dr_zip',
        'l_type', 'l_status', 'cdl_stat', 'l_endors', 'l_compl', 'l_restri',
        'dr_hgt', 'dr_wgt', 'prev_oth', 'first_mo', 'first_yr', 'last_mo',
        'last_yr', 'speedrel',
    ]
    # Supplementary files merged onto the vehicle table, in this order.
    supplementary_files = ['violatn', 'drimpair', 'driverrf']
    join_keys = ['state', 'st_case', 'veh_no']

    final_year = start_year if end_year is None else end_year

    yearly_frames = []
    for data_year in range(start_year, final_year + 1):
        yearly_frames.append(
            get_merged_dataframes(
                'vehicle', wanted_fields, supplementary_files, data_year, join_keys
            )
        )

    return pd.concat(yearly_frames, ignore_index=True)
        

In [None]:
      
# Build the combined driver-level frame for data years 2010 through 2020 (inclusive).
full_driver_df = combine_driver_data(2010, 2020)


In [None]:
# Display the (rows, columns) dimensions of the combined driver-level frame.
full_driver_df.shape

In [None]:
# Preview the first rows of the combined driver-level frame.
full_driver_df.head()

### Save multi-year person- and driver-level data in separate .csv files

In [18]:
# Write the combined person-level frame to CSV, omitting the row index.
full_person_df.to_csv('./data/person-level_data_2010-2020.csv', index=False)

In [19]:
# Write the combined driver-level frame to CSV, omitting the row index.
full_driver_df.to_csv('./data/driver-level_data_2010-2020.csv', index=False)