# Montgomery drivers

In [1]:
from datetime import datetime
import re
from tqdm import tqdm

import pandas as pd
import numpy as np
from difflib import get_close_matches

In [2]:
# Load the raw crash "drivers" export.
# low_memory=False makes pandas infer every column's dtype from the whole file in one pass.
# NOTE(review): the recorded cell output echoes this source line, which looks like a
# pandas DtypeWarning about mixed-type columns — confirm it is gone on re-run.
drivers_raw = pd.read_csv("../data/montgomery_drivers.csv", low_memory=False)

  drivers_raw = pd.read_csv("../data/montgomery_drivers.csv")


#### utils

In [3]:
def describe(column):
    """ counts how often each distinct value occurs in a column, missing values included """
    counts = column.value_counts(dropna=False)
    return counts

In [4]:
def change_to_unknown(string):
    """ maps every "no information" marker to the single label 'UNKNOWN'

    Recognised markers: real missing values (None / NaN), blank or whitespace-only
    text, and the words 'unknown' / 'nan' in any letter case, with or without
    surrounding whitespace. Any other value is returned unchanged.
    """
    if not isinstance(string, str):
        # real missing values used to crash on .lower(); other non-text values pass through
        return 'UNKNOWN' if pd.isna(string) else string
    normalised = string.strip().lower()
    if normalised in ('unknown', 'nan', ''):
        return 'UNKNOWN'
    return string

In [5]:
def clean_substance_abuse(substance):
    """ removes the status words from a substance abuse entry and returns the substance name in upper case """
    cleaned = substance.lower()
    for status_word in ('present', 'contributed', 'detected'):
        cleaned = cleaned.replace(status_word, '')
    cleaned = cleaned.strip()
    # every "combined ..." / "combination ..." entry collapses to one label
    return 'COMBINATION' if 'combin' in cleaned else cleaned.upper()

In [6]:
def map_vehicle_type(vehicle):
    """ collapses a detailed vehicle body type into one of a few broad categories """
    lowered = vehicle.lower()

    def mentions(*keywords):
        return any(keyword in lowered for keyword in keywords)

    # passenger car; descriptions containing 'over' fall through to the rules below
    if mentions('passenger', 'utility', 'pickup', 'van', 'wagon', 'limousine') and not mentions('over'):
        return 'PASSENGER'

    # remaining categories, first match wins
    ordered_rules = (
        (('emergency',), 'EMERGENCY'),
        (('motorcycle', 'moped'), 'MOTORCYCLE'),
        (('bus',), 'BUS'),
        (('truck',), 'TRUCK'),
        (('unknown',), 'UNKNOWN'),
    )
    for keywords, category in ordered_rules:
        if mentions(*keywords):
            return category
    return 'OTHER'

#### main functions

In [14]:
def prepare_data(data):
    """ selects columns, renames them and changes types

    Returns a new frame holding only the driver/vehicle columns used downstream.
    Every column is text except VehicleYear, which is an integer; a missing or
    unparseable year becomes 0. The input frame is not modified.
    """
    column_names = {
        'Report Number': 'ReportNumber',
        'Vehicle ID': 'VehicleCrashKey',
        'Driver At Fault': 'DriverAtFault',
        'Injury Severity': 'DriverInjurySeverity',
        'Driver Substance Abuse': 'DriverSubstanceAbuse',
        'Driver Distracted By': 'DriverDistractedBy',
        'Vehicle Body Type': 'VehicleType',
        'Vehicle Movement': 'VehicleMovement',
        'Vehicle Going Dir': 'VehicleGoingDir',
        'Vehicle Damage Extent': 'VehicleDamageExtent',
        'Speed Limit': 'SpeedLimit',
        'Parked Vehicle': 'ParkedVehicle',
        'Vehicle Year': 'VehicleYear',
        'Vehicle Make': 'VehicleMake',
        'Vehicle Model': 'VehicleModel',
    }
    # parse the year from the raw column: casting the stringified column with
    # astype(int) fails on 'nan' and on float-formatted years such as '2017.0'
    vehicle_year = pd.to_numeric(data['Vehicle Year'], errors='coerce').fillna(0).astype(int)
    prepared = data[list(column_names)].rename(columns=column_names).astype(str)
    prepared['VehicleYear'] = vehicle_year
    return prepared

In [15]:
def handle_nans(data):
    """ handles na values

    SpeedLimit and VehicleYear become integers, with 0 for missing or
    unparseable entries. The descriptive text columns get 'UNKNOWN' for their
    missing markers via change_to_unknown. Returns a new frame.
    """
    data = data.copy()
    # prepare_data casts the frame to str, so a missing value arrives here as the
    # text 'nan' and a plain fillna(0) never matches it; parse to numbers first
    for col in ['SpeedLimit', 'VehicleYear']:
        data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0).astype(int)
    # str columns changed to unknown
    columns_to_unknown = ['DriverSubstanceAbuse', 'DriverDistractedBy', 'VehicleType', 'VehicleMovement', 'VehicleGoingDir',
                          'VehicleDamageExtent', 'VehicleMake', 'VehicleModel']
    for col in columns_to_unknown:
        data[col] = data[col].apply(change_to_unknown)
    return data

In [16]:
def transform_columns(data):
    """ performs necessary transformations: boolean flags, cleaned substance names, general vehicle types, sane years """
    result = data.copy()
    newest_allowed_year = datetime.now().year + 1

    def is_yes(value):
        return value == 'Yes'

    def mentions_contributed(text):
        return 'contributed' in text.lower()

    def drop_impossible_year(year):
        # years before 1900 or later than next year are replaced by 0
        if year < 1900 or year > newest_allowed_year:
            return 0
        return year

    result['DriverAtFault'] = result['DriverAtFault'].apply(is_yes)
    # the flag must be derived before the substance text is cleaned, because cleaning removes the word
    result['SubstanceAbuseContributed'] = result['DriverSubstanceAbuse'].apply(mentions_contributed)
    result['DriverSubstanceAbuse'] = result['DriverSubstanceAbuse'].apply(clean_substance_abuse)
    result['VehicleType'] = result['VehicleType'].apply(map_vehicle_type)
    result['ParkedVehicle'] = result['ParkedVehicle'].apply(is_yes)
    result['VehicleYear'] = result['VehicleYear'].apply(drop_impossible_year)
    return result

In [17]:
# Run the drivers cleaning stages in order: select/rename -> missing values -> transformations.
drivers_prep = prepare_data(drivers_raw)
drivers_nonan = handle_nans(drivers_prep)
drivers_safe = transform_columns(drivers_nonan)

In [18]:
# Display the cleaned drivers frame.
drivers_safe

Unnamed: 0,ReportNumber,VehicleCrashKey,DriverAtFault,DriverInjurySeverity,DriverSubstanceAbuse,DriverDistractedBy,VehicleType,VehicleMovement,VehicleGoingDir,VehicleDamageExtent,SpeedLimit,ParkedVehicle,VehicleYear,VehicleMake,VehicleModel,SubstanceAbuseContributed
0,MCP3170003V,4E492574-893B-4EB1-ADCA-53FDD633D6C4,True,NO APPARENT INJURY,NONE,LOOKED BUT DID NOT SEE,PASSENGER,MOVING CONSTANT SPEED,South,FUNCTIONAL,0,False,2017,LEXUS,SUV,False
1,MCP3254003K,6D16232C-4E1E-49A6-B3A2-7FDEF7E506F2,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,West,FUNCTIONAL,35,False,2010,TOYT,PRIUS,False
2,EJ7887003Q,82ED056D-33FA-44A5-AA2B-8DCE653FA03C,False,SUSPECTED MINOR INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,South,FUNCTIONAL,35,False,2021,SUBARU,FORRESTER,False
3,MCP2674004J,3CD099CA-F5FF-4174-B184-02BCF2C89934,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,EMERGENCY,MOVING CONSTANT SPEED,South,FUNCTIONAL,40,False,2019,DODGE,CHARGER,False
4,MCP25280008,64940511-7ACF-4F66-9A1B-5F476A46C2A5,True,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,North,DISABLING,20,False,2014,NISSAN,ROGUE,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172100,DM8338000C,81DAD2F7-8D4A-4316-B40D-FAD4A4587B28,False,NO APPARENT INJURY,UNKNOWN,NOT DISTRACTED,EMERGENCY,SLOWING OR STOPPING,East,SUPERFICIAL,25,False,2016,FORD,EXPLORER,False
172101,MCP1182001S,DF427BFB-94E2-454E-9C46-82C8E286B992,True,NO APPARENT INJURY,UNKNOWN,LOOKED BUT DID NOT SEE,PASSENGER,SLOWING OR STOPPING,East,DISABLING,35,False,2008,NISSAN,ALTIMA,False
172102,MCP1453008X,CD875BDC-5B10-47F6-B3E7-7C091A71FB5D,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,SLOWING OR STOPPING,North,FUNCTIONAL,40,False,2008,CHEVROLET,EXPRESS,False
172103,MCP2568000M,15D3D66B-8337-4E7A-BAE6-855C02AE1CA7,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MAKING LEFT TURN,South,FUNCTIONAL,35,False,2018,FORD,TRANSIT VAN,False


## Vehicles

In [19]:
# Load the raw vehicles data.
# low_memory=False makes pandas infer every column's dtype from the whole file in one pass.
# NOTE(review): the recorded cell output echoes this source line, which looks like a
# pandas DtypeWarning about mixed-type columns — confirm it is gone on re-run.
vehicles_raw = pd.read_csv("../data/vehicles.csv", low_memory=False)

  vehicles_raw = pd.read_csv("../data/vehicles.csv")


In [20]:
# List the columns available in the raw vehicles data.
vehicles_raw.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'baseModel',
       'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn'

In [21]:
def prepare_vehicles_data(data):
    """ selects vehicle columns, renames them, makes the numeric columns numeric and sorts by key

    Returns a new frame ordered by VehicleKey; the input frame is not modified.
    Numeric columns keep their natural dtype (whole-number columns stay integer).
    """
    column_names = {
        'id': 'VehicleKey',
        'make': 'Make',
        'baseModel': 'BaseModel',
        'model': 'Model',
        'year': 'Year',
        'VClass': 'BodyClass',
        'cylinders': 'Cylinders',
        'displ': 'Displacement',
        'trany': 'Transmission',
        'drive': 'Drivetrain',
        'fuelType1': 'FuelType',
        'city08': 'CityMPG',
        'highway08': 'HighwayMPG',
    }
    # rename() yields a fresh frame, so the assignments below never write into a slice of the input
    prepared = data[list(column_names)].rename(columns=column_names)
    # The old `data.astype(str)` call discarded its result, and the float cast went through
    # a .loc assignment whose effect on dtypes is not reliable (the recorded output shows
    # integer Year/MPG values). Parse explicitly instead: to_numeric keeps integers as
    # integers, so Year still formats as '1984' when vehicle keys are built from it.
    for col in ['VehicleKey', 'Year', 'Cylinders', 'Displacement', 'CityMPG', 'HighwayMPG']:
        prepared[col] = pd.to_numeric(prepared[col])
    return prepared.sort_values('VehicleKey')

In [22]:
def handle_nans_vehicles(data):
    """ fills missing values in the vehicles frame

    Cylinders, Displacement and Year get 0; the text columns get 'Unknown'.
    Works on a copy, so the frame passed in is left untouched.
    """
    data = data.copy()
    data[['Cylinders', 'Displacement']] = data[['Cylinders', 'Displacement']].fillna(0)
    # Year is numeric: use 0 for "unknown year" (as generate_blank_models and map_year do)
    # rather than mixing the text 'Unknown' into a numeric column
    data['Year'] = data['Year'].fillna(0).astype(int)
    columns_to_fill = ['Make', 'BaseModel', 'Model', 'BodyClass', 'Transmission', 'Drivetrain', 'FuelType']
    for col in columns_to_fill:
        data[col] = data[col].fillna('Unknown')
    return data

In [23]:
def transform_transmission(trans):
    """ shortens a transmission description to '<Automatic|Manual> <gears>'

    The gear count is the first number found in the text; 'CVT' is used when
    there is none. 'Unknown' and descriptions that are neither automatic nor
    manual are returned unchanged.
    """
    if trans == 'Unknown':
        return trans
    try:
        first_number = re.search(r'\d+', trans)
    except Exception:
        first_number = None
    gears = first_number.group(0) if first_number else 'CVT'
    for kind in ('Automatic', 'Manual'):
        if kind in trans:
            return f"{kind} {gears}"
    return trans

In [24]:
def transform_drivetrain(drive):
    """ abbreviates a drivetrain description (FWD, RWD, AWD, 4WD, 2WD); unrecognised values are returned unchanged """
    abbreviations = {
        "Front-Wheel Drive": 'FWD',
        "Rear-Wheel Drive": "RWD",
        "4-Wheel or All-Wheel Drive": "AWD",
        "All-Wheel Drive": "AWD",
        "4-Wheel Drive": "4WD",
        "Part-time 4-Wheel Drive": "4WD",
        '2-Wheel Drive': "2WD",
    }
    return abbreviations.get(drive, drive)

In [25]:
def generate_blank_models(data, brands=None):
    """ appends one placeholder row (BaseModel 'Unknown', Year 0) for every known make

    data:   vehicles frame with a 'Make' column.
    brands: optional mapping whose values are make names to include in addition to
            the makes found in data; defaults to the notebook-level brands_dict
            (loaded from car_makes.txt).

    Returns a new frame with a fresh index; the input frame is not modified.
    """
    if brands is None:
        # the global is only looked up when no mapping is passed in
        brands = brands_dict
    makes = list(data['Make'].unique()) + list(brands.values())
    blank_rows = [
        {'Make': make, 'Year': 0, 'BaseModel': "Unknown", 'VehicleKey': '', 'BodyClass': '', 'Cylinders': 0,
         'Displacement': 0, 'Transmission': '', 'Drivetrain': '', 'FuelType': '', 'CityMPG': 0, 'HighwayMPG': 0}
        for make in np.unique(makes)
    ]
    # a single concat instead of one per make: growing the frame inside the loop
    # re-copies every existing row on each iteration
    return pd.concat([data, pd.DataFrame(blank_rows)], ignore_index=True)

In [26]:
def aggregate_models(data):
    """ collapses model variants into one row per make, year and base model, keeping the most frequent value of every other column """
    def most_frequent(values):
        return values.mode().iloc[0]

    grouped = data.drop(columns=['Model']).groupby(['Make', 'Year', 'BaseModel'])
    return grouped.agg(most_frequent).reset_index()

In [27]:
def generate_vehicle_key(make, model, year):
    """ builds a vehicle key: make and model with spaces removed, followed by the year """
    compact_make = "".join(make.split(' '))
    compact_model = "".join(model.split(' '))
    return f"{compact_make}{compact_model}{year}"

In [28]:
def transform_vehicle_data(data):
    """ normalises transmission/drivetrain labels, adds blank models, aggregates variants and rebuilds the keys

    Returns a new frame with one row per (Make, Year, BaseModel) and a VehicleKey
    built from those three values. Works on a copy, so the frame passed in is
    left untouched.
    """
    # copy first: the column assignments below would otherwise write into the caller's frame
    data = data.copy()
    data['Transmission'] = data['Transmission'].apply(transform_transmission)
    data['Drivetrain'] = data['Drivetrain'].apply(transform_drivetrain)
    # generate blank objects for each brand
    data = generate_blank_models(data)
    # aggregate models
    data = aggregate_models(data)
    # generate keys
    data['VehicleKey'] = data.apply(lambda x: generate_vehicle_key(x.Make, x.BaseModel, x.Year), axis=1)
    return data

In [31]:
# Run the vehicles stages in order: select/rename -> missing values -> transformations.
# transform_vehicle_data calls generate_blank_models, which reads brands_dict, so the
# cell that loads car_makes.txt has to run before this one.
vehicles_prep = prepare_vehicles_data(vehicles_raw)
vehicles_nan = handle_nans_vehicles(vehicles_prep)
vehicles = transform_vehicle_data(vehicles_nan)

In [32]:
# Display the aggregated vehicles frame.
vehicles

Unnamed: 0,Make,Year,BaseModel,VehicleKey,BodyClass,Cylinders,Displacement,Transmission,Drivetrain,FuelType,CityMPG,HighwayMPG
0,AM General,0,Unknown,AMGeneralUnknown0,,0.0,0.0,,,,0,0
1,AM General,1984,DJ Po Vehicle,AMGeneralDJPoVehicle1984,Special Purpose Vehicle 2WD,4.0,2.5,Automatic 3,2WD,Regular Gasoline,18,17
2,AM General,1984,FJ8c Post Office,AMGeneralFJ8cPostOffice1984,Special Purpose Vehicle 2WD,6.0,4.2,Automatic 3,2WD,Regular Gasoline,13,13
3,AM General,1985,Post Office DJ5,AMGeneralPostOfficeDJ51985,Special Purpose Vehicle 2WD,4.0,2.5,Automatic 3,RWD,Regular Gasoline,16,17
4,AM General,1985,Post Office DJ8,AMGeneralPostOfficeDJ81985,Special Purpose Vehicle 2WD,6.0,4.2,Automatic 3,RWD,Regular Gasoline,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...
11632,smart,2015,fortwo,smartfortwo2015,Two Seaters,0.0,0.0,Automatic 1,RWD,Electricity,34,38
11633,smart,2016,fortwo,smartfortwo2016,Two Seaters,0.0,0.0,Automatic 1,RWD,Electricity,122,39
11634,smart,2017,fortwo,smartfortwo2017,Two Seaters,3.0,0.9,Automatic 1,RWD,Premium Gasoline,31,38
11635,smart,2018,fortwo,smartfortwo2018,Two Seaters,0.0,0.0,Automatic 1,RWD,Electricity,112,91


### Creating mappers

In [149]:
# Keep only the rows whose make occurs more than 10 times in the crash data.
# NOTE(review): this cell used to read from `drivers`, which no visible cell defines, so it
# failed on a fresh kernel. drivers_safe is the cleaned frame that still carries the
# VehicleMake column — confirm it is the intended source.
make_counts = drivers_safe['VehicleMake'].value_counts()
filtered_drivers = drivers_safe[drivers_safe['VehicleMake'].isin(make_counts[make_counts > 10].index)]

In [151]:
# Show how often each of the remaining makes occurs.
filtered_drivers['VehicleMake'].value_counts()

VehicleMake
TOYOTA           23171
HONDA            18870
FORD             17138
TOYT              8840
NISSAN            8525
                 ...  
POSTAL              11
PORCHE              11
FREIGHT LINER       11
TRUCK               11
GENS                11
Name: count, Length: 233, dtype: int64

In [153]:
# Save the distinct makes from the filtered crash data, one per line
# (presumably the input for the hand-made mapping in car_makes.txt — verify).
unique_makes_d = filtered_drivers['VehicleMake'].unique()
with open('unique_makes_to_map.txt', 'w') as file:
    file.writelines(f"{make}\n" for make in unique_makes_d)

In [143]:
# Save the distinct makes found in the vehicles frame, one per line.
unique_makes = vehicles['Make'].unique()
with open('unique_makes.txt', 'w') as file:
    file.writelines(f"{make}\n" for make in unique_makes)

#### mapping

In [30]:
# Create the dictionary for searching makes: 'unique_makes_to_map' value -> 'unique_makes' value.
# brands_dict is read by generate_blank_models and map_makes, so this cell must run before them.
cars_mapper = pd.read_csv("../data/car_makes.txt")
brands_dict = dict(zip(cars_mapper['unique_makes_to_map'], cars_mapper['unique_makes']))

In [33]:
# Create the dictionary for searching models: (year, make) -> list of base models.
#
# The aggregated vehicles data (columns: year, make, base model) needs to be kept as a
# static file and updated when new cars are fetched: for example load it into a dataframe,
# concatenate it with the new rows, drop duplicates and save it again. If the new data
# contains duplicates (make/model/year — can be checked against our static file), drop them too.

models_dict = {}

for row_label, vehicle in tqdm(vehicles.iterrows()):
    year_and_make = (vehicle['Year'], vehicle['Make'])
    # kept as a plain dict on purpose: map_models relies on KeyError for unseen (year, make) pairs
    models_dict.setdefault(year_and_make, []).append(vehicle['BaseModel'])

11637it [00:00, 15517.81it/s]


In [34]:
def map_makes(make):
    """ maps a crash data make to a fuel economy make

    An exact entry in brands_dict wins. Otherwise the closest make name among the
    brands_dict values is used (case-insensitive, similarity cutoff 0.5), and
    'Unknown' is returned when nothing is close enough.
    """
    mapped = brands_dict.get(make, 'No match found')
    if mapped != 'No match found':
        return mapped

    # no usable exact entry: fall back to fuzzy matching on the known make names
    known_makes = list(brands_dict.values())
    known_makes_lower = [name.lower() for name in known_makes]
    closest = get_close_matches(make.lower(), known_makes_lower, n=1, cutoff=0.5)
    if not closest:
        return 'Unknown'
    # recover the original spelling of the matched name
    return known_makes[known_makes_lower.index(closest[0])]

In [35]:
def map_models(model, make, year):
    """ maps a crash data model to the closest fuel economy base model of the same make and year

    Returns 'Unknown' for the placeholder models '4s' / 'tk', for a (year, make)
    pair missing from models_dict, and when no base model is similar enough
    (case-insensitive, similarity cutoff 0.2).
    """
    wanted = model.lower()
    if wanted in ('4s', 'tk'):
        return 'Unknown'

    try:
        known_models = models_dict[(year, make)]
    except KeyError:
        return 'Unknown'

    # TODO (idea from the original notes): if make == 'BMW' and the model is only
    # numbers, take the first digit — map "3 series" -> 3 etc.
    candidates = list(set(known_models))
    candidates_lower = [name.lower() for name in candidates]
    closest = get_close_matches(wanted, candidates_lower, n=1, cutoff=0.2)
    if not closest:
        return 'Unknown'
    # recover the original spelling of the matched base model
    return candidates[candidates_lower.index(closest[0])]


In [36]:
def map_year(year, model):
    """ returns 0 when the model is 'Unknown', otherwise the year unchanged """
    return 0 if model == 'Unknown' else year

#### mapping pipeline

In [37]:
def mapping_pipeline(data):
    """ replaces the raw make / model / year columns with a single VehicleKey column

    Make, model and year are mapped in that order (each step uses the previous
    result), the key is generated from the mapped values, and both the raw and
    the intermediate columns are dropped. Returns a new frame.
    """
    mapped = data.copy()

    def model_for(row):
        return map_models(row['VehicleModel'], row['MappedMake'], row['VehicleYear'])

    def year_for(row):
        return map_year(row['VehicleYear'], row['MappedModel'])

    def key_for(row):
        return generate_vehicle_key(row['MappedMake'], row['MappedModel'], row['MappedYear'])

    mapped['MappedMake'] = mapped['VehicleMake'].apply(map_makes)
    mapped['MappedModel'] = mapped.apply(model_for, axis=1)
    mapped['MappedYear'] = mapped.apply(year_for, axis=1)
    mapped['VehicleKey'] = mapped.apply(key_for, axis=1)

    no_longer_needed = ['VehicleYear', 'VehicleMake', 'VehicleModel', 'MappedMake', 'MappedModel', 'MappedYear']
    return mapped.drop(columns=no_longer_needed)

In [38]:
# Replace the make/model/year columns of the cleaned drivers frame with a VehicleKey.
drivers_pipe = mapping_pipeline(drivers_safe)

In [46]:
# calculate vehicles crashed total
crashed_total = drivers_pipe.groupby('ReportNumber').agg(
    VehiclesCrashedTotal=pd.NamedAgg('ReportNumber', 'count')).reset_index()

In [47]:
# attach the per-report total to every driver row
drivers_final = pd.merge(drivers_pipe, crashed_total, on='ReportNumber')

In [48]:
# Preview the first three rows of the final drivers table.
drivers_final.head(3)

Unnamed: 0,ReportNumber,VehicleCrashKey,DriverAtFault,DriverInjurySeverity,DriverSubstanceAbuse,DriverDistractedBy,VehicleType,VehicleMovement,VehicleGoingDir,VehicleDamageExtent,SpeedLimit,ParkedVehicle,SubstanceAbuseContributed,VehicleKey,VehiclesCrashedTotal
0,MCP3170003V,4E492574-893B-4EB1-ADCA-53FDD633D6C4,True,NO APPARENT INJURY,NONE,LOOKED BUT DID NOT SEE,PASSENGER,MOVING CONSTANT SPEED,South,FUNCTIONAL,0,False,False,LexusLS2017,2
1,MCP3170003V,904F2114-2B56-46D9-B105-356073FA1D8A,True,NO APPARENT INJURY,NONE,LOOKED BUT DID NOT SEE,BUS,MOVING CONSTANT SPEED,North,NO DAMAGE,0,False,False,FreightlinerUnknown0,2
2,MCP3254003K,6D16232C-4E1E-49A6-B3A2-7FDEF7E506F2,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,West,FUNCTIONAL,35,False,False,ToyotaPrius2010,1


In [49]:
# Write the final drivers table to ../data/dwh/DriversFact.csv without the index column.
drivers_final.to_csv("../data/dwh/DriversFact.csv", index=False)