# Montgomery drivers

In [135]:
from datetime import datetime
import re

import pandas as pd
import numpy as np

In [3]:
# Load the raw driver records; the path is relative to the current working directory.
drivers_raw = pd.read_csv("../data/montgomery_drivers.csv")

  drivers_raw = pd.read_csv("../data/montgomery_drivers.csv")


#### utils

In [4]:
def describe(column):
    """Return how often each distinct value occurs in ``column``.

    Missing values are counted as their own entry instead of being dropped.
    """
    frequencies = column.value_counts(dropna=False)
    return frequencies

In [5]:
def change_to_unknown(string):
    """Normalise placeholder text to the single label ``'UNKNOWN'``.

    Any casing of ``'unknown'``, the literal ``'nan'`` and the empty string
    are all mapped to ``'UNKNOWN'``; every other value is returned unchanged.
    """
    if string.lower() == 'unknown':
        return 'UNKNOWN'
    if string == 'nan' or string == '':
        return 'UNKNOWN'
    return string

In [6]:
def clean_substance_abuse(substance):
    """Reduce a substance-abuse description to the bare substance name.

    The status words 'present', 'contributed' and 'detected' are removed
    (case-insensitively) and the remainder is trimmed. Anything mentioning
    'combin' collapses to 'COMBINATION'; otherwise the upper-cased
    remainder is returned.
    """
    cleaned = substance.lower()
    for status_word in ('present', 'contributed', 'detected'):
        cleaned = cleaned.replace(status_word, '')
    cleaned = cleaned.strip()
    return 'COMBINATION' if 'combin' in cleaned else cleaned.upper()

In [7]:
def map_vehicle_type(vehicle):
    """Collapse a detailed vehicle body type into a coarse category.

    Matching is by case-insensitive substring and the first matching rule
    wins. The passenger rule is skipped when the description contains
    'over'. Returns one of 'PASSENGER', 'EMERGENCY', 'MOTORCYCLE', 'BUS',
    'TRUCK', 'UNKNOWN' or 'OTHER'.
    """
    description = vehicle.lower()

    def mentions(*keywords):
        return any(keyword in description for keyword in keywords)

    passenger_keywords = ('passenger', 'utility', 'pickup', 'van', 'wagon', 'limousine')
    if 'over' not in description and mentions(*passenger_keywords):
        return 'PASSENGER'

    # Remaining categories, checked in priority order.
    ordered_rules = (
        ('EMERGENCY', ('emergency',)),
        ('MOTORCYCLE', ('motorcycle', 'moped')),
        ('BUS', ('bus',)),
        ('TRUCK', ('truck',)),
        ('UNKNOWN', ('unknown',)),
    )
    for category, keywords in ordered_rules:
        if mentions(*keywords):
            return category
    return 'OTHER'

#### main functions

In [8]:
def prepare_data(data):
    """Select the driver columns of interest, rename them and set their types.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw driver records; must contain every source column listed in
        ``column_names`` below. Extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns. Every column is converted with
        ``astype(str)`` except ``VehicleYear``, which is an integer column in
        which missing or unparseable years are stored as 0.
    """
    # Explicit source -> target mapping so a rename can never be misaligned
    # with the column selection.
    column_names = {
        'Report Number': 'ReportNumber',
        'Vehicle ID': 'VehicleID',
        'Driver At Fault': 'DriverAtFault',
        'Injury Severity': 'InjurySeverity',
        'Driver Substance Abuse': 'DriverSubstanceAbuse',
        'Driver Distracted By': 'DriverDistractedBy',
        'Vehicle Body Type': 'VehicleType',
        'Vehicle Movement': 'VehicleMovement',
        'Vehicle Going Dir': 'VehicleGoingDir',
        'Speed Limit': 'SpeedLimit',
        'Parked Vehicle': 'ParkedVehicle',
        'Vehicle Year': 'VehicleYear',
        'Vehicle Make': 'VehicleMake',
        'Vehicle Model': 'VehicleModel',
    }
    prepared = data[list(column_names)].rename(columns=column_names).astype(str)
    # A plain astype(int) on the stringified years raises as soon as the CSV
    # holds a missing year (strings such as '2017.0' or 'nan'). Coerce to
    # numeric first and use 0 as the "unknown year" sentinel.
    years = pd.to_numeric(prepared['VehicleYear'], errors='coerce')
    prepared['VehicleYear'] = years.fillna(0).astype(int)
    return prepared

In [9]:
def handle_nans(data):
    """Replace missing values in the prepared driver table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the renamed driver columns. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy in which ``SpeedLimit`` and ``VehicleYear`` are integer columns
        with 0 for missing or unparseable values, and the descriptive text
        columns use ``'UNKNOWN'`` for missing / placeholder values.
    """
    data = data.copy()
    # The columns may arrive as strings, in which case a missing value is the
    # literal 'nan' and a plain fillna(0) would never match it. Coerce to
    # numeric first so missing values really become 0.
    for col in ('SpeedLimit', 'VehicleYear'):
        data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0).astype(int)
    # str columns changed to unknown
    columns_to_unknown = ['DriverSubstanceAbuse', 'DriverDistractedBy', 'VehicleType', 'VehicleMovement', 'VehicleGoingDir', 'VehicleMake', 'VehicleModel']
    for col in columns_to_unknown:
        # Fill genuine NaN first: change_to_unknown only accepts strings.
        data[col] = data[col].fillna('UNKNOWN').apply(change_to_unknown)
    return data

In [10]:
def transform_columns(data):
    """Derive boolean flags and normalised categories on a copy of ``data``.

    - ``DriverAtFault`` / ``ParkedVehicle``: True only for the value 'Yes'.
    - ``SubstanceAbuseContributed``: new column, True when the substance
      description mentions 'contributed' (any casing).
    - ``DriverSubstanceAbuse``: reduced to the substance name.
    - ``VehicleType``: mapped to a coarse category.
    - ``VehicleYear``: years before 1900 or later than next year become 0.
    """
    def is_yes(answer):
        return answer == 'Yes'

    def substance_contributed(description):
        return 'contributed' in description.lower()

    def valid_year_or_zero(year):
        # Evaluated per value, as the current year is looked up on each call.
        if year < 1900 or year > (datetime.now().year + 1):
            return 0
        return year

    result = data.copy()
    result['DriverAtFault'] = result['DriverAtFault'].apply(is_yes)
    # Must be derived before the substance text is cleaned, because cleaning
    # strips the word 'contributed'.
    result['SubstanceAbuseContributed'] = result['DriverSubstanceAbuse'].apply(substance_contributed)
    result['DriverSubstanceAbuse'] = result['DriverSubstanceAbuse'].apply(clean_substance_abuse)
    result['VehicleType'] = result['VehicleType'].apply(map_vehicle_type)
    result['ParkedVehicle'] = result['ParkedVehicle'].apply(is_yes)
    result['VehicleYear'] = result['VehicleYear'].apply(valid_year_or_zero)
    return result

In [11]:
# Driver pipeline: select/rename/type columns -> fill missing values -> derive features.
# Each stage keeps its own variable so intermediate frames can be inspected.
drivers_prep = prepare_data(drivers_raw)
drivers_nonan = handle_nans(drivers_prep)
drivers = transform_columns(drivers_nonan)

In [12]:
# Display the fully processed driver table.
drivers

Unnamed: 0,ReportNumber,VehicleID,DriverAtFault,InjurySeverity,DriverSubstanceAbuse,DriverDistractedBy,VehicleType,VehicleMovement,VehicleGoingDir,SpeedLimit,ParkedVehicle,VehicleYear,VehicleMake,VehicleModel,SubstanceAbuseContributed
0,MCP3170003V,4E492574-893B-4EB1-ADCA-53FDD633D6C4,True,NO APPARENT INJURY,NONE,LOOKED BUT DID NOT SEE,PASSENGER,MOVING CONSTANT SPEED,South,0,False,2017,LEXUS,SUV,False
1,MCP3254003K,6D16232C-4E1E-49A6-B3A2-7FDEF7E506F2,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,West,35,False,2010,TOYT,PRIUS,False
2,EJ7887003Q,82ED056D-33FA-44A5-AA2B-8DCE653FA03C,False,SUSPECTED MINOR INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,South,35,False,2021,SUBARU,FORRESTER,False
3,MCP2674004J,3CD099CA-F5FF-4174-B184-02BCF2C89934,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,EMERGENCY,MOVING CONSTANT SPEED,South,40,False,2019,DODGE,CHARGER,False
4,MCP25280008,64940511-7ACF-4F66-9A1B-5F476A46C2A5,True,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MOVING CONSTANT SPEED,North,20,False,2014,NISSAN,ROGUE,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172100,DM8338000C,81DAD2F7-8D4A-4316-B40D-FAD4A4587B28,False,NO APPARENT INJURY,UNKNOWN,NOT DISTRACTED,EMERGENCY,SLOWING OR STOPPING,East,25,False,2016,FORD,EXPLORER,False
172101,MCP1182001S,DF427BFB-94E2-454E-9C46-82C8E286B992,True,NO APPARENT INJURY,UNKNOWN,LOOKED BUT DID NOT SEE,PASSENGER,SLOWING OR STOPPING,East,35,False,2008,NISSAN,ALTIMA,False
172102,MCP1453008X,CD875BDC-5B10-47F6-B3E7-7C091A71FB5D,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,SLOWING OR STOPPING,North,40,False,2008,CHEVROLET,EXPRESS,False
172103,MCP2568000M,15D3D66B-8337-4E7A-BAE6-855C02AE1CA7,False,NO APPARENT INJURY,NONE,NOT DISTRACTED,PASSENGER,MAKING LEFT TURN,South,35,False,2018,FORD,TRANSIT VAN,False


## Vehicles

In [38]:
# Load the raw vehicle reference data; the path is relative to the current working directory.
vehicles_raw = pd.read_csv("../data/vehicles.csv")

  vehicles_raw = pd.read_csv("../data/vehicles.csv")


In [39]:
# List the available columns to decide which ones to keep.
vehicles_raw.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'baseModel',
       'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn'

In [73]:
def prepare_vehicles_data(data):
    """Select, rename and type the vehicle reference columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw vehicle table; must contain every source column listed in
        ``column_names`` below. Extra columns are ignored and the input is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns, sorted by ``VehicleKey``.
        The key, year, cylinder, displacement and MPG columns are numeric:
        integer where every value is a whole number, float where values are
        missing or fractional.

    Raises
    ------
    ValueError
        If one of the numeric columns holds a value that cannot be parsed
        as a number.
    """
    column_names = {
        'id': 'VehicleKey',
        'make': 'Make',
        'baseModel': 'BaseModel',
        'model': 'Model',
        'year': 'Year',
        'VClass': 'BodyClass',
        'cylinders': 'Cylinders',
        'displ': 'Displacement',
        'trany': 'Transmission',
        'drive': 'Drivetrain',
        'fuelType1': 'FuelType',
        'city08': 'CityMPG',
        'highway08': 'HighwayMPG',
    }
    numeric_columns = ['VehicleKey', 'Year', 'Cylinders', 'Displacement', 'CityMPG', 'HighwayMPG']
    # rename() returns a fresh frame, so the assignment below is not a write
    # into a slice of the caller's data.
    prepared = data[list(column_names)].rename(columns=column_names)
    # Replace whole columns instead of writing through .loc, so the resulting
    # dtype is explicit and does not depend on in-place casting rules.
    prepared[numeric_columns] = prepared[numeric_columns].apply(pd.to_numeric)
    return prepared.sort_values('VehicleKey')

In [113]:
def handle_nans_vehicles(data):
    """Replace missing values in the prepared vehicle table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the renamed vehicle columns. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy in which the numeric columns ``Cylinders``, ``Displacement``
        and ``Year`` use 0 for missing values and the descriptive text
        columns use ``'Unknown'``.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()
    # Year is numeric, so it gets the numeric sentinel 0 rather than the
    # string 'Unknown', which would turn the column into mixed types.
    numeric_columns = ['Cylinders', 'Displacement', 'Year']
    data[numeric_columns] = data[numeric_columns].fillna(0)
    columns_to_fill = ['Make', 'BaseModel', 'Model', 'BodyClass', 'Transmission', 'Drivetrain', 'FuelType']
    data[columns_to_fill] = data[columns_to_fill].fillna('Unknown')
    return data

In [126]:
def transform_transmission(trans):
    """Shorten a transmission description to '<family> <gears>'.

    Parameters
    ----------
    trans : str
        Transmission description.

    Returns
    -------
    str
        'Automatic <n>' or 'Manual <n>', where <n> is the first number in
        the description, or 'CVT' when the description contains no number.
        'Unknown' and descriptions that mention neither 'Automatic' nor
        'Manual' are returned unchanged.
    """
    if trans == 'Unknown':
        return trans
    if 'Automatic' in trans:
        family = 'Automatic'
    elif 'Manual' in trans:
        family = 'Manual'
    else:
        return trans
    # Test for "no number found" explicitly rather than catching every
    # exception, so unrelated errors are not silently turned into 'CVT'.
    first_number = re.search(r'\d+', trans)
    gears = first_number.group() if first_number else 'CVT'
    return f"{family} {gears}"

In [131]:
def transform_drivetrain(drive):
    """Abbreviate a drivetrain description (FWD, RWD, AWD, 4WD or 2WD).

    Descriptions without a known abbreviation are returned unchanged.
    """
    abbreviations = {
        "Front-Wheel Drive": 'FWD',
        "Rear-Wheel Drive": "RWD",
        "4-Wheel or All-Wheel Drive": "AWD",
        "All-Wheel Drive": "AWD",
        "4-Wheel Drive": "4WD",
        "Part-time 4-Wheel Drive": "4WD",
        '2-Wheel Drive': "2WD",
    }
    return abbreviations.get(drive, drive)

In [132]:
def transform_vehicle_data(data):
    """Normalise the transmission and drivetrain descriptions.

    Parameters
    ----------
    data : pandas.DataFrame
        Vehicle frame with ``Transmission`` and ``Drivetrain`` columns.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which both columns hold the shortened labels produced
        by ``transform_transmission`` and ``transform_drivetrain``.
    """
    # assign() builds a new frame, so the caller's frame keeps its original
    # values and re-running the cell always starts from the same input.
    return data.assign(
        Transmission=data['Transmission'].apply(transform_transmission),
        Drivetrain=data['Drivetrain'].apply(transform_drivetrain),
    )

In [133]:
# Vehicle pipeline: select/rename/type columns -> fill missing values -> normalise labels.
vehicles_prep = prepare_vehicles_data(vehicles_raw)
vehicles_nan = handle_nans_vehicles(vehicles_prep)
vehicles = transform_vehicle_data(vehicles_nan)

In [136]:
# Display the fully processed vehicle table.
vehicles

Unnamed: 0,VehicleKey,Make,BaseModel,Model,Year,BodyClass,Cylinders,Displacement,Transmission,Drivetrain,FuelType,CityMPG,HighwayMPG
0,1,Alfa Romeo,Spider,Spider Veloce 2000,1985,Two Seaters,4.0,2.0,Manual 5,RWD,Regular Gasoline,19,25
11110,2,Bertone,X1/9,X1/9,1985,Two Seaters,4.0,1.5,Manual 5,RWD,Regular Gasoline,20,26
21953,3,Chevrolet,Corvette,Corvette,1985,Two Seaters,8.0,5.7,Automatic 4,RWD,Regular Gasoline,15,21
32964,4,Chevrolet,Corvette,Corvette,1985,Two Seaters,8.0,5.7,Manual 4,RWD,Regular Gasoline,15,20
41766,5,Nissan,300ZX,300ZX,1985,Two Seaters,6.0,3.0,Automatic 4,RWD,Regular Gasoline,15,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41513,47758,Porsche,911,911 S/T,2024,Two Seaters,6.0,4.0,Manual 6,RWD,Premium Gasoline,13,19
41514,47759,Porsche,718,718 Spyder RS,2024,Two Seaters,6.0,4.0,Automatic 7,RWD,Premium Gasoline,14,19
41516,47760,Mercedes-Benz,CLE-Class,CLE300 4matic (Coupe),2024,Subcompact Cars,4.0,2.0,Automatic 9,AWD,Premium Gasoline,24,34
41517,47761,Porsche,Panamera,Panamera 4,2024,Large Cars,6.0,2.9,Automatic 8,AWD,Premium Gasoline,18,25


In [149]:
# Count how often each make string occurs, then keep only the rows whose make
# appears more than 10 times.
make_counts = drivers['VehicleMake'].value_counts()
filtered_drivers = drivers[drivers['VehicleMake'].isin(make_counts[make_counts > 10].index)]

In [151]:
# Inspect the make frequencies that remain after filtering.
filtered_drivers['VehicleMake'].value_counts()

VehicleMake
TOYOTA           23171
HONDA            18870
FORD             17138
TOYT              8840
NISSAN            8525
                 ...  
POSTAL              11
PORCHE              11
FREIGHT LINER       11
TRUCK               11
GENS                11
Name: count, Length: 233, dtype: int64

In [153]:
# Export the distinct makes found in the filtered driver data, one per line.
unique_makes_d = filtered_drivers['VehicleMake'].unique()
with open('unique_makes_to_map.txt', 'w') as file:
    for make in unique_makes_d:
        file.write(f"{make}\n")

In [143]:
# Export the distinct makes from the vehicle reference table, one per line.
unique_makes = vehicles['Make'].unique()
with open('unique_makes.txt', 'w') as file:
    for make in unique_makes:
        file.write(f"{make}\n")

In [140]:
# Write the whole Make column (one row per vehicle, so values repeat) without the index.
vehicles['Make'].to_csv("marki_dobre.txt", index=False)

In [None]:
import pandas as pd
from difflib import get_close_matches

# Load the two make lists from the files written earlier in this notebook
# (both are created in the current working directory).
with open('unique_makes.txt', 'r') as f:
    correct_makes = [line.strip() for line in f]

with open('unique_makes_to_map.txt', 'r') as f:
    makes_to_map = [line.strip() for line in f]

# get_close_matches is case-sensitive, but the driver makes are upper-case
# (e.g. 'TOYOTA') while the reference makes are title-case (e.g. 'Toyota').
# Compare upper-cased names and translate the hit back to its original spelling.
correct_by_upper = {make.upper(): make for make in correct_makes}
candidates = list(correct_by_upper)

# Create a mapping based on closest matches
mapping = []
for make in makes_to_map:
    closest_match = get_close_matches(make.upper(), candidates, n=1, cutoff=0.6)
    if closest_match:
        mapping.append((make, correct_by_upper[closest_match[0]]))
    else:
        mapping.append((make, "No match found"))

# Save the mapping to a CSV file next to the input lists
df_mapping = pd.DataFrame(mapping, columns=['unique_makes_to_map', 'unique_makes'])
df_mapping.to_csv('unique_makes_mapping.csv', index=False)
