<div style='background-color:orange'>
<a id="TableOfContents"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            TABLE OF CONTENTS:
        </i></b></h1>
    <ul>
    <li><a href='#imports'>Imports</a></li>
    <li><a href="#acquire">Acquire</a></li>
    <li><a href='#prepare'>Prepare</a></li>
    <li><a href="#wrangle">Wrangle</a></li>
    <li><a href='#misc'>Miscellaneous</a></li>
    </ul>
</div>

<div style='background-color:orange'>
<a id="imports"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Imports
        </i></b></h1>
    <ul>
    <li><a href='#TableOfContents'>Table of Contents</a></li>
    </ul>
</div>

In [1]:
# Vectorization and tables
import numpy as np
import pandas as pd

# Regex
import re

# .py files
import wrangle as w
import acquire as a

# Ignore warnings
# NOTE(review): with no category or module argument this filter silences every
# warning for the rest of the session, including any pandas warnings raised by
# the cells below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

<div style='background-color:orange'>
<a id="acquire"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Acquire
        </i></b></h1>
    <ul>
    <li><a href='#TableOfContents'>Table of Contents</a></li>
    <li><a href='#acquiregabe'>Initial Filter</a></li>
    <li><a href='#acquiresecond'>Second Filter</a></li>
    <li><a href='#acquirethird'>Third Filter</a></li>
    </ul>
</div>

<a id='acquiregabe'></a>
<h3><b><i>
    Initial Filter
</i></b></h3>
<ul><li><a href='#acquire'>Acquire Top</a></li></ul>

Replicate Gabe's 'master_modeling.csv' file, with the crash IDs included.

In [2]:
# Load the raw data through the project-local `acquire` module (imported as `a`).
# This is Fermin's initial acquire function; the first filter below is built
# from the object it returns.
thingy = a.acquire_motocycle_data()
# Show (rows, columns) as a quick sanity check on what was loaded
thingy.shape

(14184, 28)

In [3]:
# Remove the duplicated '_y' columns and strip the '_x' suffix from the columns
# that are kept (presumably both suffixes come from a merge inside the acquire
# step — TODO confirm).
remove_col_list = [col for col in thingy.columns.to_list() if col.endswith('_y')]
keep_col_list = [col for col in thingy.columns.to_list() if col not in remove_col_list]
# Only a trailing '_x' is removed: str.replace('_x', '') would also delete '_x'
# from the middle of a column name.
clean_col_list = [col[:-2] if col.endswith('_x') else col for col in keep_col_list]
# .copy() makes `new` an independent frame, so the column assignments made in
# later cells never write through to (or warn about) `thingy`.
new = thingy[keep_col_list].copy()
new.columns = clean_col_list

In [4]:
# Keep only the rows that actually carry a crash_id
new = new[new['crash_id'].notna()]

In [5]:
# Columns kept for the replicated file; 'crash_id' leads so it ends up as the
# first column of the filtered frame.
filter_cols = [
    'crash_id',
    'person_age', 'person_ethnicity', 'person_gender',
    'has_motocycle_endorsment', 'person_injury_severity',
    'vehicle_body_style', 'vehicle_color', 'vehicle_defect_1',
    'vehicle_make', 'vehicle_model_name', 'vehicle_model_year',
]
new = new[filter_cols]

In [6]:
# Fill null values with each column's most frequent value (its first mode).
# The fill values are collected first and applied in one fillna call whose
# result is assigned back: `new[col].fillna(..., inplace=True)` is a chained
# in-place update that pandas has deprecated and that does not modify `new`
# once Copy-on-Write is enabled.
most_frequent_by_col = {col: new[col].mode()[0] for col in new}
new = new.fillna(most_frequent_by_col)
# Remaining nulls per column (expected to be all zeros)
new.isna().sum()

crash_id                    0
person_age                  0
person_ethnicity            0
person_gender               0
has_motocycle_endorsment    0
person_injury_severity      0
vehicle_body_style          0
vehicle_color               0
vehicle_defect_1            0
vehicle_make                0
vehicle_model_name          0
vehicle_model_year          0
dtype: int64

In [7]:
# Cast the identifier and the age columns to integers
for int_col in ('crash_id', 'person_age'):
    new[int_col] = new[int_col].astype(int)

---

<a id='acquiresecond'></a>
<h3><b><i>
    Second Filter
</i></b></h3>
<ul><li><a href='#acquire'>Acquire Top</a></li></ul>

Replicate Gabe's 'master_modeling_updated.csv'

In [8]:
# Work on an independent copy: a plain `df = new` only creates a second name for
# the same object, so the in-place edits made to `df` in the next cell would
# also change `new`.
df = new.copy()

In [9]:
# Second filter: clean the vehicle make / model / year columns, write the result
# to 'master_modeling_updated.csv', reload it, and finish the imputation on the
# reloaded frame. The cell's displayed value is the number of rows touched by
# the final model imputation.


def _impute_model_from_make(frame, model_col, row_labels, skip_placeholders=False):
    """Overwrite `model_col` on every row in `row_labels` with the most frequent
    model recorded for that row's 'vehicle_make'.

    Rows are handled one at a time and the per-make counts are recomputed for
    each row, so earlier imputations are visible to later ones.

    frame             -- DataFrame modified in place
    model_col         -- name of the model column to overwrite
    row_labels        -- index labels of the rows to impute
    skip_placeholders -- when True and the most frequent model itself contains
                         'other' or 'unknown', use the second most frequent
                         model instead (if the make has more than one model)
    """
    for label in row_labels:
        make = frame.loc[label, 'vehicle_make']
        model_counts = frame.loc[frame['vehicle_make'] == make, model_col].value_counts()
        # value_counts() is sorted by descending count: idxmax() is the top
        # entry and index[1] is the runner-up.
        choice = model_counts.idxmax()
        if (
            skip_placeholders
            and len(model_counts) > 1
            and ('other' in choice or 'unknown' in choice)
        ):
            choice = model_counts.index[1]
        frame.loc[label, model_col] = choice


# Start from the first-filter frame without 'vehicle_defect_1'. drop() returns a
# new frame, so `new` is left untouched and this cell can be re-run on its own.
df = new.drop(columns=['vehicle_defect_1'])

# Impute 'other' in 'vehicle_make' with the most frequent value
vehicle_make_most_frequent = df['vehicle_make'].value_counts().idxmax()
df['vehicle_make'] = df['vehicle_make'].replace('other', vehicle_make_most_frequent)

# Remove any text in parentheses and any surrounding whitespace in 'vehicle_model_name'
df['vehicle_model_name'] = (
    df['vehicle_model_name']
    .str.replace(r'\(.*\)', '', regex=True)
    .str.strip()
)

# For 'unknown' in 'vehicle_model_name', impute the most frequent model for that make
unknown_model_indices = df[df['vehicle_model_name'] == 'unknown'].index
_impute_model_from_make(df, 'vehicle_model_name', unknown_model_indices)

# Rename 'vehicle_model_name' to 'vehicle_model'
df = df.rename(columns={'vehicle_model_name': 'vehicle_model'})

# For models containing 'other', impute the most frequent model for that make
other_model_indices = df[df['vehicle_model'].str.contains('other')].index
other_count = len(other_model_indices)
_impute_model_from_make(df, 'vehicle_model', other_model_indices)

# Store 'vehicle_model_year' as text without a trailing '.0'.
# The anchored regex removes exactly that suffix; str.rstrip('.0') strips any
# run of '.' and '0' characters, turning '2010.0' into '201'.
df['vehicle_model_year'] = (
    df['vehicle_model_year'].astype(str).str.replace(r'\.0$', '', regex=True)
)

# Save the DataFrame to 'master_modeling_updated.csv' and continue from the file
df.to_csv('master_modeling_updated.csv', index=False)
df = pd.read_csv('master_modeling_updated.csv')

# Impute 'other (explain in narrative)' and 'unknown' in 'vehicle_make' with the most frequent value
vehicle_make_most_frequent = df['vehicle_make'].value_counts().idxmax()
df['vehicle_make'] = df['vehicle_make'].replace(
    ['other (explain in narrative)', 'unknown'], vehicle_make_most_frequent
)

# For 'other' or 'unknown' in 'vehicle_model', impute the most frequent model for
# that make, falling back to the runner-up when the top model is itself a
# placeholder. na=False: model names that were empty when written come back
# from the CSV as NaN and must count as "no match" rather than break the mask.
other_unknown_model_indices = df[
    df['vehicle_model'].str.contains('other|unknown', na=False)
].index
_impute_model_from_make(
    df, 'vehicle_model', other_unknown_model_indices, skip_placeholders=True
)

# Count of changes made by the final imputation
changes_made = len(other_unknown_model_indices)
changes_made

3630

In [10]:
# Display the frame produced by the second filter
df

Unnamed: 0,crash_id,person_age,person_ethnicity,person_gender,has_motocycle_endorsment,person_injury_severity,vehicle_body_style,vehicle_color,vehicle_make,vehicle_model,vehicle_model_year
0,16189632,37,w - white,1 - male,0,a - suspected serious injury,mc - motorcycle,blu - blue,harley-davidson,fld,2007
1,16203470,30,h - hispanic,1 - male,0,b - suspected minor injury,mc - motorcycle,gry - gray,suzuki,gsx-r600,2004
2,16192023,21,w - white,1 - male,0,a - suspected serious injury,mc - motorcycle,blu - blue,yamaha,yzfr6,2017
3,16196720,18,h - hispanic,1 - male,0,b - suspected minor injury,mc - motorcycle,blu - blue,yamaha,rz500,2002
4,16189103,28,w - white,1 - male,1,b - suspected minor injury,mc - motorcycle,blk - black,harley-davidson,fxdf,2009
...,...,...,...,...,...,...,...,...,...,...,...
14129,19321499,49,w - white,2 - female,1,b - suspected minor injury,mc - motorcycle,blk - black,yamaha,xvs1100,2013
14130,19323296,33,w - white,1 - male,1,a - suspected serious injury,mc - motorcycle,grn - green,kawasaki,kx250,2019
14131,19327850,35,w - white,1 - male,1,a - suspected serious injury,mc - motorcycle,blk - black,honda,cr250,2016
14132,19330330,42,b - black,2 - female,1,b - suspected minor injury,mc - motorcycle,mul - multicolored,honda,cbr650,2016


---

<a id='acquirethird'></a>
<h3><b><i>
    Third Filter
</i></b></h3>
<ul><li><a href='#acquire'>Acquire Top</a></li></ul>

Replicate Gabe's 'master_modeling_updated1.csv'

In [11]:
# Third filter: add the manufacturer's country and a binary injury flag.

# Dictionary mapping vehicle makes to countries
make_country = {
    'honda': 'Japan',
    'yamaha': 'Japan',
    'suzuki': 'Japan',
    'kawasaki': 'Japan',
    'harley-davidson': 'USA',
    'bmw': 'Germany',
    'ducati': 'Italy',
    'triumph': 'UK',
    'ktm': 'Austria',
    'aprilia': 'Italy',
    'indian': 'USA'
}
# Map each make to its country; makes missing from the dictionary become 'Other'.
# The fill is chained onto the mapped Series and assigned once, because
# `df[col].fillna(..., inplace=True)` is a chained in-place update that pandas
# has deprecated and that does not modify `df` once Copy-on-Write is enabled.
df['vehicle_make_country'] = df['vehicle_make'].map(make_country).fillna('Other')
# 0 when the severity is exactly 'n - not injured', 1 for every other value
df['injury_binary'] = (df['person_injury_severity'] != 'n - not injured').astype('int64')

In [12]:
# Display the frame with the 'vehicle_make_country' and 'injury_binary' columns added
df

Unnamed: 0,crash_id,person_age,person_ethnicity,person_gender,has_motocycle_endorsment,person_injury_severity,vehicle_body_style,vehicle_color,vehicle_make,vehicle_model,vehicle_model_year,vehicle_make_country,injury_binary
0,16189632,37,w - white,1 - male,0,a - suspected serious injury,mc - motorcycle,blu - blue,harley-davidson,fld,2007,USA,1
1,16203470,30,h - hispanic,1 - male,0,b - suspected minor injury,mc - motorcycle,gry - gray,suzuki,gsx-r600,2004,Japan,1
2,16192023,21,w - white,1 - male,0,a - suspected serious injury,mc - motorcycle,blu - blue,yamaha,yzfr6,2017,Japan,1
3,16196720,18,h - hispanic,1 - male,0,b - suspected minor injury,mc - motorcycle,blu - blue,yamaha,rz500,2002,Japan,1
4,16189103,28,w - white,1 - male,1,b - suspected minor injury,mc - motorcycle,blk - black,harley-davidson,fxdf,2009,USA,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14129,19321499,49,w - white,2 - female,1,b - suspected minor injury,mc - motorcycle,blk - black,yamaha,xvs1100,2013,Japan,1
14130,19323296,33,w - white,1 - male,1,a - suspected serious injury,mc - motorcycle,grn - green,kawasaki,kx250,2019,Japan,1
14131,19327850,35,w - white,1 - male,1,a - suspected serious injury,mc - motorcycle,blk - black,honda,cr250,2016,Japan,1
14132,19330330,42,b - black,2 - female,1,b - suspected minor injury,mc - motorcycle,mul - multicolored,honda,cbr650,2016,Japan,1


<div style='background-color:orange'>
<a id="prepare"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Prepare
        </i></b></h1>
    <ul>
    <li><a href='#TableOfContents'>Table of Contents</a></li>
    </ul>
</div>

<div style='background-color:orange'>
<a id="misc"></a>
    <h1 style='text-align:center; padding-top:5px'>
        <b><i>
            Miscellaneous
        </i></b></h1>
    <ul>
    <li><a href='#TableOfContents'>Table of Contents</a></li>
    </ul>
</div>