# Capstone Two: Preprocessing and Training Data Development

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import preprocessing
import scipy.stats as stats
import folium

from library.sb_utils import save_file

In [2]:
df = pd.read_csv('../data/traffic_data_step3_features.csv')


# Creating dummy features

In [4]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Print the list of categorical columns
print("Categorical columns:")
print(cat_cols)


Categorical columns:
['CRASH_RECORD_ID', 'CRASH_DATE', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'DATE_POLICE_NOTIFIED', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'STREET_NAME', 'MOST_SEVERE_INJURY', 'LOCATION']


In [5]:
from sklearn.preprocessing import LabelEncoder
# Create a LabelEncoder object
le = LabelEncoder()

# Loop over each categorical variable and encode its values
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [6]:
df.head()

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,...,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION,CRASH_YEAR
0,129328,72683,30,16,1,2,2,17,10,3,...,1.0,2.0,0.0,14,2,3,41.884547,-87.641201,29035,2019
1,127783,164940,30,4,3,2,2,0,8,3,...,0.0,2.0,0.0,8,4,9,41.968562,-87.740659,93839,2018
2,2664,121108,30,17,6,2,4,0,8,3,...,0.0,2.0,0.0,0,6,7,41.886336,-87.716203,80760,2022
3,14495,147605,30,16,1,2,2,10,6,3,...,0.0,3.0,0.0,11,2,8,41.749348,-87.721097,83491,2022
4,3569,121243,30,16,1,2,2,10,8,3,...,0.0,2.0,0.0,18,6,7,41.925111,-87.667997,48820,2022


# Scale: standardization

In [7]:
from sklearn.preprocessing import StandardScaler
# Select the numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create a scaler object
scaler = StandardScaler()

# Standardize the numeric columns


df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Print the first 5 rows of the standardized dataframe
print(df.head(5))

   CRASH_RECORD_ID  CRASH_DATE  POSTED_SPEED_LIMIT  TRAFFIC_CONTROL_DEVICE  \
0           129328       72683            0.263187                      16   
1           127783      164940            0.263187                       4   
2             2664      121108            0.263187                      17   
3            14495      147605            0.263187                      16   
4             3569      121243            0.263187                      16   

   DEVICE_CONDITION  WEATHER_CONDITION  LIGHTING_CONDITION  FIRST_CRASH_TYPE  \
0                 1                  2                   2                17   
1                 3                  2                   2                 0   
2                 6                  2                   4                 0   
3                 1                  2                   2                10   
4                 1                  2                   2                10   

   TRAFFICWAY_TYPE  ALIGNMENT  ...  INJURIES_REPOR

# Split data into training and testing subsets

In [8]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
X = df.drop('INJURIES_INCAPACITATING', axis=1) # all columns except the target
y = df['INJURIES_INCAPACITATING'] # target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)