# Training

***

In [5]:
#Libraries
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from joblib import dump
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from scipy.stats import chi2_contingency
from feature_engine.encoding import OneHotEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

## Environment

In [6]:
# Environment tree: every path is derived from the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')

# Inputs: schema JSON plus training / testing data.
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")

# Outputs: fitted preprocessing artifacts and the trained predictor.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')

# Create the output directories. exist_ok=True keeps the cell safe to re-run and
# removes the check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_ARTIFACTS_PATH, exist_ok=True)
os.makedirs(PREDICTOR_DIR_PATH, exist_ok=True)

# Load Data

In [7]:
# Read the schema and classify the features by their declared data type.
# The directory listing is sorted so the chosen file does not depend on the
# arbitrary order returned by os.listdir, and only real ".json" files qualify.
schema_files = sorted(f for f in os.listdir(INPUT_SCHEMA_DIR) if f.endswith('.json'))
if not schema_files:
    raise FileNotFoundError(f"No .json schema file found in {INPUT_SCHEMA_DIR}")
file_name = schema_files[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
features = schema['features']

# Anything not declared CATEGORICAL is treated as numeric.
categorical_features = [f['name'] for f in features if f['dataType'] == 'CATEGORICAL']
numeric_features = [f['name'] for f in features if f['dataType'] != 'CATEGORICAL']

id_feature = schema['id']['name']
target_feature = schema['target']['name']

In [8]:
# Load the training data into a DataFrame.
# Sorting makes the choice deterministic when several CSV files are present, and
# an empty directory is reported explicitly instead of as a bare IndexError.
train_files = sorted(f for f in os.listdir(TRAIN_DIR) if f.endswith('.csv'))
if not train_files:
    raise FileNotFoundError(f"No .csv training file found in {TRAIN_DIR}")
file_name = train_files[0]
file_path = os.path.join(TRAIN_DIR, file_name)
df = pd.read_csv(file_path)


## Data Quality Report

In [9]:
def Data_Quality_Report(df):

    # Initial table
    freqDF = pd.DataFrame(columns=['Feature',
                                   'Mode',
                                   'Mode Freq.',
                                   'Mode %',
                                   '2nd Mode',
                                   '2nd Mode Freq.',
                                   '2nd Mode %'])
    for col in df.columns:
        freq = df[col].value_counts()
        freqdf = freq.to_frame()
        fRow = freqdf.iloc[0]
        secRow = freqdf.iloc[1] if len(freqdf) > 1 else pd.Series([0, 0], index=['index', col])
        fPrct = fRow[0] / len(df[col])
        secPrct = secRow[0] / len(df[col]) if len(freqdf) > 1 else 0
        mode1 = fRow.name
        mode2 = secRow.name
        new_row = {'Feature': col,
                   'Mode': mode1,
                   'Mode Freq.': fRow[0],
                   'Mode %': fPrct,
                   '2nd Mode': mode2,
                   '2nd Mode Freq.': secRow[0],
                   '2nd Mode %': secPrct}
        freqDF = pd.concat([freqDF, pd.DataFrame([new_row])], ignore_index=True)
        
    freqDF = freqDF.set_index('Feature')

    # Nulls, Counts, Cardinality
    NUllFeatures = (df.isnull().sum() / df.shape[0]).round(4).sort_values(ascending=False)
    Count = df.count()
    uni = df.nunique()

    # Formatting
    NUllFeatures = NUllFeatures.to_frame(name="% Miss.")
    Count = Count.to_frame(name="Count")
    uni = uni.to_frame(name="Card.")
    result = pd.concat([Count, NUllFeatures, uni, freqDF], axis=1)
    result = result.style.format({'% Miss.': "{:.1%}",
                                  'Mode %': "{:.0%}",
                                  '2nd Mode %': "{:.0%}",
                                  'Count': "{:,}",
                                  'Card.': "{:,}",
                                  'Mode Freq.': "{:,}",
                                  '2nd Mode Freq.': "{:,}"})
    return result

In [10]:
# Build the Data Quality Report for the training data as loaded from the CSV.
DQR = Data_Quality_Report(df)
DQR  # last expression of the cell, so the styled report is rendered as output

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
u_id,45286,0.0%,45286,32083,1,0%,22185,1,0%
fatals,45286,0.0%,8,1,43447,96%,2,1581,3%
a_ct,45286,0.0%,1,Single-Vehicle Crash,45286,100%,,0,0%
a_ped_f,45286,0.0%,2,Other Crash,33785,75%,Pedestrian Fatality Involved Crash,11501,25%
a_pedal_f,45286,0.0%,2,Other Crash,43544,96%,Pedalcyclist Fatality Involved Crash,1742,4%
a_roll,45286,0.0%,2,Other Crash,31583,70%,Rollover Involved Crash,13703,30%
a_hr,45286,0.0%,2,No - Hit and Run,43774,97%,Yes - Hit and Run,1512,3%
a_polpur,45286,0.0%,2,Other Crash,44783,99%,Police Pursuit Involved Crash,503,1%
month,45286,0.0%,12,7,4175,9%,10,4110,9%
day,45286,0.0%,31,3,1590,4%,4,1577,3%


In [11]:

# Drop fields with a high mode share (currently disabled)
#df = df.drop(columns=['minute','ve_forms','a_ct','a_polpur'])

# Drop every row that contains at least one null value, then rebuild the report.
# `df` is overwritten here, so the unfiltered frame can only be recovered by
# re-running the load cell.
df = df.dropna()
DQR = Data_Quality_Report(df)
DQR

Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
u_id,35527,0.0%,35527,32083,1,0%,15545,1,0%
fatals,35527,0.0%,8,1,34034,96%,2,1283,4%
a_ct,35527,0.0%,1,Single-Vehicle Crash,35527,100%,,0,0%
a_ped_f,35527,0.0%,2,Other Crash,26539,75%,Pedestrian Fatality Involved Crash,8988,25%
a_pedal_f,35527,0.0%,2,Other Crash,34167,96%,Pedalcyclist Fatality Involved Crash,1360,4%
a_roll,35527,0.0%,2,Other Crash,24659,69%,Rollover Involved Crash,10868,31%
a_hr,35527,0.0%,2,No - Hit and Run,34684,98%,Yes - Hit and Run,843,2%
a_polpur,35527,0.0%,2,Other Crash,35129,99%,Police Pursuit Involved Crash,398,1%
month,35527,0.0%,12,10,3259,9%,7,3240,9%
day,35527,0.0%,31,1,1259,4%,4,1249,4%


## Process Data

#### Age, deaths and hour bucketing

In [12]:
def bucketize_age(age):
    """Map a numeric age to its decade bucket label.

    Each bracket is half-open: an age equal to a boundary (20, 30, ...) belongs
    to the bucket that starts at that boundary. Any value that is not below 60,
    including NaN (which compares False against every bound), is labelled
    "60 and above".
    """
    brackets = (
        (20, "Under 20"),
        (30, "20-29"),
        (40, "30-39"),
        (50, "40-49"),
        (60, "50-59"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "60 and above"

In [13]:
def bucketize_death(deaths):
    """Return 1 when ``deaths`` is greater than one, otherwise 0."""
    return 1 if deaths > 1 else 0

In [14]:
def bucketize_hour(x):
    """Map an hour value to a named part of the day.

    Buckets are closed on their upper bound: values up to and including 4 are
    'Late Night', (4, 8] is 'Early Morning', (8, 12] 'Morning', (12, 16] 'Noon',
    (16, 20] 'Eve' and (20, 24] 'Night'. Values above 24 and NaN match no
    bucket and yield None.
    """
    if x <= 4:
        return 'Late Night'
    upper_bounds = (
        (8, 'Early Morning'),
        (12, 'Morning'),
        (16, 'Noon'),
        (20, 'Eve'),
        (24, 'Night'),
    )
    for upper, label in upper_bounds:
        if x <= upper:
            return label
    # Nothing matched (x > 24 or NaN).
    return None

In [15]:
# Apply the bucketizing: derive one bucket column per source column.
df['age_bucket'] = df['age'].apply(bucketize_age)
df['death_bucket'] = df['deaths'].apply(bucketize_death)
df['hour_bucket'] = df['hour'].apply(bucketize_hour)

# Register the derived columns as categorical features. The membership check
# keeps the cell idempotent: re-running it no longer appends the same names
# again, which would make df[categorical_features] select duplicate columns.
for bucket_col in ("age_bucket", "death_bucket", "hour_bucket"):
    if bucket_col not in categorical_features:
        categorical_features.append(bucket_col)

### Encoding Data

#### Features

In [16]:
# Candidate features to drop (disabled)
#items_to_remove = ['a_ct', 'a_polpur', 've_forms']
#for item in items_to_remove:
    #categorical_features.remove(item)

# One-hot encode the categorical features (encoder configured with top_categories=10).
encoder = OneHotEncoder(top_categories=10)

# Cast every categorical column to string before it is handed to the encoder.
df[categorical_features] = df[categorical_features].astype(str)

categorical = df[categorical_features]
categorical_encoded = encoder.fit_transform(categorical)


#### Target

In [17]:
# Encode the target labels as integers.
target = df[target_feature]
# Note: this rebinds `encoder`, which held the one-hot encoder until now.
encoder = LabelEncoder()
# LabelEncoder expects a 1-D array of labels. The previous reshape(-1, 1)
# passed a column vector, which only worked through an implicit conversion
# whose warning was hidden by the global warnings filter.
y = encoder.fit_transform(target.values)
# Persist the fitted label encoder so predictions can be mapped back to labels.
dump(encoder, LABEL_ENCODER_FILE)

['/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/Code/fatality_analysis_motor_vehicle/model_inputs_outputs/model/artifacts/label_encoder.joblib']

#### Numeric

In [18]:
# Candidate numeric features to drop. The removal code is wrapped in a
# triple-quoted string, so it is evaluated as a plain string and discarded:
# nothing is removed from numeric_features.
"""
items_to_remove = ['minute', 'age', 'mod_year']
for item in items_to_remove:
    numeric_features.remove(item)
"""
# Select the numeric feature columns and display them.
# NOTE(review): `numeric` is not referenced by the later cells visible in this
# notebook (the model input is categorical_encoded) — confirm whether it is still needed.
numeric = df[numeric_features]
numeric

Unnamed: 0,month,day,hour,minute,age,permvit,pernotmvit,ve_forms,ve_total,mod_year,deaths,numoccs
0,10,2,3.0,10.0,62,1,0,1,1,2003.0,1,1.0
1,6,21,8.0,45.0,40,1,0,1,1,2002.0,1,1.0
3,9,15,20.0,46.0,64,1,1,1,1,1999.0,0,1.0
4,9,28,20.0,24.0,45,1,0,1,1,1996.0,1,1.0
5,8,28,12.0,35.0,79,1,0,1,1,2007.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
45281,3,12,20.0,42.0,24,2,1,1,2,2015.0,0,2.0
45282,5,15,4.0,53.0,18,1,0,1,1,1998.0,1,1.0
45283,12,5,6.0,15.0,43,1,0,1,1,2015.0,1,1.0
45284,5,12,23.0,51.0,21,1,0,1,1,2010.0,1,1.0


## Feature Selection

In [19]:
# Chi-squared test of independence between each categorical feature and the target.
def _chi2_against_target(feature_name):
    """Return the chi-squared statistic and p-value for one feature vs. the target."""
    observed = pd.crosstab(df[feature_name], df[target_feature])
    statistic, p_val, _dof, _expected = chi2_contingency(observed)
    return {"chi2": statistic, "p_value": p_val}


chi2_results = {name: _chi2_against_target(name) for name in categorical_features}

In [20]:
# Feature names ordered by ascending chi-squared p-value.
def _p_value_of(feature):
    """Look up the stored p-value of a feature."""
    return chi2_results[feature]["p_value"]


sorted_features = sorted(chi2_results, key=_p_value_of)

# Keep only the test results whose p-value is above 0.05.
unsignificant_results = {}
for feature, result in chi2_results.items():
    if result["p_value"] > 0.05:
        unsignificant_results[feature] = result

# Names of those features as a plain list.
unsignificant_results_list = list(unsignificant_results)

In [21]:
# Features whose chi-squared p-value was above 0.05.
unsignificant_results_list

['a_ct']

#### Updated categorical features

In [22]:
# Remove the features flagged by the chi-squared filter.
items_to_remove = unsignificant_results_list
for item in items_to_remove:
    # Explicit membership check instead of a bare `except: pass`: a feature that
    # is already gone is skipped, while unrelated errors are no longer hidden.
    if item in categorical_features:
        categorical_features.remove(item)


# Re-fit the one-hot encoder on the reduced feature list.
encoder = OneHotEncoder(top_categories=10)

for cat in categorical_features:
    df[cat] = df[cat].astype(str)

categorical = df[categorical_features]
encoder.fit(categorical)
categorical_encoded = encoder.transform(categorical)

# Saving the encoder to use it on the testing dataset
path = dump(encoder, OHE_ENCODER_FILE)

In [23]:
# Summarise the selected categorical columns (non-null counts and dtypes).
categorical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35527 entries, 0 to 45285
Data columns (total 27 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   fatals        35527 non-null  object
 1   a_ped_f       35527 non-null  object
 2   a_pedal_f     35527 non-null  object
 3   a_roll        35527 non-null  object
 4   a_hr          35527 non-null  object
 5   a_polpur      35527 non-null  object
 6   day_week      35527 non-null  object
 7   a_dow_type    35527 non-null  object
 8   a_tod_type    35527 non-null  object
 9   state         35527 non-null  object
 10  a_region      35527 non-null  object
 11  a_ru          35527 non-null  object
 12  a_inter       35527 non-null  object
 13  a_intsec      35527 non-null  object
 14  a_roadfc      35527 non-null  object
 15  a_junc        35527 non-null  object
 16  a_relrd       35527 non-null  object
 17  a_ped         35527 non-null  object
 18  a_body        35527 non-null  object
 19  owner    

## Modeling

In [24]:
# Split the data into training and testing sets (test_size=0.2, fixed random_state).
# The model input is the one-hot encoded categorical frame only.
# NOTE(review): the split is not stratified on y — confirm whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(categorical_encoded, y, test_size=0.2, random_state=18)


In [25]:
# Candidate classifiers. Every estimator that accepts a seed gets one, so the
# comparison below is reproducible from run to run.
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=19)),
    ('Gradient Boosting', GradientBoostingClassifier(learning_rate=0.2, max_depth=4,
                                                     n_estimators=150, random_state=19)),
    ('LogisticRegression', LogisticRegression(C=100, penalty='l2', solver='liblinear',
                                              random_state=19)),
    # Other candidates, currently disabled:
    #('SVM', SVC(kernel='linear', C=1.0, random_state=19)),
    #('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    #('MLP', MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=19)),
    #('Naive Bayes', GaussianNB())
]


# Train each model on the training split and report macro-averaged F1 on the test split.
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    print(f"{model_name} - Macro-Averaged F1 Score: {macro_f1:.4f}")

Random Forest - Macro-Averaged F1 Score: 0.5374
Gradient Boosting - Macro-Averaged F1 Score: 0.5777
LogisticRegression - Macro-Averaged F1 Score: 0.5624


## Save Model

In [26]:
# Fit the model that gets persisted. It is seeded so that re-running the
# notebook writes the same predictor. It is fitted on the training split only.
model = GradientBoostingClassifier(learning_rate=0.2, max_depth=4, n_estimators=150,
                                   random_state=19)
model.fit(X_train, y_train)

# Saving the model to use it for predictions
path = dump(model, PREDICTOR_FILE_PATH)

## Hyperparameter Tuning

In [27]:
# Initialize the Logistic Regression model (seeded so the search is repeatable).
# Note: this rebinds `model`; the gradient boosting predictor was already saved to disk.
model = LogisticRegression(random_state=19)

# Define a grid of hyperparameters to search
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization penalty (L1 or L2)
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga']  # Optimization algorithm
}

# Select on macro F1, the metric used for every other model comparison in this
# notebook, and evaluate the grid in parallel (n_jobs=-1) to shorten the run.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                           scoring='f1_macro', n_jobs=-1)

# Perform hyperparameter tuning on the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model to predict on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report accuracy and macro F1 of the best model on the held-out split
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {accuracy:.2f}")
print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")




KeyboardInterrupt: 

In [None]:
# Define parameter grids for each model
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'alpha': [0.0001, 0.001, 0.01]
}

# List of models and corresponding parameter grids. The stochastic estimators
# are seeded so the selected parameters and scores are reproducible.
models_params = [
    (RandomForestClassifier(random_state=19), param_grid_rf),
    (GradientBoostingClassifier(random_state=19), param_grid_gb),
    (SVC(), param_grid_svm),
    (KNeighborsClassifier(), param_grid_knn),
    (MLPClassifier(max_iter=500, random_state=19), param_grid_mlp),
]


# Perform grid search for each model: 5-fold CV scored on macro F1, with the
# candidate fits spread over all available cores (n_jobs=-1).
for model, param_grid in models_params:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Score the refitted best estimator on the held-out test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    print(f"Macro-Averaged F1 Score: {macro_f1:.4f}")

Best parameters for RandomForestClassifier: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Macro-Averaged F1 Score: 0.5432
Best parameters for GradientBoostingClassifier: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 150}
Macro-Averaged F1 Score: 0.5825
Best parameters for SVC: {'C': 10, 'kernel': 'linear'}
Macro-Averaged F1 Score: 0.5524


Traceback (most recent call last):
  File "/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/fars/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/fars/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/fars/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/fars/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/Users/salvadorsanchez/Dropbox/ADS/Hackathon 2023/fars/lib/python3.10/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_

Best parameters for KNeighborsClassifier: {'n_neighbors': 7, 'weights': 'distance'}
Macro-Averaged F1 Score: 0.4793




Best parameters for MLPClassifier: {'alpha': 0.01, 'hidden_layer_sizes': (50,)}
Macro-Averaged F1 Score: 0.5707
