# Predict

***

## Libraries

In [1]:
import os
import pandas as pd
import json
import warnings
from joblib import load
warnings.filterwarnings('ignore')


In [2]:
# DO NOT CHANGE THESE LINES.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR,  'model_inputs_outputs/')
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE = os.path.join(PREDICTIONS_DIR, 'predictions.csv')
if not os.path.exists(PREDICTIONS_DIR):
    os.makedirs(PREDICTIONS_DIR)

## Load Data

In [3]:
file_name = [f for f in os.listdir(INPUT_SCHEMA_DIR) if f.endswith('.json')][0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
features = schema['features']

numeric_features = []
categorical_features = []
for f in features:
    if f['dataType'] == 'CATEGORICAL':
        categorical_features.append(f['name'])
    else:
        numeric_features.append(f['name'])

id_feature = schema['id']['name']
target_feature = schema['target']['name']

In [4]:
#load testing data
file_name = [f for f in os.listdir(TEST_DIR) if f.endswith('.csv')][0]
file_path = os.path.join(TEST_DIR, file_name)
df = pd.read_csv(file_path)
ids = df[id_feature]
df = df.dropna()
df.head(3)

Unnamed: 0,u_id,fatals,a_ct,a_ped_f,a_pedal_f,a_roll,a_hr,a_polpur,month,day,...,a_body,owner,deaths,numoccs,impact1,deformed,ve_forms,ve_total,weather,lgt_cond
2,24020,1,Single-Vehicle Crash,Pedestrian Fatality Involved Crash,Other Crash,Other Crash,Yes - Hit and Run,Other Crash,10,15,...,Van-Based Light Trucks,Driver (in this crash) Not Registered Owner (o...,0,1.0,Clockpoint 1,Minor damage,1,1,Clear,Dark - not lighted
5,53521,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,12,28,...,Light Conventional Trucks,Driver (in this crash) Was Registered Owner,1,3.0,Clockpoint 12,Disabling damage,1,1,Clear,Dark - not lighted
6,21812,1,Single-Vehicle Crash,Pedestrian Fatality Involved Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,6,8,...,Automobiles,Driver (in this crash) Not Registered Owner (o...,0,2.0,Clockpoint 12,Functional damage,1,1,Cloudy,Dark - not lighted


## Process Data

In [5]:
def bucketize_age(age):
    if age < 20:
        return "Under 20"
    elif age < 30:
        return "20-29"
    elif age < 40:
        return "30-39"
    elif age < 50:
        return "40-49"
    elif age < 60:
        return "50-59"
    else:
        return "60 and above"

def bucketize_death(deaths):
    if deaths > 1:
        return 1
    else:
        return 0
    
def bucketize_hour(x):
    if (x > 4) and (x <= 8):
        return 'Early Morning'
    elif (x > 8) and (x <= 12 ):
        return 'Morning'
    elif (x > 12) and (x <= 16):
        return'Noon'
    elif (x > 16) and (x <= 20) :
        return 'Eve'
    elif (x > 20) and (x <= 24):
        return'Night'
    elif (x <= 4):
        return'Late Night'
    
# Apply the bucketizing
df['age_bucket'] = df['age'].apply(bucketize_age)
df['death_bucket'] = df['deaths'].apply(bucketize_death)
df['hour_bucket'] = df['hour'].apply(bucketize_hour)
categorical_features.extend(["age_bucket","death_bucket","hour_bucket"])

## Encoding

In [6]:
#Clean DF
if 'a_ct' in categorical_features:
    categorical_features.remove('a_ct')


In [7]:
# Encoding
encoder = load(OHE_ENCODER_FILE)

categorical = df[categorical_features]  
categorical_encoded = encoder.transform(categorical)

## Make Prediction

In [8]:
model = load(PREDICTOR_FILE_PATH)
predictions = model.predict_proba(categorical_encoded)
predictions

array([[0.00962203, 0.96880699, 0.02157098],
       [0.58022917, 0.30419633, 0.1155745 ],
       [0.06398208, 0.91007423, 0.0259437 ],
       ...,
       [0.01480143, 0.97177626, 0.01342231],
       [0.03306181, 0.92806327, 0.03887491],
       [0.3650883 , 0.11814448, 0.51676721]])

In [9]:
encoder = load(LABEL_ENCODER_FILE)

class_names = encoder.inverse_transform([0, 1, 2])

predictions = pd.DataFrame(predictions, columns=class_names)
predictions.insert(0, 'u_id', ids)
predictions.to_csv(PREDICTIONS_FILE)
predictions

Unnamed: 0,u_id,drunk_driver_involved,other,speeding_driver_involved
0,41633,0.009622,0.968807,0.021571
1,38966,0.580229,0.304196,0.115575
2,24020,0.063982,0.910074,0.025944
3,52280,0.021679,0.966159,0.012162
4,47480,0.342108,0.489218,0.168674
...,...,...,...,...
8846,27907,0.290417,0.646661,0.062922
8847,52336,0.094567,0.855478,0.049955
8848,53954,0.014801,0.971776,0.013422
8849,17276,0.033062,0.928063,0.038875
