# Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Load Data

In [3]:
# Target arrays for the train / validation / test splits, stored as .npy files.
y_train, y_val, y_test = map(
    np.load, ('y_train.npy', 'y_val.npy', 'y_test.npy')
)

# Feature tables for the same three splits, stored as CSV files.
df_train, df_val, df_test = map(
    pd.read_csv, ('df_train.csv', 'df_val.csv', 'df_test.csv')
)

# Train Vectorizer

In [4]:
# Convert each split's rows into a list of {column: value} records first.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# mapping to encode the validation records.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Decision Tree

In [5]:
# Fit a single decision tree on the vectorized training rows (fit returns the
# estimator itself, so construction and fitting can be chained).
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# Keep only column 1 of the predicted probabilities for the validation rows.
y_pred_proba = dt.predict_proba(X_val)[:, 1]

# Rows scoring at or above the cut-off are labelled 1, all others 0.
threshold = 0.5
y_pred = np.where(y_pred_proba >= threshold, 1, 0)

# Validation metrics, each rounded to three decimals.
f1_rounded, precision_rounded, recall_rounded = (
    round(metric(y_val, y_pred), 3)
    for metric in (f1_score, precision_score, recall_score)
)

print("f-1 score:", f1_rounded)
print("precision score:", precision_rounded)
print("recall score:", recall_rounded)

f-1 score: 0.429
precision score: 0.45
recall score: 0.409


# Random Forest

In [6]:
# Fit a random forest on the vectorized training rows (fit returns the
# estimator itself, so construction and fitting can be chained).
rf = RandomForestClassifier(random_state=1, n_jobs=-1).fit(X_train, y_train)

# Keep only column 1 of the predicted probabilities for the validation rows.
y_pred_proba = rf.predict_proba(X_val)[:, 1]

# Rows scoring at or above the cut-off are labelled 1, all others 0.
# NOTE(review): 0.085 is well below the 0.5 used for the decision tree;
# presumably it was tuned on the validation split -- confirm where.
threshold = 0.085
y_pred = np.where(y_pred_proba >= threshold, 1, 0)

# Validation metrics, each rounded to three decimals.
f1_rounded, precision_rounded, recall_rounded = (
    round(metric(y_val, y_pred), 3)
    for metric in (f1_score, precision_score, recall_score)
)

print("f-1 score:", f1_rounded)
print("precision score:", precision_rounded)
print("recall score:", recall_rounded)

f-1 score: 0.306
precision score: 0.197
recall score: 0.682


# XGBoost

In [7]:
# Feature names reported by the fitted DictVectorizer; both matrices are
# given the same list so their columns are labelled identically.
features = list(dv.get_feature_names_out())

dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

In [124]:
# (matrix, display name) pairs for the datasets to evaluate during training.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [125]:
# Booster configuration for a binary classifier.
xgb_params = {
    # Tree-growing settings.
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    # Learning task and threading.
    'objective': 'binary:logistic',
    'nthread': 8,

    # Reproducibility and log level.
    'seed': 1,
    'verbosity': 1,
}

# Train for a fixed 200 rounds, printing the watchlist metrics every 5 rounds.
# NOTE(review): the log recorded for this run shows val-logloss bottoming out
# around round 30 (~0.190) and rising afterwards while train-logloss keeps
# falling. Consider fewer rounds or early_stopping_rounds, and re-tune any
# downstream decision threshold if the model changes.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-logloss:0.26694	val-logloss:0.28041
[5]	train-logloss:0.16287	val-logloss:0.22944


[10]	train-logloss:0.11860	val-logloss:0.20337
[15]	train-logloss:0.09699	val-logloss:0.20093
[20]	train-logloss:0.08547	val-logloss:0.19647
[25]	train-logloss:0.07239	val-logloss:0.19343
[30]	train-logloss:0.06415	val-logloss:0.19017
[35]	train-logloss:0.05591	val-logloss:0.19370
[40]	train-logloss:0.04914	val-logloss:0.19028
[45]	train-logloss:0.04396	val-logloss:0.19212
[50]	train-logloss:0.04090	val-logloss:0.19056
[55]	train-logloss:0.03731	val-logloss:0.19645
[60]	train-logloss:0.03423	val-logloss:0.20221
[65]	train-logloss:0.03202	val-logloss:0.20567
[70]	train-logloss:0.02988	val-logloss:0.20938
[75]	train-logloss:0.02786	val-logloss:0.21293
[80]	train-logloss:0.02600	val-logloss:0.21596
[85]	train-logloss:0.02479	val-logloss:0.22009
[90]	train-logloss:0.02352	val-logloss:0.22489
[95]	train-logloss:0.02240	val-logloss:0.22801
[100]	train-logloss:0.02144	val-logloss:0.23167
[105]	train-logloss:0.02068	val-logloss:0.23432
[110]	train-logloss:0.02003	val-logloss:0.23473
[115]	trai

In [126]:
# Model scores for the validation matrix (the objective configured for this
# booster is binary:logistic).
y_pred_proba = model.predict(dval)

# Rows scoring at or above the cut-off are labelled 1, all others 0.
# NOTE(review): 0.057 is a very low cut-off; presumably it was tuned on the
# validation split for this specific model -- confirm where.
threshold = 0.057
y_pred = np.where(y_pred_proba >= threshold, 1, 0)

# Validation metrics, each rounded to three decimals.
f1_rounded, precision_rounded, recall_rounded = (
    round(metric(y_val, y_pred), 3)
    for metric in (f1_score, precision_score, recall_score)
)

print("f-1 score:", f1_rounded)
print("precision score:", precision_rounded)
print("recall score:", recall_rounded)

f-1 score: 0.5
precision score: 0.462
recall score: 0.545
