In [1]:
import numpy as np 
import pandas as pd
from helpers import * 
import warnings
from pandas.errors import SettingWithCopyWarning
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)
ufc = pd.read_csv('ufc-master.csv')

AD = data_prep_and_feat_engineering(ufc, cat_thresh=0.001, squared_thresh=0.0625)
best_approach = ufc.copy() ## creating a new copy of the data to manipulate
best_cols = AD['approach 6'][1] ## features from approach 6
best_approach, best_cols = performance_index(best_approach, best_cols, diff=True) ## creating performance index difference variable

best_approach = best_approach.dropna(subset=best_cols)

best_feats = best_approach[best_cols]
targ = [0 if victor == 'Red' else 1 for victor in best_approach['Winner']]

best_feats_rs, best_targ_rs = resample_dataframe(best_feats, targ)

le = LabelEncoder()
num, cat = num_and_cat(best_feats_rs)

for col in cat:
    best_feats_rs[col] = le.fit_transform(best_feats_rs[col])


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

X_train, X_test, y_train, y_test = train_test_split(best_feats_rs, best_targ_rs, test_size=0.2, random_state=42)

# Train a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Test thresholds to find the best one
thresholds = np.arange(0, 1.1, 0.1)
best_f1_score = 0
best_threshold = 0

for threshold in thresholds:
    y_pred_thresholded = (rf_classifier.predict_proba(X_test)[:,1] > threshold).astype(int)
    f1 = f1_score(y_test, y_pred_thresholded)
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

print("Best F1 Score:", best_f1_score)
print("Best Threshold:", best_threshold)

Confusion Matrix:
[[364 101]
 [ 83 367]]
Best F1 Score: 0.8111380145278451
Best Threshold: 0.6000000000000001


In [4]:
new_rf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, min_samples_split=2)

new_rf.fit(X_train, y_train)

new_rf_probs = new_rf.predict_proba(X_test)

y_pred_thresholded = (new_rf_probs[:, 1] > best_threshold).astype(int)

In [5]:
from sklearn.metrics import accuracy_score, precision_score

accuracy_score(y_pred_thresholded, y_test)

0.8284153005464481

In [6]:
precision_score(y_test, y_pred_thresholded)

0.9013698630136986