In [2]:
# It's python lib for our boosting model! Install it for the FIRST time only. https://catboost.ai/en/docs/installation/python-installation-method-pip-install
#!pip install catboost 

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

import catboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


In [177]:
print(np.__version__)
print(pd.__version__)
print(catboost.__version__)
!python --version

1.26.4
2.2.1
1.2.3
Python 3.12.2


In [204]:
data = pd.read_csv('../raw_data/data.csv')
data.head(2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0


In [205]:
X = data.drop(['Winner'], axis=1)
y= data.Winner

In [206]:
X = X.replace('NaN', np.nan)

In [207]:
X['date']= pd.to_datetime(X['date'])
X['date'] = X['date'].apply(lambda x: x.timestamp()).astype(int)
X['date'].dtype

dtype('int64')

In [208]:
# Replace non-Red values in Winner-column for 2-class-classification
y = y.apply(lambda x: 1 if x=='Red' else 0)
y.dtype

dtype('int64')

In [209]:
y.value_counts(normalize=True)

Winner
1    0.661843
0    0.338157
Name: proportion, dtype: float64

In [210]:
columns_to_drop = X.isna().sum().sort_values()[-109:].index.to_list()

XX = X.drop(columns=columns_to_drop, axis=1)

categorical_column_names = XX.select_dtypes(include=['object']).columns.to_list()
categorical_indices = [i for i, v in enumerate(XX.columns) if v in categorical_column_names]
categorical_indices

[0, 1, 3, 5]

In [211]:
XX.columns


Index(['R_fighter', 'B_fighter', 'date', 'location', 'title_bout',
       'weight_class', 'B_total_rounds_fought', 'B_total_title_bouts',
       'B_current_win_streak', 'B_current_lose_streak', 'B_longest_win_streak',
       'B_wins', 'B_losses', 'B_draw', 'B_win_by_Decision_Majority',
       'B_win_by_Decision_Split', 'B_win_by_Decision_Unanimous',
       'B_win_by_KO/TKO', 'B_win_by_Submission',
       'B_win_by_TKO_Doctor_Stoppage', 'R_total_rounds_fought',
       'R_total_title_bouts', 'R_current_win_streak', 'R_current_lose_streak',
       'R_longest_win_streak', 'R_wins', 'R_losses', 'R_draw',
       'R_win_by_Decision_Majority', 'R_win_by_Decision_Split',
       'R_win_by_Decision_Unanimous', 'R_win_by_KO/TKO', 'R_win_by_Submission',
       'R_win_by_TKO_Doctor_Stoppage'],
      dtype='object')

In [212]:
X =  X.drop(columns=columns_to_drop, axis=1)


In [213]:
num_preproc = Pipeline([
    ("to_log", FunctionTransformer(np.log)),
    ("num_imputer", SimpleImputer(strategy = "median")),
    ("scaler", RobustScaler())
])

In [214]:
cat_preproc = Pipeline([
    ("cat_imputer", SimpleImputer(strategy = "constant", fill_value="Unknown"))
])
bool_preproc = Pipeline([
    ("bool_imputer", SimpleImputer(strategy = "most_frequent")),
    ("to_str", FunctionTransformer(str))
])

In [215]:
#droper = ColumnTransformer([('column_dropper', 'drop', columns_to_drop)])
"""dropper = ColumnTransformer(
    transformers=[
        ('column_dropper', 'drop', columns_to_drop)
    ] , verbose_feature_names_out=False
)
droper.set_output("pandas")
"""

'dropper = ColumnTransformer(\n    transformers=[\n        (\'column_dropper\', \'drop\', columns_to_drop)\n    ] , verbose_feature_names_out=False\n)\ndroper.set_output("pandas")\n'

In [216]:

preproc =  Pipeline([
    ("col_transformer", ColumnTransformer([
    ("num_tr", num_preproc, make_column_selector(dtype_include = ["float64", "int64"])),
    ("cat_tr", cat_preproc, make_column_selector(dtype_include = ["object"])),
    ("bool_tr", bool_preproc, make_column_selector(dtype_include = ["bool"]))
], remainder="passthrough")
   )#,
#    ("droper", droper) 
])
preproc

In [217]:
cv = StratifiedKFold(n_splits = 5)
model1 = catboost.CatBoostClassifier(n_estimators=2500, depth=5, learning_rate=0.04,silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='AUC')

In [218]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, 
                                                       train_size=0.8, 
                                                       random_state=42, stratify = y)
X_train = pd.DataFrame(data=X_train, columns=X.columns)
X_test = pd.DataFrame(data=X_test, columns=X.columns)

In [219]:
model1_pipe = Pipeline([
    ("preproc", preproc),
    ("model1_classifier", model1)
])
    
model1_pipe

In [220]:
model1_pipe_mean_accuracy = cross_val_score(model1_pipe, X_train, y=y_train, scoring='accuracy', cv=cv).mean()
model1_pipe_mean_accuracy

0.7458914289299305

In [221]:
model1_pipe.fit(X_train,y_train)
y_pred = model1_pipe.predict(X_test)
model1_pipe_mean_test_accuracy = accuracy_score(y_test, y_pred)
model1_pipe_mean_test_accuracy

0.7896924355777224

In [223]:
#Export the fitted pipeline as a pickle file
with open('../models/model1_acc07896.pkl', 'wb') as file:
    pickle.dump(model1_pipe, file)
print("model1_pipe is successfully saved as 'model1_acc07897.pkl'")


model1_pipe is successfully saved as 'model1_acc07897.pkl'


In [203]:
##########################################
### Playgroud to find best params
##########################################

In [59]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [60]:
CBC = catboost.CatBoostClassifier(silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='Logloss')
parameters = {'depth'         : [5,6],
                 'learning_rate' : [0.038, 0.04, 0.042],
                  'n_estimators'    : [1900, 2100, 2500]
                 }

In [61]:
Grid_CBC = GridSearchCV(estimator=CBC, param_grid = parameters, cv = cv, scoring='accuracy', n_jobs=-1)
Grid_CBC.fit(X_train, y_train)

In [62]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",Grid_CBC.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid_CBC.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid_CBC.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7f5909aeeea0>

 The best score across ALL searched params:
 0.7458914289299305

 The best parameters across ALL searched params:
 {'depth': 5, 'learning_rate': 0.04, 'n_estimators': 2500}


In [137]:
best_less_cols_model = catboost.CatBoostClassifier(n_estimators=2500, depth=5, learning_rate=0.04,silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='Logloss')
best_less_cols_pipe = Pipeline([
    ("preproc", preproc),
    ("best_less_cols_classifier", best_less_cols_model)
])
    
best_less_cols_pipe

In [138]:
best_less_cols_pipe_accuracy = cross_val_score(best_less_cols_model, X_train, y=y_train, scoring='accuracy', cv=cv).mean()
best_less_cols_pipe_accuracy

0.7458914289299305

In [139]:
best_less_cols_pipe.fit(X_train,y_train)
best_less_cols_pred = best_less_cols_pipe.predict(X_test)
best_less_col_accuracy = accuracy_score(y_test, best_less_cols_pred)
best_less_col_accuracy

0.7896924355777224

In [66]:
CBC1 = catboost.CatBoostClassifier(silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='Logloss')
parameters = {'depth'         : [4, 5],
                 'learning_rate' : [0.038, 0.04, 0.045],
                  'n_estimators'    : [2300, 2500, 2700]
                 }

In [67]:
Grid_CBC1 = GridSearchCV(estimator=CBC1, param_grid = parameters, cv = cv, scoring='accuracy', n_jobs=-1)
Grid_CBC1.fit(X_train, y_train)

In [68]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",Grid_CBC1.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid_CBC1.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid_CBC1.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7f591ab283b0>

 The best score across ALL searched params:
 0.7458914289299305

 The best parameters across ALL searched params:
 {'depth': 5, 'learning_rate': 0.04, 'n_estimators': 2500}


In [69]:
CBC2 = catboost.CatBoostClassifier(silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='Logloss')
parameters = {'depth'         : [2,3],
                 'learning_rate' : [0.001, 0.04, 0.045],
                  'n_estimators'    : [2500, 3000, 4000]
                 }
Grid_CBC2 = GridSearchCV(estimator=CBC2, param_grid = parameters, cv = cv, scoring='accuracy', n_jobs=-1)
Grid_CBC2.fit(X_train, y_train)

In [70]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",Grid_CBC2.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid_CBC2.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid_CBC2.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7f596d1bfe00>

 The best score across ALL searched params:
 0.7406908950093134

 The best parameters across ALL searched params:
 {'depth': 3, 'learning_rate': 0.045, 'n_estimators': 3000}


In [71]:
CBC3 = catboost.CatBoostClassifier(silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='Logloss')
parameters = {'depth'         : [6, 7],
                 'learning_rate' : [0.04],
                  'n_estimators'    : [2500, 3000]
                 }
Grid_CBC3 = GridSearchCV(estimator=CBC3, param_grid = parameters, cv = cv, scoring='accuracy', n_jobs=-1)
Grid_CBC3.fit(X_train, y_train)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",Grid_CBC3.best_estimator_)
print("\n The best score across ALL searched params:\n",Grid_CBC3.best_score_)
print("\n The best parameters across ALL searched params:\n",Grid_CBC3.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x7f5909aee210>

 The best score across ALL searched params:
 0.7390283423582071

 The best parameters across ALL searched params:
 {'depth': 6, 'learning_rate': 0.04, 'n_estimators': 2500}


In [73]:
CBC3.fit(X_train,y_train)
CBC3_pred = best_less_cols_pipe.predict(X_test)
CBC3_pred_accuracy = accuracy_score(y_test, CBC3_pred)
CBC3_pred_accuracy

0.7896924355777224