In [None]:
# It's python lib for our boosting model! Install it for the FIRST time only. https://catboost.ai/en/docs/installation/python-installation-method-pip-install
#!pip install catboost 

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

import catboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


In [3]:
print(np.__version__)
print(pd.__version__)
print(catboost.__version__)
!python --version

1.26.4
2.2.1
1.2.3
Python 3.12.2


In [4]:
data = pd.read_csv('../raw_data/data.csv')
data.head(2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Bantamweight,0.0,0.0,...,0,1,0,0,Orthodox,170.18,177.8,135.0,31.0,27.0
1,Trevin Giles,Roman Dolidze,Herb Dean,2021-03-20,"Las Vegas, Nevada, USA",Red,False,Middleweight,0.5,0.0,...,0,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0


In [5]:
X = data.drop(['Winner'], axis=1)
y= data.Winner

In [6]:
X = X.replace('NaN', np.nan)

In [7]:
X['date']= pd.to_datetime(X['date'])
X['date'] = X['date'].apply(lambda x: x.timestamp()).astype(int)
X['date'].dtype

dtype('int64')

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6012 entries, 0 to 6011
Columns: 143 entries, R_fighter to R_age
dtypes: bool(1), float64(106), int64(29), object(7)
memory usage: 6.5+ MB


In [9]:
col_names = X.columns.to_list()
categorical_column_names = X.select_dtypes(include=['object']).columns.to_list()
categorical_indices = [col_names.index(i) for i in categorical_column_names]
categorical_indices

[0, 1, 2, 4, 6, 70, 137]

In [10]:
categorical_column_names

['R_fighter',
 'B_fighter',
 'Referee',
 'location',
 'weight_class',
 'B_Stance',
 'R_Stance']

In [11]:
for col in categorical_column_names:
    X[col] = X[col].fillna('Unknown')
    X[col] = X[col].apply(str)

In [12]:
num_column_names = X.select_dtypes(include=['number']).columns.to_list()
for col in num_column_names:
    X[col] = X[col].apply(np.log)
    X[col] = X[col].fillna(X[col].median())

In [13]:
X.head(2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,21.203343,"Las Vegas, Nevada, USA",False,Bantamweight,-inf,-inf,-0.867501,...,-inf,0.0,-inf,-inf,Orthodox,5.136857,5.180659,4.905275,3.433987,3.295837
1,Trevin Giles,Roman Dolidze,Herb Dean,21.203343,"Las Vegas, Nevada, USA",False,Middleweight,-0.693147,-inf,-0.415515,...,-inf,1.098612,-inf,-inf,Orthodox,5.20883,5.236229,5.220356,3.465736,3.332205


In [14]:
y.value_counts(normalize=True)

Winner
Red     0.661843
Blue    0.319860
Draw    0.018297
Name: proportion, dtype: float64

In [15]:
# Replace non-Red values in Winner-column for 2-class-classification
y = y.apply(lambda x: 'Red' if x=='Red' else 'noRed').apply(str)
y.dtype

dtype('O')

In [16]:
y.value_counts(normalize=True)

Winner
Red      0.661843
noRed    0.338157
Name: proportion, dtype: float64

In [17]:
X.head(2)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Adrian Yanez,Gustavo Lopez,Chris Tognoni,21.203343,"Las Vegas, Nevada, USA",False,Bantamweight,-inf,-inf,-0.867501,...,-inf,0.0,-inf,-inf,Orthodox,5.136857,5.180659,4.905275,3.433987,3.295837
1,Trevin Giles,Roman Dolidze,Herb Dean,21.203343,"Las Vegas, Nevada, USA",False,Middleweight,-0.693147,-inf,-0.415515,...,-inf,1.098612,-inf,-inf,Orthodox,5.20883,5.236229,5.220356,3.465736,3.332205


In [18]:
X['date'].dtype

dtype('float64')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, 
                                                       train_size=0.8, 
                                                       random_state=42, stratify = y)


In [20]:
X_train = pd.DataFrame(data=X_train, columns=X.columns)
X_test = pd.DataFrame(data=X_test, columns=X.columns)

In [21]:
cv = StratifiedKFold(n_splits = 5)
from_box_model = catboost.CatBoostClassifier(n_estimators=200, silent=True,
                                             cat_features=categorical_indices,
                                             eval_metric='AUC')

In [22]:
num_preproc = Pipeline([
    ("num_imputer", SimpleImputer(strategy = "median")),
    ("scaler", RobustScaler())
])

In [23]:
cat_preproc = Pipeline([
    ("cat_imputer", SimpleImputer(strategy = "constant", fill_value="Unknown"))
])
bool_preproc = Pipeline([
    ("bool_imputer", SimpleImputer(strategy = "most_frequent")),
    ("to_str", FunctionTransformer(str))
])

In [24]:
preproc = ColumnTransformer([
    ("num_tr", num_preproc, make_column_selector(dtype_include = ["float64", "int64"])),
    ("cat_tr", cat_preproc, make_column_selector(dtype_include = ["object"])),
    ("bool_tr", bool_preproc, make_column_selector(dtype_include = ["bool"]))
], remainder="passthrough")

In [25]:
from_box_pipe = Pipeline([
    ("preproc", preproc),
    ("from_box_classifier", from_box_model)
])
    
from_box_pipe

In [26]:
mean_accuracy = cross_val_score(from_box_pipe, X_train, y=y_train, scoring='accuracy', cv=cv).mean()
mean_accuracy

0.7192734958603845

In [27]:
from_box_pipe.fit(X_train,y_train)
y_pred = from_box_pipe.predict(X_test)

In [28]:
from_box_accuracy = accuracy_score(y_test, y_pred)
from_box_accuracy

0.742310889443059

In [29]:
#Export the fitted pipeline as a pickle file
with open('../models/from_box_model_acc074.pkl', 'wb') as file:
    pickle.dump(from_box_pipe, file)
print("from_box_pipe is successfully saved as 'from_box_model_acc074.pkl'")

from_box_pipe is successfully saved as 'from_box_model_acc074.pkl'
