In [67]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [39]:
df = pd.read_csv("../data/processed/gladiador_data_procesado.csv")
df.head()

Unnamed: 0,Wins,Public Favor,Survived,Allegiance Network_Strong
0,11,0.841842,False,1
1,7,0.651044,True,0
2,6,0.593816,True,1
3,6,0.540815,False,1
4,12,0.761651,False,1


In [40]:
df['Survived'] = df['Survived'].astype(int)
df.head()

Unnamed: 0,Wins,Public Favor,Survived,Allegiance Network_Strong
0,11,0.841842,0,1
1,7,0.651044,1,0
2,6,0.593816,1,1
3,6,0.540815,0,1
4,12,0.761651,0,1


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9976 entries, 0 to 9975
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Wins                       9976 non-null   int64  
 1   Public Favor               9976 non-null   float64
 2   Survived                   9976 non-null   int64  
 3   Allegiance Network_Strong  9976 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 311.9 KB


## MODELO BAGGING CLASSIFIER

In [42]:
X = df[["Wins", "Public Favor", "Allegiance Network_Strong"]]
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7980, 3)
(7980,)
(1996, 3)
(1996,)


In [43]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
estimator = DecisionTreeClassifier(max_depth=3, random_state=42)

bag_clf = BaggingClassifier(
    estimator = estimator,
    n_estimators=300, # Cantidad de modelos
    max_samples=100, # Muestras utilizadas en boostrapping
    bootstrap=True, # Usamos boostrapping
    max_features = 3, # Features que utiliza en el boostrapping. Cuanto más bajo, mejor generalizará y menos overfitting
    random_state=42)


bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7224448897795591

In [45]:
cv= KFold(10)
cv_bc = cross_val_score(estimator=estimator, X=X, y=y, cv=cv, scoring="accuracy")

In [46]:
print(cv_bc.max())

0.748496993987976


# MODELO RANDOM FOREST

In [47]:
cv= KFold(10)
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=3, random_state=42)
rfc.fit(X_train, y_train)
cv_rfc = cross_val_score(estimator=rfc, cv=cv, X= X, y= y, scoring="accuracy")

In [48]:
print(cv_rfc)

[0.71943888 0.73947896 0.72144289 0.74849699 0.73547094 0.7254509
 0.72016048 0.72316951 0.72116349 0.72517553]


In [49]:
print(cv_rfc.max())

0.748496993987976


In [50]:
print(cv_rfc.mean())

0.7279448566139299


# MODELO ADABOST CLASSIFIER

In [51]:
estimator = DecisionTreeClassifier(max_depth=1)

ada_clf = AdaBoostClassifier(estimator = estimator,
                             n_estimators=200,
                             learning_rate=0.5, #va de 0 a 1 /es cuanto e tiene en cuenta las predicciones del valor anterior
                             random_state=42)

cv_ada = cross_val_score(estimator=ada_clf, X=X, y=y, cv=10, scoring="accuracy")

In [52]:
print(cv_ada)
print(cv_ada.mean())

[0.68937876 0.71943888 0.7244489  0.73446894 0.71643287 0.71042084
 0.69809428 0.70712136 0.73420261 0.7111334 ]
0.7145140833321608


# MODELO GRADIENT BOOSTING CLASSIFIER

In [53]:
gbc = GradientBoostingClassifier(max_depth=5,n_estimators=100, random_state=42, learning_rate=0.5)
cv_gbc = cross_val_score(estimator=gbc, X=X, y=y,cv=10,scoring="accuracy")

In [54]:
print(cv_gbc)
print(cv_gbc.mean())

[0.67635271 0.70240481 0.70240481 0.69338677 0.70240481 0.69038076
 0.69709127 0.70110331 0.69007021 0.70310933]
0.6958708791705778


# MODELO XGB CLASSIFIER

In [55]:
xgb = XGBClassifier(n_estimators = 100, random_state = 10, learning_rate = 0.5, eval_metric='logloss')
cv_xgb = cross_val_score(estimator=xgb, X=X, y=y, cv= 10, scoring="accuracy")

In [56]:
print(cv_xgb)
print(cv_xgb.mean())

[0.69138277 0.7244489  0.70440882 0.71543086 0.7004008  0.71242485
 0.71013039 0.70511535 0.70310933 0.69809428]
0.7064946342032108


# MODELO REGRESION LOGISTICA

In [57]:
lr_model = LogisticRegression(random_state=42, solver='liblinear')
cv_lr = cross_val_score(estimator=lr_model, X=X, y=y, cv= 10, scoring="accuracy")

In [58]:
print(cv_lr)
print(cv_lr.mean())

[0.70541082 0.73146293 0.73446894 0.73647295 0.72244489 0.73146293
 0.71213641 0.71013039 0.73219659 0.71915747]
0.7235344309481551


# MODELO DECISSION TREE CLASSIFIER

In [63]:
dtc_model = DecisionTreeClassifier(random_state=42)
cv_dtc = cross_val_score(estimator=dtc_model, cv=10, X= X, y= y, scoring="accuracy")

In [64]:
print(cv_dtc)
print(cv_dtc.mean())

[0.6492986  0.64428858 0.64529058 0.64729459 0.65931864 0.64829659
 0.65496489 0.64192578 0.64994985 0.65496489]
0.6495592991399046


### PRIMEROS RESULTADOS

In [66]:
modelos = [cv_bc.mean(), cv_rfc.mean(), cv_ada.mean(), cv_gbc.mean(), cv_xgb.mean(), cv_lr.mean(), cv_dtc.mean()]
df_resultados = pd.DataFrame(modelos, columns=["Accuracy"], index=["BaggingClassifier", "RandomForestClassifier", "AdaBoostClassifier", "GradientBoostingClassifier", "XGBClassifier", "LogisticRegression", "DecissionTreeClassifier"])

In [61]:
df_resultados.sort_values("Accuracy", ascending=False)

Unnamed: 0,Accuracy
RandomForestClassifier,0.727945
BaggingClassifier,0.725138
LogisticRegression,0.723534
AdaBoostClassifier,0.714514
XGBClassifier,0.706495
GradientBoostingClassifier,0.695871


### HIPERPARAMETRIZACION

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7980, 3)
(7980,)
(1996, 3)
(1996,)


In [36]:
rfc = RandomForestClassifier()
parametros={
    "n_estimators":[50,100,150],
    "max_depth":[3,5,7,10,12],
    "max_features":[2,3,4],
    "bootstrap":[True, False]
}

gs_rfc = GridSearchCV(rfc, parametros, scoring="accuracy", cv = 5, verbose=3, n_jobs=1)
gs_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=50;, score=0.727 total time=   0.3s
[CV 2/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=50;, score=0.731 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=50;, score=0.722 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=50;, score=0.713 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=50;, score=0.744 total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=100;, score=0.727 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=100;, score=0.731 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=100;, score=0.723 total time=   0.3s
[CV 4/5] END bootstrap=True, max_depth=3, max_features=2, n_estimators=100;, sc

In [37]:
print(gs_rfc.best_estimator_)
print(gs_rfc.best_score_)

RandomForestClassifier(max_depth=5, max_features=2, n_estimators=150)
0.7308270676691728


## PIPELINE

In [71]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", BaggingClassifier())

])

parametros = [
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model": [BaggingClassifier(random_state=42)],
        "model__n_estimators":[50,100,150],
        "model__max_samples": [0.5, 1]
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model": [RandomForestClassifier(random_state=42)],
        "model__n_estimators": [50, 100, 150],
        "model__max_depth": [3, 5, 7, 10, 12]
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model": [AdaBoostClassifier(random_state=42)],
        "model__n_estimators":[50,100,150],
        "model__learning_rate":[0.5, 0.75]
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model": [GradientBoostingClassifier(random_state=42)],
        "model__n_estimators":[50,100,150],
        "model__learning_rate":[0.5, 0.75]
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model":[XGBClassifier(random_state=42)],
        "model__n_estimators":[50, 100, 150],
        "model__learning_rate":[0.5, 0.75],
        "model__max_depth":[3,5,7,12]
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model":[LogisticRegression(random_state=42)],
    },
    {
        "scaler":[None, StandardScaler(), MinMaxScaler()],
        "model":[DecisionTreeClassifier(random_state=42)],
        "model__max_depth": [3, 5, 7, 10, 12]
    }
]

gs_final = GridSearchCV(pipeline, parametros, cv = 10, scoring="accuracy", n_jobs=1, verbose=3)
gs_final.fit(X_train, y_train)

Fitting 10 folds for each of 189 candidates, totalling 1890 fits
[CV 1/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.679 total time=   0.3s
[CV 2/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.688 total time=   0.4s
[CV 3/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.672 total time=   0.4s
[CV 4/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.673 total time=   0.3s
[CV 5/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.693 total time=   0.4s
[CV 6/10] END model=BaggingClassifier(random_state=42), model__max_samples=0.5, model__n_estimators=50, scaler=None;, score=0.662 total time=   0.4s
[CV 7/10] END model=BaggingClassifier(ran