In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import  StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import pickle

In [2]:
df = pd.read_csv("../data/processed/gladiador_data_procesado.csv")
df.head()

Unnamed: 0,Wins,Public Favor,Survived,Allegiance Network_Strong
0,11,0.841842,False,1
1,7,0.651044,True,0
2,6,0.593816,True,1
3,6,0.540815,False,1
4,12,0.761651,False,1


In [3]:
df['Survived'] = df['Survived'].astype(int)
df.head()

Unnamed: 0,Wins,Public Favor,Survived,Allegiance Network_Strong
0,11,0.841842,0,1
1,7,0.651044,1,0
2,6,0.593816,1,1
3,6,0.540815,0,1
4,12,0.761651,0,1


## MODELO RANDOM FOREST CLASSIFIER

In [6]:
X = df[["Wins", "Public Favor", "Allegiance Network_Strong"]]
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7980, 3)
(7980,)
(1996, 3)
(1996,)


In [7]:
cv= KFold(10)
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=3, random_state=42)
rfc.fit(X_train, y_train)
cv_rfc = cross_val_score(estimator=rfc, cv=cv, X= X, y= y, scoring="accuracy")

In [8]:
print(cv_rfc)

[0.71943888 0.73947896 0.72144289 0.74849699 0.73547094 0.7254509
 0.72016048 0.72316951 0.72116349 0.72517553]


In [9]:
print(cv_rfc.max())

0.748496993987976


In [10]:
print(cv_rfc.mean())

0.7279448566139299


In [11]:
#Realizo Predicciones

# Predicciones sobre el mismo conjunto de datos de entrenamiento
pred_rfc = rfc.predict(X_test)

In [12]:
#Obtengo las PROBABILIDADES de predicción
pred_proba = rfc.predict_proba(X_test)[:, 1]

In [13]:
print("--- Métricas de Clasificación ---")
print("Precisión (Accuracy): ", accuracy_score(y_test, pred_rfc))
print("Precisión (Clase 1): ", precision_score(y_test, pred_rfc))
print("Sensibilidad (Recall - Clase 1): ", recall_score(y_test, pred_rfc))
print("Puntuación F1 (Clase 1): ", f1_score(y_test, pred_rfc))
# Si pred_dtc son probabilidades, podrías calcular el ROC AUC:
print("ROC AUC: ", roc_auc_score(y_test, pred_proba))

--- Métricas de Clasificación ---
Precisión (Accuracy):  0.7289579158316634
Precisión (Clase 1):  0.6889460154241646
Sensibilidad (Recall - Clase 1):  0.8187372708757638
Puntuación F1 (Clase 1):  0.7482550023266635
ROC AUC:  0.7977259306571542


In [14]:
# También puedes predecir para un nuevo dato, por ejemplo:
# Un personaje con 8 victorias, 0.7 de favor público y una red de lealtad fuerte (1)
new_data = pd.DataFrame([[8, 0.7, 1]], columns=["Wins", "Public Favor", "Allegiance Network_Strong"])
new_prediction = rfc.predict(new_data)

print(f"\nPredicción para un nuevo personaje (Wins: 8, Public Favor: 0.7, Allegiance Network_Strong: 1): {new_prediction[0]}")

if new_prediction[0] == 1:
    print("El modelo predice que este personaje Sobreviviría.")
else:
    print("El modelo predice que este personaje No Sobreviviría.")


Predicción para un nuevo personaje (Wins: 8, Public Favor: 0.7, Allegiance Network_Strong: 1): 1
El modelo predice que este personaje Sobreviviría.


In [15]:
#Creo un DataFrame completo de entrenamiento combinando X_train y y_train
train_df = pd.concat([X_train, y_train], axis=1)

#Creo un DataFrame completo de prueba combinando X_test y y_test
test_df = pd.concat([X_test, y_test], axis=1)

# Guardar los DataFrames en archivos CSV
train_df.to_csv("../data/train/train_data.csv", index=False)
test_df.to_csv("../data/test/test_data.csv", index=False)

In [16]:
with open("../models/rfc_model.pkl", "wb") as f:
    pickle.dump(rfc, f)