In [2]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the league dataset and discard the "Unnamed: 0" column
# (presumably an index written out when the CSV was saved -- confirm).
df = pd.read_csv('../../Leagues/1st_2nd_tiers_top_5_leagues.csv').drop(columns=["Unnamed: 0"])
df

Unnamed: 0,year,league,tier,team,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
0,2010,Premier-League,1,Chelsea FC,33,25.9,23,12850000,424100000,False,False,True
1,2010,Premier-League,1,Manchester City,45,24.9,28,8980000,404180000,False,False,False
2,2010,Premier-League,1,Manchester United,43,25.7,29,9020000,388000000,False,False,False
3,2010,Premier-League,1,Liverpool FC,42,24.9,25,8080000,339200000,False,False,False
4,2010,Premier-League,1,Arsenal FC,34,24.8,27,9530000,324000000,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
3034,2024,Ligue-2,2,Red Star FC,28,26.7,10,371000,10400000,False,True,False
3035,2024,Ligue-2,2,Stade Lavallois,24,28.3,11,427000,10250000,False,False,False
3036,2024,Ligue-2,2,AC Ajaccio,26,27.0,11,390000,10150000,False,False,False
3037,2024,Ligue-2,2,FC Annecy,22,25.9,6,357000,7850000,False,False,False


In [4]:
# Class balance of the target: number of rows per value of `has_relegated`.
df.loc[:, 'has_relegated'].value_counts()

has_relegated
False    2844
True      195
Name: count, dtype: int64

In [5]:
# Target variable: the `has_relegated` column.
y = df['has_relegated']

# Feature matrix: every column except the target and the 'league', 'year'
# and 'team' columns.
# NOTE(review): `has_promoted` and `has_won_titles` stay in as features --
# confirm they are known before the relegation outcome, otherwise they leak.
X = df.drop(['league', 'year', 'team', 'has_relegated'], axis=1)

# Hold out 20% of the rows for evaluation; stratifying on `y` keeps the
# proportion of the two target classes the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1337,
)

In [6]:
# Baseline model: a random forest with shallow trees (depth 3).
# class_weight='balanced' re-weights the classes to compensate for the
# imbalance of the target; the fixed seed makes the forest reproducible.
clf = RandomForestClassifier(
    max_depth=3,
    criterion='gini',
    class_weight='balanced',
    random_state=1337,
)
clf

In [7]:
# Train the baseline forest on the training split and predict the held-out
# split (`fit` returns the estimator itself, so the calls can be chained).
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [11]:
# Per-class and averaged metrics of the baseline predictions on the test split.
# The dict form of the report is turned into a table with one row per entry.
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T
print(report_df.loc[:, ['precision', 'recall', 'f1-score', 'support']])

              precision    recall  f1-score     support
False          0.995283  0.741652  0.849950  569.000000
True           0.201087  0.948718  0.331839   39.000000
accuracy       0.754934  0.754934  0.754934    0.754934
macro avg      0.598185  0.845185  0.590894  608.000000
weighted avg   0.944340  0.754934  0.816716  608.000000


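TESTS BELOW — exploratory experiments, starting with a random-forest hyperparameter search (split criterion × maximum tree depth)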

In [40]:
# Search space: split criteria and maximum tree depths to try.
criterios = ['gini', 'entropy', 'log_loss']
profundidades = [3, 5, 10, 15, 25, 50, 100, 500]

# Record of the best configuration found so far; every field starts as an
# empty string until a candidate is accepted.
mejor_clf = dict.fromkeys(
    ('profundidad', 'criterio', 'accuracy', 'precision', 'recall', 'f1'),
    '',
)

# Highest recall seen so far among accepted candidates.
mejor_recall = 0

# Custom class weights: a larger weight for key 0 and a smaller one for key 1
# (presumably the 'False' and 'True' classes -- confirm).
# NOTE(review): this dict is not passed to any estimator in the cells visible
# here; the search loop uses class_weight='balanced_subsample' instead.
pesos_personalizados = {0: 5, 1: 1}

In [41]:
# Exhaustive search over (criterion, max depth): fit one forest per pair and
# keep the metrics of the pair with the highest recall among those whose
# precision is at least 25%.
# NOTE(review): candidates are scored on X_test / y_test, i.e. the same split
# used to report the baseline, so the retained metrics are optimistically
# biased -- consider selecting on a validation split or with cross-validation.
# NOTE(review): per the scikit-learn docs 'entropy' and 'log_loss' both denote
# Shannon information gain, so those two passes are expected to give the same
# forests; with the `>=` tie rule the later one wins -- confirm this is intended.
for criterio in tqdm(criterios):
    for profundidad in profundidades:
        # Build and train the forest for this configuration.
        rfc = RandomForestClassifier(
            criterion=criterio,
            max_depth=profundidad,
            class_weight='balanced_subsample',
            random_state=1337,
        ).fit(X_train, y_train)

        # Predict the held-out rows and score the predictions.
        predicciones = rfc.predict(X_test)
        acc = accuracy_score(y_test, predicciones)
        prec = precision_score(y_test, predicciones)
        rec = recall_score(y_test, predicciones)
        f1 = f1_score(y_test, predicciones)

        # Accept only candidates with precision >= 25% whose recall matches
        # or beats the best so far; only the metrics are stored, not the model.
        if prec >= 0.25 and rec >= mejor_recall:
            mejor_recall = rec
            mejor_clf.update(
                profundidad=profundidad,
                criterio=criterio,
                accuracy=acc,
                precision=prec,
                recall=rec,
                f1=f1,
            )

100%|██████████| 3/3 [00:05<00:00,  1.88s/it]


In [42]:
# Hyperparameters and metrics of the configuration retained by the search loop.
mejor_clf

{'profundidad': 10,
 'criterio': 'log_loss',
 'accuracy': 0.8700657894736842,
 'precision': 0.2872340425531915,
 'recall': 0.6923076923076923,
 'f1': 0.40601503759398494}