In [39]:
import pandas as pd

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [40]:
df = pd.read_csv('./spain_df.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)
#df = df[df['tier'] == 1]
df

Unnamed: 0,year,country,league,tier,team,position,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles,will_promote,will_relegate
0,2005,ES,LaLiga,1,FC Barcelona,1.0,34,25.4,16,9630000,327500000,False,False,True,False,False
1,2005,ES,LaLiga,1,Real Madrid,2.0,36,25.4,13,7820000,281600000,False,False,False,False,False
2,2005,ES,LaLiga,1,Valencia CF,3.0,34,27.3,14,6280000,213550000,False,False,False,False,False
3,2005,ES,LaLiga,1,Atlético de Madrid,10.0,31,24.2,8,4330000,134150000,False,False,False,False,False
4,2005,ES,LaLiga,1,Deportivo de La Coruña,8.0,34,27.5,8,3940000,133800000,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,2023,ES,LaLiga2,2,Racing Ferrol,10.0,29,28.6,2,641000,18600000,False,True,False,False,False
794,2023,ES,LaLiga2,2,FC Cartagena,14.0,42,27.4,12,369000,15500000,False,False,False,False,False
795,2023,ES,LaLiga2,2,CD Eldense,16.0,33,28.0,7,397000,13100000,False,True,False,False,False
796,2023,ES,LaLiga2,2,SD Amorebieta,19.0,28,27.1,6,455000,12750000,False,True,False,False,True


In [41]:
# Obtén los años únicos y selecciona el 80% para entrenamiento y 20% para prueba
years = df['year'].unique()
years = years[years != 2023]
years_train, years_test = train_test_split(years, test_size=0.2, random_state=42)

# Filtra los datos para crear los conjuntos de entrenamiento y prueba
train_data = df[df['year'].isin(years_train)]
test_data = df[df['year'].isin(years_test)]

# Divide las características y la variable objetivo
X_train = train_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_train = train_data['will_relegate']

X_test = test_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_test = test_data['will_relegate']


# Comprueba la distribución de años
print("Años en entrenamiento:", years_train)
print("Años en prueba:", years_test)

Años en entrenamiento: [2008 2018 2021 2020 2016 2007 2014 2022 2009 2017 2012 2015 2019 2011]
Años en prueba: [2005 2006 2013 2010]


In [42]:
criterios = ['gini', 'entropy', 'log_loss']
profundidades = [3, 5, 10, 15, 25, 50]
estimators = [10, 20, 25, 50, 100]
mejor_clf = {'profundidad':'',
              'criterio':'',
              'estimator': '',
              'pesos_personalizados': '',
              'accuracy':'',
              'precision':'',
              'recall':'',
              'f1':''}
mejor_f1 = 0

# Asignamos un peso mayor para la clase 'False' y un peso menor para la clase 'True'
pesos_personalizados = ['balanced_subsample', 'balanced', {0: 1, 1: 5}, {0: 1, 1: 10}]

In [43]:
for criterio in tqdm(criterios):
    for estimator in estimators:
        for profundidad in profundidades:
            for peso in pesos_personalizados:

                # Crear el modelo de RandomForest
                rfc = RandomForestClassifier(n_estimators=estimator, criterion=criterio, max_depth=profundidad, random_state=1337, class_weight=peso)
                rfc.fit(X_train, y_train)
                predicciones = rfc.predict(X_test)
                
                # Calcular las métricas
                acc = accuracy_score(y_test, predicciones)
                prec = precision_score(y_test, predicciones)
                rec = recall_score(y_test, predicciones)
                f1 = f1_score(y_test, predicciones)

                if f1 >= mejor_f1:
                    mejor_f1 = f1
                    mejor_clf['profundidad'] = profundidad
                    mejor_clf['criterio'] = criterio
                    mejor_clf['estimator'] = estimator
                    mejor_clf['pesos_personalizados'] = peso
                    mejor_clf['accuracy'] = acc
                    mejor_clf['precision'] = prec
                    mejor_clf['recall'] = rec
                    mejor_clf['f1'] = f1

100%|██████████| 3/3 [00:18<00:00,  6.29s/it]


In [44]:
mejor_clf

{'profundidad': 5,
 'criterio': 'log_loss',
 'estimator': 25,
 'pesos_personalizados': 'balanced_subsample',
 'accuracy': 0.6845238095238095,
 'precision': np.float64(0.32857142857142857),
 'recall': np.float64(0.7931034482758621),
 'f1': np.float64(0.46464646464646464)}

In [45]:
rfc = RandomForestClassifier(n_estimators=mejor_clf['estimator'], criterion=mejor_clf['criterio'], max_depth=mejor_clf['profundidad'], random_state=1337, class_weight=mejor_clf['pesos_personalizados'])
rfc

In [46]:
rfc.fit(X_train, y_train)
predicciones = rfc.predict(X_test)

In [47]:
report = classification_report(y_test, predicciones, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df[['precision', 'recall', 'f1-score', 'support']])

              precision    recall  f1-score     support
False          0.938776  0.661871  0.776371  139.000000
True           0.328571  0.793103  0.464646   29.000000
accuracy       0.684524  0.684524  0.684524    0.684524
macro avg      0.633673  0.727487  0.620509  168.000000
weighted avg   0.833443  0.684524  0.722562  168.000000


In [48]:
nuevo_df = pd.read_csv('./spain_df.csv')
nuevo_df.drop(columns=["Unnamed: 0"], inplace=True)
nuevo_df = nuevo_df[nuevo_df['tier'] == 1]
predict_df = nuevo_df.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
predict_df = predict_df[predict_df['year'] == 2023]

predict_df

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
360,2023,1,37,25.6,21,30680000,1140000000,False,False,True
361,2023,1,38,24.0,15,24110000,916200000,False,False,True
362,2023,1,40,25.2,10,12460000,498500000,False,False,False
363,2023,1,42,26.9,23,10920000,458450000,False,False,False
364,2023,1,37,25.6,15,8050000,297900000,False,False,False
365,2023,1,31,27.0,2,9540000,295650000,False,False,False
366,2023,1,39,27.2,15,6470000,252500000,False,False,False
367,2023,1,36,23.8,11,6800000,244730000,False,False,False
368,2023,1,49,26.7,27,4670000,228750000,False,False,True
369,2023,1,42,27.0,21,5350000,224880000,False,False,False


In [49]:
predicciones_nuevo_df = rfc.predict(predict_df)
probs = rfc.predict_proba(predict_df)
probs_descender = probs[:, 1]

# Add 'team' and 'position' to nuevo_df (if needed, you can keep them for the final prediction)
predict_df['team'] = nuevo_df['team']
predict_df['position'] = nuevo_df['position']

# Add the predictions to the DataFrame
predict_df['predicciones'] = predicciones_nuevo_df
predict_df['probabilidades'] = probs_descender

In [50]:
predict_df.sort_values(by='position')

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles,team,position,predicciones,probabilidades
360,2023,1,37,25.6,21,30680000,1140000000,False,False,True,Real Madrid,1.0,False,0.0
361,2023,1,38,24.0,15,24110000,916200000,False,False,True,FC Barcelona,2.0,False,0.0
364,2023,1,37,25.6,15,8050000,297900000,False,False,False,Girona FC,3.0,False,0.0
363,2023,1,42,26.9,23,10920000,458450000,False,False,False,Atlético de Madrid,4.0,False,0.0
365,2023,1,31,27.0,2,9540000,295650000,False,False,False,Athletic Bilbao,5.0,False,0.00391
362,2023,1,40,25.2,10,12460000,498500000,False,False,False,Real Sociedad,6.0,False,0.0
369,2023,1,42,27.0,21,5350000,224880000,False,False,False,Real Betis Balompié,7.0,False,0.0
366,2023,1,39,27.2,15,6470000,252500000,False,False,False,Villarreal CF,8.0,False,0.0
367,2023,1,36,23.8,11,6800000,244730000,False,False,False,Valencia CF,9.0,False,0.0
373,2023,1,41,24.5,15,3020000,123700000,False,True,False,Deportivo Alavés,10.0,False,0.45102
