In [1]:
import pandas as pd

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv('./spain_df.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)
#df = df[df['tier'] == 1]
df

Unnamed: 0,year,country,league,tier,team,position,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles,will_promote,will_relegate
0,2005,ES,LaLiga,1,FC Barcelona,1.0,34,25.4,16,9630000,327500000,False,False,True,False,False
1,2005,ES,LaLiga,1,Real Madrid,2.0,36,25.4,13,7820000,281600000,False,False,False,False,False
2,2005,ES,LaLiga,1,Valencia CF,3.0,34,27.3,14,6280000,213550000,False,False,False,False,False
3,2005,ES,LaLiga,1,CA Osasuna,4.0,25,26.6,7,1630000,40700000,False,False,False,False,False
4,2005,ES,LaLiga,1,Sevilla FC,5.0,34,26.0,10,2850000,96850000,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,2023,ES,LaLiga2,2,CD Mirandés,18.0,36,23.7,11,576000,20750000,False,False,False,False,False
794,2023,ES,LaLiga2,2,SD Amorebieta,19.0,28,27.1,6,455000,12750000,False,True,False,False,True
795,2023,ES,LaLiga2,2,AD Alcorcón,20.0,38,26.4,6,298000,11330000,False,True,False,False,True
796,2023,ES,LaLiga2,2,FC Andorra,21.0,32,25.9,6,639000,20450000,False,False,False,False,True


In [3]:
# Obtén los años únicos y selecciona el 80% para entrenamiento y 20% para prueba
years = df['year'].unique()
years_train, years_test = train_test_split(years, test_size=0.2, random_state=42)

# Filtra los datos para crear los conjuntos de entrenamiento y prueba
train_data = df[df['year'].isin(years_train)]
test_data = df[df['year'].isin(years_test)]

# Divide las características y la variable objetivo
X_train = train_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_train = train_data['will_relegate']

X_test = test_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_test = test_data['will_relegate']


# Comprueba la distribución de años
print("Años en entrenamiento:", years_train)
print("Años en prueba:", years_test)

Años en entrenamiento: [2013 2021 2008 2018 2020 2022 2007 2014 2023 2009 2017 2012 2015 2019
 2011]
Años en prueba: [2005 2010 2016 2006]


In [4]:
criterios = ['gini', 'entropy', 'log_loss']
profundidades = [3, 5, 10, 15, 25, 50]
estimators = [10, 20, 25, 50, 100]
mejor_clf = {'profundidad':'',
              'criterio':'',
              'estimator': '',
              'pesos_personalizados': '',
              'accuracy':'',
              'precision':'',
              'recall':'',
              'f1':''}
mejor_f1 = 0

# Asignamos un peso mayor para la clase 'False' y un peso menor para la clase 'True'
pesos_personalizados = ['balanced_subsample', 'balanced', {0: 1, 1: 5}, {0: 1, 1: 10}]

In [5]:
for criterio in tqdm(criterios):
    for estimator in estimators:
        for profundidad in profundidades:
            for peso in pesos_personalizados:

                # Crear el modelo de RandomForest
                rfc = RandomForestClassifier(n_estimators=estimator, criterion=criterio, max_depth=profundidad, random_state=1337, class_weight=peso)
                rfc.fit(X_train, y_train)
                predicciones = rfc.predict(X_test)
                
                # Calcular las métricas
                acc = accuracy_score(y_test, predicciones)
                prec = precision_score(y_test, predicciones)
                rec = recall_score(y_test, predicciones)
                f1 = f1_score(y_test, predicciones)

                if f1 >= mejor_f1:
                    mejor_f1 = f1
                    mejor_clf['profundidad'] = profundidad
                    mejor_clf['criterio'] = criterio
                    mejor_clf['estimator'] = estimator
                    mejor_clf['pesos_personalizados'] = peso
                    mejor_clf['accuracy'] = acc
                    mejor_clf['precision'] = prec
                    mejor_clf['recall'] = rec
                    mejor_clf['f1'] = f1

100%|██████████| 3/3 [00:23<00:00,  7.87s/it]


In [6]:
mejor_clf

{'profundidad': 5,
 'criterio': 'log_loss',
 'estimator': 20,
 'pesos_personalizados': 'balanced_subsample',
 'accuracy': 0.7380952380952381,
 'precision': 0.3684210526315789,
 'recall': 0.7241379310344828,
 'f1': 0.4883720930232558}

In [7]:
rfc = RandomForestClassifier(n_estimators=mejor_clf['estimator'], criterion=mejor_clf['criterio'], max_depth=mejor_clf['profundidad'], random_state=1337, class_weight=mejor_clf['pesos_personalizados'])
rfc

In [8]:
rfc.fit(X_train, y_train)
predicciones = rfc.predict(X_test)

In [9]:
report = classification_report(y_test, predicciones, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df[['precision', 'recall', 'f1-score', 'support']])

              precision    recall  f1-score     support
False          0.927928  0.741007  0.824000  139.000000
True           0.368421  0.724138  0.488372   29.000000
accuracy       0.738095  0.738095  0.738095    0.738095
macro avg      0.648174  0.732573  0.656186  168.000000
weighted avg   0.831346  0.738095  0.766064  168.000000


In [10]:
nuevo_df = pd.read_csv('./spain_df.csv')
nuevo_df.drop(columns=["Unnamed: 0"], inplace=True)
nuevo_df = nuevo_df[nuevo_df['tier'] == 1]
predict_df = nuevo_df.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
predict_df = predict_df[predict_df['year'] == 2023]

predict_df

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
756,2023,1,37,25.6,21,30680000,1140000000,False,False,True
757,2023,1,38,24.0,15,24110000,916200000,False,False,True
758,2023,1,37,25.6,15,8050000,297900000,False,False,False
759,2023,1,42,26.9,23,10920000,458450000,False,False,False
760,2023,1,31,27.0,2,9540000,295650000,False,False,False
761,2023,1,40,25.2,10,12460000,498500000,False,False,False
762,2023,1,42,27.0,21,5350000,224880000,False,False,False
763,2023,1,39,27.2,15,6470000,252500000,False,False,False
764,2023,1,36,23.8,11,6800000,244730000,False,False,False
765,2023,1,41,24.5,15,3020000,123700000,True,False,False


In [11]:
predicciones_nuevo_df = rfc.predict(predict_df)

# Add 'team' and 'position' to nuevo_df (if needed, you can keep them for the final prediction)
predict_df['team'] = nuevo_df['team']
predict_df['position'] = nuevo_df['position']

# Add the predictions to the DataFrame
predict_df['predicciones'] = predicciones_nuevo_df

In [12]:
predict_df

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles,team,position,predicciones
756,2023,1,37,25.6,21,30680000,1140000000,False,False,True,Real Madrid,1.0,False
757,2023,1,38,24.0,15,24110000,916200000,False,False,True,FC Barcelona,2.0,False
758,2023,1,37,25.6,15,8050000,297900000,False,False,False,Girona FC,3.0,False
759,2023,1,42,26.9,23,10920000,458450000,False,False,False,Atlético de Madrid,4.0,False
760,2023,1,31,27.0,2,9540000,295650000,False,False,False,Athletic Bilbao,5.0,False
761,2023,1,40,25.2,10,12460000,498500000,False,False,False,Real Sociedad,6.0,False
762,2023,1,42,27.0,21,5350000,224880000,False,False,False,Real Betis Balompié,7.0,False
763,2023,1,39,27.2,15,6470000,252500000,False,False,False,Villarreal CF,8.0,False
764,2023,1,36,23.8,11,6800000,244730000,False,False,False,Valencia CF,9.0,False
765,2023,1,41,24.5,15,3020000,123700000,True,False,False,Deportivo Alavés,10.0,True
