In [2]:
import pandas as pd

from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [14]:
df = pd.read_csv('./1st_2nd_tiers_top_5_leagues.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)
df.dropna(inplace=True)
df = df[df['tier'] == 1]
df.dtypes

year                  int64
country              object
league               object
tier                  int64
team                 object
position            float64
squad_depth           int64
avg_age             float64
foreigners            int64
avg_market_value      int64
market_value          int64
has_relegated        object
has_promoted         object
has_won_titles         bool
will_promote         object
will_relegate        object
dtype: object

In [6]:
# Obtén los años únicos y selecciona el 80% para entrenamiento y 20% para prueba
years = df['year'].unique()
years = years[years != 2023]
years_train, years_test = train_test_split(years, test_size=0.2, random_state=42)

# Filtra los datos para crear los conjuntos de entrenamiento y prueba
train_data = df[df['year'].isin(years_train)]
test_data = df[df['year'].isin(years_test)]

# Divide las características y la variable objetivo
X_train = train_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_train = train_data['will_relegate']

X_test = test_data.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
y_test = test_data['will_relegate']


# Comprueba la distribución de años
print("Años en entrenamiento:", years_train)
print("Años en prueba:", years_test)

Años en entrenamiento: [2008 2018 2021 2020 2016 2007 2014 2022 2009 2017 2012 2015 2019 2011]
Años en prueba: [2005 2006 2013 2010]


In [17]:
y_train = y_train.astype(bool)
y_test = y_test.astype(bool)

In [18]:
criterios = ['gini', 'entropy', 'log_loss']
profundidades = [3, 5, 10, 15, 25, 50]
estimators = [10, 20, 25, 50, 100]
mejor_clf = {'profundidad':'',
              'criterio':'',
              'estimator': '',
              'pesos_personalizados': '',
              'accuracy':'',
              'precision':'',
              'recall':'',
              'f1':''}
mejor_f1 = 0

# Asignamos un peso mayor para la clase 'False' y un peso menor para la clase 'True'
pesos_personalizados = ['balanced_subsample', 'balanced', {0: 1, 1: 5}, {0: 1, 1: 10}]

In [19]:
for criterio in tqdm(criterios):
    for estimator in estimators:
        for profundidad in profundidades:
            for peso in pesos_personalizados:

                # Crear el modelo de RandomForest
                rfc = RandomForestClassifier(n_estimators=estimator, criterion=criterio, max_depth=profundidad, random_state=1337, class_weight=peso)
                rfc.fit(X_train, y_train)
                predicciones = rfc.predict(X_test)
                
                # Calcular las métricas
                acc = accuracy_score(y_test, predicciones)
                prec = precision_score(y_test, predicciones)
                rec = recall_score(y_test, predicciones)
                f1 = f1_score(y_test, predicciones)

                if f1 >= mejor_f1:
                    mejor_f1 = f1
                    mejor_clf['profundidad'] = profundidad
                    mejor_clf['criterio'] = criterio
                    mejor_clf['estimator'] = estimator
                    mejor_clf['pesos_personalizados'] = peso
                    mejor_clf['accuracy'] = acc
                    mejor_clf['precision'] = prec
                    mejor_clf['recall'] = rec
                    mejor_clf['f1'] = f1

100%|██████████| 3/3 [00:31<00:00, 10.61s/it]


In [73]:
mejor_clf

{'profundidad': 5,
 'criterio': 'log_loss',
 'estimator': 25,
 'pesos_personalizados': 'balanced_subsample',
 'accuracy': 0.6845238095238095,
 'precision': 0.32857142857142857,
 'recall': 0.7931034482758621,
 'f1': 0.46464646464646464}

In [20]:
rfc = RandomForestClassifier(n_estimators=mejor_clf['estimator'], criterion=mejor_clf['criterio'], max_depth=mejor_clf['profundidad'], random_state=1337, class_weight=mejor_clf['pesos_personalizados'])
rfc

In [21]:
rfc.fit(X_train, y_train)
predicciones = rfc.predict(X_test)

In [22]:
report = classification_report(y_test, predicciones, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df[['precision', 'recall', 'f1-score', 'support']])

              precision    recall  f1-score     support
False          0.921922  0.919162  0.920540  334.000000
True           0.542373  0.551724  0.547009   58.000000
accuracy       0.864796  0.864796  0.864796    0.864796
macro avg      0.732147  0.735443  0.733774  392.000000
weighted avg   0.865764  0.864796  0.865272  392.000000


In [28]:
nuevo_df = pd.read_csv('./1st_2nd_tiers_top_5_leagues.csv')
nuevo_df.drop(columns=["Unnamed: 0"], inplace=True)
nuevo_df = nuevo_df[nuevo_df['tier'] == 1]
predict_df = nuevo_df.drop(columns=['country', 'league', 'team', 'position', 'will_promote', 'will_relegate'])
predict_df = predict_df[predict_df['year'] == 2023]

predict_df

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles
380,2023,1,36,25.7,21,40630000,1460000000,False,False,True
381,2023,1,40,24.6,23,30080000,1200000000,False,False,False
382,2023,1,56,22.2,26,18120000,1010000000,False,False,False
383,2023,1,45,24.2,30,21240000,955850000,False,False,False
384,2023,1,41,25.6,27,20450000,838500000,False,False,False
...,...,...,...,...,...,...,...,...,...,...
3822,2023,1,43,25.2,20,2930000,125900000,False,False,False
3823,2023,1,31,25.9,9,3710000,115150000,False,False,False
3824,2023,1,35,25.2,19,2069999,72500000,False,True,True
3825,2023,1,37,25.5,27,1770000,65400000,False,True,False


In [29]:
predicciones_nuevo_df = rfc.predict(predict_df)
probs = rfc.predict_proba(predict_df)
probs_descender = probs[:, 1]

# Add 'team' and 'position' to nuevo_df (if needed, you can keep them for the final prediction)
test_df = pd.read_csv('./1st_2nd_tiers_top_5_leagues.csv')
predict_df['league'] = test_df['league']
predict_df['team'] = nuevo_df['team']
predict_df['team'] = nuevo_df['team']
predict_df['position'] = nuevo_df['position']

# Add the predictions to the DataFrame
predict_df['predicciones'] = predicciones_nuevo_df
predict_df['probabilidades'] = probs_descender

In [30]:
predict_df.sort_values(by=['league', 'position'])

Unnamed: 0,year,tier,squad_depth,avg_age,foreigners,avg_market_value,market_value,has_relegated,has_promoted,has_won_titles,league,team,position,predicciones,probabilidades
2149,2023,1,32,24.9,23,20570000,658350000,False,False,False,Bundesliga,Bayer 04 Leverkusen,1.0,False,0.000000
2152,2023,1,38,24.4,19,9120000,346630000,False,False,False,Bundesliga,VfB Stuttgart,2.0,False,0.040000
2148,2023,1,39,25.2,23,24750000,965150000,False,False,True,Bundesliga,Bayern Munich,3.0,False,0.000000
2150,2023,1,32,25.2,22,17040000,545400000,False,False,True,Bundesliga,RB Leipzig,4.0,False,0.000000
2151,2023,1,36,25.6,17,13740000,494800000,False,False,False,Bundesliga,Borussia Dortmund,5.0,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,2023,1,38,27.0,19,2430000,92230000,False,True,False,Serie-A,Cagliari Calcio,16.0,True,0.653370
2959,2023,1,47,25.3,20,2180000,102330000,False,False,False,Serie-A,FC Empoli,17.0,True,0.575919
2957,2023,1,45,25.3,25,2400000,108130000,False,True,True,Serie-A,Frosinone Calcio,18.0,True,0.612709
2956,2023,1,38,25.5,21,3670000,139540000,False,False,False,Serie-A,US Sassuolo,19.0,False,0.231751


In [31]:
predict_df.to_csv('predict.csv')