# Preprocesamiento

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)  # Mostrar todas las columnas

In [2]:
data = pd.read_csv('final_dataset.csv')

# Revisar distintos tipo de columna FTR
print("\nValores únicos en la columna FTR:")
print(data['FTR'].unique())


Valores únicos en la columna FTR:
['H' 'NH']


In [6]:
print("\nPartidos más recientes:")
recent_matches = data.sort_values(by='Date', ascending=False).head(10)
display(recent_matches)


Partidos más recientes:


Unnamed: 0.1,Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTGS,ATGS,HTGC,ATGC,HTP,ATP,HM1,HM2,HM3,HM4,HM5,AM1,AM2,AM3,AM4,AM5,HomeTeamLP,AwayTeamLP,MW,HTFormPtsStr,ATFormPtsStr,HTFormPts,ATFormPts,HTWinStreak3,HTWinStreak5,HTLossStreak3,HTLossStreak5,ATWinStreak3,ATWinStreak5,ATLossStreak3,ATLossStreak5,HTGD,ATGD,DiffPts,DiffFormPts,DiffLP
6079,6079,2016-05-17,Man United,Bournemouth,3,1,H,46,44,34,64,1.657895,1.105263,L,W,D,W,W,D,L,L,L,W,4.0,18.0,38.0,LWDWW,DLLLW,10,4,0,0,0,0,0,0,0,0,0.315789,-0.526316,0.552632,0.157895,-14.0
6074,6074,2016-05-15,Southampton,Crystal Palace,4,1,H,55,38,40,47,1.578947,1.105263,W,W,W,D,W,W,L,L,D,D,7.0,10.0,38.0,WWWDW,WLLDD,13,5,0,0,0,0,0,0,0,0,0.394737,-0.236842,0.473684,0.210526,-3.0
6071,6071,2016-05-15,Chelsea,Leicester,1,1,NH,58,67,52,35,1.289474,2.105263,D,L,D,W,L,W,D,W,D,W,1.0,14.0,38.0,DLDWL,WDWDW,5,11,0,0,0,0,0,0,0,0,0.157895,0.842105,-0.815789,-0.157895,-13.0
6072,6072,2016-05-15,Everton,Norwich,3,0,H,56,39,55,64,1.157895,0.894737,L,L,W,L,D,W,L,L,L,L,11.0,18.0,38.0,LLWLD,WLLLL,4,3,0,0,0,0,0,0,1,0,0.026316,-0.657895,0.263158,0.026316,-7.0
6073,6073,2016-05-15,Newcastle,Tottenham,5,1,H,39,68,64,30,0.894737,1.842105,D,W,D,D,W,L,D,D,W,W,15.0,5.0,38.0,DWDDW,LDDWW,9,8,0,0,0,0,0,0,0,0,-0.657895,1.0,-0.947368,0.026316,10.0
6070,6070,2016-05-15,Arsenal,Aston Villa,4,0,H,61,27,36,72,1.789474,0.447368,D,W,D,W,D,D,L,L,L,L,3.0,17.0,38.0,DWDWD,DLLLL,9,1,0,0,0,0,0,0,1,0,0.657895,-1.184211,1.342105,0.210526,-14.0
6075,6075,2016-05-15,Stoke,West Ham,2,1,H,39,64,54,49,1.263158,1.631579,L,D,L,L,L,W,L,W,W,D,9.0,12.0,38.0,LDLLL,WLWWD,1,10,0,0,1,0,0,0,0,0,-0.394737,0.394737,-0.368421,-0.236842,-3.0
6076,6076,2016-05-15,Swansea,Man City,1,1,NH,41,70,51,40,1.210526,1.710526,W,W,L,L,W,D,L,W,D,W,8.0,2.0,38.0,WWLLW,DLWDW,9,8,0,0,0,0,0,0,0,0,-0.263158,0.789474,-0.5,0.026316,6.0
6077,6077,2016-05-15,Watford,Sunderland,2,2,NH,38,46,48,60,1.157895,1.0,L,L,W,L,W,W,W,D,D,W,18.0,16.0,38.0,LLWLW,WWDDW,6,11,0,0,0,0,0,0,0,0,-0.263158,-0.368421,0.157895,-0.131579,2.0
6078,6078,2016-05-15,West Brom,Liverpool,1,1,NH,33,62,47,49,1.105263,1.552632,D,L,D,L,L,D,W,L,D,W,13.0,6.0,38.0,DLDLL,DWLDW,2,8,0,0,0,0,0,0,0,0,-0.368421,0.342105,-0.447368,-0.157895,7.0


In [None]:
data = pd.read_csv('final_dataset.csv')

# Convertir fechas
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

# Eliminar primeras 3 jornadas
data = data[data['MW'] > 3]

# Selección de variables
X = data.drop(['FTR', 'Date', 'MW', 'FTHG', 'FTAG', 'HTFormPtsStr', 'ATFormPtsStr'], axis=1, errors='ignore')
y = data['FTR']

#  Codificación de equipos
le_home = LabelEncoder()
le_away = LabelEncoder()

if 'HomeTeam' in X.columns:
    X['HomeTeam'] = le_home.fit_transform(X['HomeTeam'])

if 'AwayTeam' in X.columns:
    X['AwayTeam'] = le_away.fit_transform(X['AwayTeam'])

# Escalado de numéricas
scaler = StandardScaler()

columns_to_scale = ['HTGD', 'ATGD', 'HTP', 'ATP',
                    'DiffPts', 'DiffFormPts', 'DiffLP']

X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

# División train-test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Check final
print('Shape X:', X.shape)
print('Shape train:', X_train.shape, 'Shape test:', X_test.shape)
print('Target distribution in train:\n', y_train.value_counts(normalize=True))


Shape X: (5600, 36)
Shape train: (4480, 36) Shape test: (1120, 36)
Target distribution in train:
 FTR
NH    0.535268
H     0.464732
Name: proportion, dtype: float64


In [3]:
# Combinar features y target
train = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1)
test = pd.concat([X_test, y_test.reset_index(drop=True)], axis=1)

# Guardar en CSV
train.to_csv('train_preprocessed.csv', index=False)
test.to_csv('test_preprocessed.csv', index=False)

print("Archivos guardados correctamente como 'train_preprocessed.csv' y 'test_preprocessed.csv'")

Archivos guardados correctamente como 'train_preprocessed.csv' y 'test_preprocessed.csv'
