# Preprocesamiento

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Preprocessing cell: load the match dataset, build the feature matrix and
# target, encode team names, split into train/test, and standardise the
# numeric columns using statistics learned from the training rows only.
data = pd.read_csv('final_dataset.csv')

# Parse the date column (ISO format expected).
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

# Drop the first 3 matchweeks: keep only rows with MW > 3.
data = data[data['MW'] > 3]

# Feature selection: remove the target plus the columns listed below.
# errors='ignore' tolerates any of these columns being absent from the CSV.
X = data.drop(['FTR', 'Date', 'MW', 'FTHG', 'FTAG', 'HTFormPtsStr', 'ATFormPtsStr'], axis=1, errors='ignore')
y = data['FTR']

# Team encoding: map each team name to an integer code. Separate encoders
# are kept for the home and away columns so each can be inverted later.
le_home = LabelEncoder()
le_away = LabelEncoder()

if 'HomeTeam' in X.columns:
    X['HomeTeam'] = le_home.fit_transform(X['HomeTeam'])

if 'AwayTeam' in X.columns:
    X['AwayTeam'] = le_away.fit_transform(X['AwayTeam'])

# Numeric columns to standardise.
columns_to_scale = ['HTGD', 'ATGD', 'HTP', 'ATP',
                    'DiffPts', 'DiffFormPts', 'DiffLP']

# Train/test split. This happens BEFORE scaling so that the scaler never
# sees the held-out rows. With the same random_state and stratify=y the row
# assignment is identical to splitting after scaling.
# NOTE(review): this is a random stratified split; if rows are chronological
# matches, a time-based split may be more appropriate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below do not trigger
# pandas chained-assignment warnings on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform
# to the test rows. Fitting on the full data would leak test-set mean and
# variance into the training features.
scaler = StandardScaler()
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Keep X itself scaled (with the train-fitted scaler) so later cells that
# read X still receive standardised columns.
X[columns_to_scale] = scaler.transform(X[columns_to_scale])

# Final check
print('Shape X:', X.shape)
print('Shape train:', X_train.shape, 'Shape test:', X_test.shape)
print('Target distribution in train:\n', y_train.value_counts(normalize=True))


Shape X: (5600, 36)
Shape train: (4480, 36) Shape test: (1120, 36)
Target distribution in train:
 FTR
NH    0.535268
H     0.464732
Name: proportion, dtype: float64
