# In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pickle

# Location of the player dataset CSV (absolute path on the author's machine).
DATASET_PATH = r"C:\Users\madz\Documents\GitHub\appdev\playerperformance\datasets\players_15.csv"

# Columns that must be populated for a row to be kept: the target plus the
# categorical columns used below. The remaining feature columns are NOT
# checked here, so they may still contain missing values after this step.
REQUIRED_COLUMNS = ['overall', 'preferred_foot', 'work_rate', 'player_positions']

# Read the CSV and discard rows lacking any of the required columns.
df = pd.read_csv(DATASET_PATH).dropna(subset=REQUIRED_COLUMNS)

# Simplify player_positions
def simplify_position(pos):
    """Collapse a ``player_positions`` value into a coarse position group.

    Each group's position codes are tested with the ``in`` operator against
    *pos* (a substring test when *pos* is a string). Groups are tried in
    order, so a value containing both a defensive and a midfield code is
    reported as ``'DEF'``.

    Args:
        pos: The raw position value, e.g. ``"CDM, CM"``.
            Presumably a comma-separated string of position codes --
            verify against the dataset.

    Returns:
        ``'DEF'`` if any of CB/LB/RB/LWB/RWB/CDM is found, otherwise
        ``'MID'`` if any of CM/CAM/RM/LM is found, otherwise ``'ATT'``.

    NOTE(review): every value without a listed code falls through to
    ``'ATT'`` -- that includes goalkeepers (``'GK'``) and the empty string.
    Confirm this is intended before relying on the group for keepers.
    """
    # Order matters: the first group with a matching code wins.
    group_codes = (
        ('DEF', ('CB', 'LB', 'RB', 'LWB', 'RWB', 'CDM')),
        ('MID', ('CM', 'CAM', 'RM', 'LM')),
    )
    for group, codes in group_codes:
        for code in codes:
            if code in pos:
                return group
    # Nothing matched: everything else is treated as an attacker.
    return 'ATT'

# Derive the coarse position group (DEF / MID / ATT) from the raw positions.
df['player_position_group'] = df['player_positions'].apply(simplify_position)

# Encode categorical variables.
# One LabelEncoder per column, kept under its own name so the fitted
# category -> integer mapping of each column stays accessible afterwards.
le_foot = LabelEncoder()
le_work = LabelEncoder()
le_pos = LabelEncoder()

# (encoder, source column, encoded column) -- processed in this order so the
# new columns are appended to the frame in the same sequence as before.
_encoding_plan = (
    (le_foot, 'preferred_foot', 'preferred_foot_enc'),
    (le_work, 'work_rate', 'work_rate_enc'),
    (le_pos, 'player_position_group', 'position_group_enc'),
)
for _encoder, _source_col, _encoded_col in _encoding_plan:
    df[_encoded_col] = _encoder.fit_transform(df[_source_col])

# Define features and target.
# Model inputs: the player attribute columns taken straight from the CSV,
# followed by the integer-encoded categorical columns created above.
attribute_features = ['age', 'height_cm', 'weight_kg', 'pace', 'shooting',
                      'passing', 'dribbling', 'defending', 'physic']
encoded_features = ['preferred_foot_enc', 'work_rate_enc', 'position_group_enc']
features = attribute_features + encoded_features
target = 'overall'

X = df[features]
y = df[target]

# Train/test split: hold out 20% of the rows for evaluation. The fixed seed
# makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train XGBoost model.
# Hyper-parameters are fixed; the seed is pinned for reproducibility.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Evaluate on the held-out rows: RMSE is the square root of the mean
# squared error between the true and predicted 'overall' values.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
rmse = mse ** 0.5
print(f"📉 RMSE: {rmse:.2f}")

# Save model.
# NOTE(review): a pickled estimator is generally only safe to load with the
# same library versions that wrote it; the format is kept as-is here because
# existing consumers presumably load this .pkl file -- confirm before changing.
with open("xgboost_fifa_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model saved as 'xgboost_fifa_model.pkl'")

# Save the fitted label encoders next to the model. The model was trained on
# the integer codes these encoders produced; without them, inference code
# cannot reproduce the same category -> integer mapping for preferred_foot,
# work_rate and the position group.
encoders = {
    'preferred_foot': le_foot,
    'work_rate': le_work,
    'position_group': le_pos,
}
with open("xgboost_fifa_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("✅ Encoders saved as 'xgboost_fifa_encoders.pkl'")
