In [None]:
import pandas as pd
import numpy as np 
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy.stats as stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;">1. Data Exploration
#### Start by loading and exploring the train.csv data to understand its structure, such as available columns and their role.

In [None]:
# Read the competition's training and test tables from the Kaggle input folder
INPUT_DIR = "/kaggle/input/um-game-playing-strength-of-mcts-variants"
df_train = pd.read_csv(f"{INPUT_DIR}/train.csv")
df_test = pd.read_csv(f"{INPUT_DIR}/test.csv")

In [None]:
# Preview the first five rows of the training data
df_train.head()

In [None]:
# Summary statistics, transposed so that each described column is one row
df_train.describe().T

In [None]:
# Print a concise summary of the training frame (index, dtypes, memory usage)
df_train.info()

In [None]:
# Number of missing values in each column
df_train.isna().sum()

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;"> 2. Data visualization

### B. Scatterplot

In [None]:
# Scatter agent 1's number of wins against its utility
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(data=df_train, x='utility_agent1', y='num_wins_agent1', ax=ax)
ax.set_title("Relationship between agent 1's utility and his victories")
# Rotate the x-axis tick labels by 90 degrees
plt.xticks(rotation=90)
plt.show();

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;"> 3. Data preprocessing

In [None]:
# Replace missing values so that the encoding and modelling cells below receive
# a frame without NaNs. Numeric (and boolean) columns are filled with 0; every
# other column is filled with a string placeholder, because writing the integer
# 0 into a text column would mix ints and strings, and LabelEncoder rejects
# mixed-type input.
#
# The two dropna() calls that used to follow the fill were removed: once every
# NaN has been filled there is nothing left to drop, so they never had any
# effect (and the one labelled "rows" actually targeted columns via axis=1).
MISSING_LABEL = "missing"

numeric_fill_cols = set(df_train.select_dtypes(include=['number', 'bool']).columns)
fill_values = {
    col: 0 if col in numeric_fill_cols else MISSING_LABEL
    for col in df_train.columns
}
df_train = df_train.fillna(value=fill_values)


In [None]:
# Identifying non-numeric columns: every column whose dtype is neither float
# nor int. The label-encoding cell further down iterates over this Index.
non_numeric_columns = df_train.select_dtypes(exclude=['float', 'int']).columns
non_numeric_columns

In [None]:
from sklearn.impute import KNNImputer

# Impute the numeric columns with a 5-nearest-neighbour imputer; non-numeric
# columns are left out because the imputer only accepts numeric input.
# NOTE(review): the preprocessing cell above already replaces missing values,
# so there is presumably nothing left for the imputer to fill at this point.
numeric_columns = df_train.select_dtypes(include=['float', 'int']).columns
imputer = KNNImputer(n_neighbors=5)
df_train_imputed = pd.DataFrame(
    imputer.fit_transform(df_train[numeric_columns]),
    columns=numeric_columns,
    index=df_train.index,  # keep the row labels aligned with df_train
)
df_train_imputed

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert every non-numeric column to integer category codes.
for col in non_numeric_columns:
    # A fresh encoder per column keeps the fits independent of each other.
    # The cast to str guards against columns that mix strings with the 0
    # written by the earlier fillna(0): LabelEncoder raises a TypeError when
    # a column is not uniformly strings or numbers.
    label_encoder = LabelEncoder()
    df_train[col] = label_encoder.fit_transform(df_train[col].astype(str))

# Preview all encoded columns. This no longer depends on the loop variable,
# which is undefined when there are no non-numeric columns.
df_train[list(non_numeric_columns)].head()

In [None]:
# Apply imputation again now that the non-numeric columns hold numeric codes.
# NOTE(review): this runs over every column of df_train, including the target
# utility_agent1, and the modelling cell further down reads df_train rather
# than df_train_imputed; confirm whether this result is meant to be used.
imputer = KNNImputer(n_neighbors=5)
df_train_imputed = pd.DataFrame(
    imputer.fit_transform(df_train),
    columns=df_train.columns,
    index=df_train.index,  # keep the row labels aligned with df_train
)
df_train_imputed

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;"> 4. Normalization and standardization

In [None]:
# Encode any column that is still of object dtype, keeping one fitted encoder
# per column so the same mapping can be reused later.
categorical_columns = df_train.select_dtypes(include=['object']).columns
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder()
    df_train[column] = le.fit_transform(df_train[column])
    label_encoders[column] = le  # Store the encoder for future use

# Define the target and the feature matrix.
TARGET = 'utility_agent1'
# NOTE(review): the num_*_agent1 outcome counts look like the quantities the
# target is computed from (target leakage) and are presumably absent from
# test.csv; confirm against the competition's data description. They are
# excluded from the features; errors='ignore' keeps the cell working if a
# column does not exist.
LEAKY_COLUMNS = ['num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1']

y = df_train[TARGET]  # Target variable
X = df_train.drop(columns=[TARGET] + LEAKY_COLUMNS, errors='ignore')  # Features

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: fit the scaler on the training split only, then
# apply the same transformation to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the linear regression model
model = LinearRegression()

# 5-fold cross-validation on the training split.
# NOTE(review): the scaler was fitted on the whole training split before the
# folds are formed, so each fold's validation part influenced the scaling.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mean_score = -cv_scores.mean()  # Reverse the sign to get positive MSE

print(f"Cross-Validation - Mean Squared Error (MSE): {cv_mean_score}")

# Train the model on the training split
model.fit(X_train_scaled, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test_scaled)

# Evaluate model performance on the held-out split
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display the results
print(f"Test Set - Mean Squared Error (MSE): {mse}")
print(f"Test Set - Mean Absolute Error (MAE): {mae}")
print(f"Test Set - Coefficient of Determination (R²): {r2}")

# Save the held-out predictions next to the true values. This is CSV output,
# so it goes to a .csv file; it was previously written as CSV text into
# 'submission.parquet', which is not a valid parquet file and shares its name
# with the submission file written later in the notebook.
predictions_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
predictions_df.to_csv('validation_predictions.csv', index=False)

In [None]:
# Load the competition's sample submission and preview its first rows
sample_submission_path = '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
print(submission.head())

In [None]:
# Compare the size of the submission template with the number of predictions
n_submission_rows = len(submission)
n_predictions = len(y_pred)
print(f"Number of rows in submission file: {n_submission_rows}")
print(f"Number of predictions: {n_predictions}")


In [None]:
# Write the submission frame to 'submission.parquet' (without the index).
# NOTE(review): `submission` is the sample submission loaded above and is not
# modified anywhere in the visible cells, so the model's predictions are not
# in this file; confirm whether predictions on df_test should be filled in
# first.
submission.to_parquet('submission.parquet', index=False)
# The message below is French for: "The submission file was saved successfully
# under the name 'submission.parquet'".
print("Le fichier de soumission a été enregistré avec succès sous le nom 'submission.parquet'")


In [None]:
# Closing summary. The earlier wording described a RandomForest tuned with
# GridSearchCV, but the notebook fits a LinearRegression scored with
# cross_val_score, and the file written to 'submission.parquet' is the sample
# submission. The text now states what the cells actually do.
print("""
This notebook demonstrates a baseline pipeline for training a regression model using LinearRegression. 
We standardized the features and evaluated the model's performance through 5-fold cross-validation and a held-out validation split. 
Finally, the sample submission is saved in 'submission.parquet' format; the model's predictions on test.csv still have to be written into it before submitting to the Kaggle competition.
""")
