In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy.stats as stats
import os

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;">1. Data Exploration</p>
#### Start by loading and exploring the train.csv data to understand its structure, such as the available columns and their roles.

In [None]:
# Read the competition's training and test tables into DataFrames.
train_path = "/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv"
test_path = "/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Display the test set as loaded from test.csv.
df_test

In [None]:
# Preview the first rows of the training set.
df_train.head()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df_train.describe().T

In [None]:
# Same summary statistics as the previous cell, in the default (untransposed) layout.
df_train.describe()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training set.
df_train.info()

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df_train.isna().sum()

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;">2. Data visualization</p>

### A. Distribution of variables (Histograms)

In [None]:
# Histogram of num_wins_agent1 (30 bins) with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_train['num_wins_agent1'], kde=True, bins=30, ax=ax)
ax.set_title("Agent 1 Victory Distribution")
plt.show();

### B. Boxplot of the relationship with the target
#### The relationship between a variable and the target

In [None]:
# Box plot of num_wins_agent1 grouped by utility_agent1, drawn on a wide figure.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(data=df_train, x='utility_agent1', y='num_wins_agent1', ax=ax)
ax.set_title("Relationship between agent 1's utility and his victories")
# Rotate the x-axis tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show();

### C. Scatterplot

In [None]:
# Scatter plot of num_wins_agent1 against utility_agent1.
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(data=df_train, x='utility_agent1', y='num_wins_agent1', ax=ax)
ax.set_title("Relationship between agent 1's utility and his victories")
# Rotate the x-axis tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show();

### D. Pairplot 
#### the relationship between several variables

In [None]:
# Pairwise scatter plots and per-column distributions for three columns.
pair_columns = ['num_wins_agent1', 'num_losses_agent1', 'utility_agent1']
sns.pairplot(df_train[pair_columns])
plt.show();

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;">3. Data preprocessing</p>

In [None]:
# Replace every missing value with 0 so that later steps receive a frame
# without NaNs.
#
# The previous version of this cell followed fillna(0) with dropna() on rows
# and then on columns. Once every NaN has been filled, neither call can remove
# anything, so they were dead code (and the column-wise call was mislabelled
# as deleting rows). Only the fill is kept; the resulting frame is the same.
#
# Assigning the result instead of using inplace=True keeps the step explicit.
df_train = df_train.fillna(0)

# Sanity check: nothing is left to impute or drop.
assert not df_train.isna().any().any()


In [None]:
# Collect the names of all columns whose dtype is neither float nor int.
numeric_dtypes = ['float', 'int']
non_numeric_columns = df_train.select_dtypes(exclude=numeric_dtypes).columns
non_numeric_columns

In [None]:
from sklearn.impute import KNNImputer

# Run a 5-nearest-neighbour imputer over the float/int columns only and
# rebuild a DataFrame from the returned array.
numeric_columns = df_train.select_dtypes(include=['float', 'int']).columns
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_train[numeric_columns])
df_train_imputed = pd.DataFrame(imputed_values, columns=numeric_columns)
df_train_imputed

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert every non-numeric column to integer category codes.
#
# One encoder is fitted per column and kept in `label_encoders`, keyed by
# column name, so the same mapping can later be applied to other data.
# Re-fitting a single shared encoder would keep only the last column's mapping.
label_encoders = {}

for col in non_numeric_columns:
    label_encoder = LabelEncoder()
    values = df_train[col]
    # An object column may mix strings with the integer 0 inserted by an
    # earlier fillna(0); LabelEncoder rejects mixed types, so cast to str.
    # Non-object columns are left alone so that re-running this cell on
    # already-encoded integers keeps their order.
    if values.dtype == object:
        values = values.astype(str)
    df_train[col] = label_encoder.fit_transform(values)
    label_encoders[col] = label_encoder

# Preview all encoded columns rather than only the last one in the loop.
df_train[list(non_numeric_columns)].head()

In [None]:
# Apply KNN imputation to the whole (now fully numeric) frame.
# The result is stored in `df_train_imputed`; the cells visible below keep
# working on `df_train`, so this frame is informational unless used later.
imputer = KNNImputer(n_neighbors=5)
df_train_imputed = pd.DataFrame(
    imputer.fit_transform(df_train),
    columns=df_train.columns,
    # Keep the original row labels so the imputed frame stays aligned with
    # df_train even if rows were dropped or reordered earlier.
    index=df_train.index,
)
# Show a preview instead of dumping the entire frame.
df_train_imputed.head()

## <p style="font-family: Roboto; font-size: 130%; color: blue; margin-top: 0;">4. Normalization and standardization</p>

In [None]:
from sklearn.preprocessing import StandardScaler

# Select the numeric columns, excluding the regression target.
# Standardizing `utility_agent1` together with the features would put every
# later metric (MSE, MAE) and every prediction in standard-deviation units
# instead of the target's own scale, so the target is left untouched.
numeric_columns = df_train.select_dtypes(include=['float', 'int']).columns
numeric_columns = numeric_columns[numeric_columns != 'utility_agent1']

# Create a StandardScaler object
scaler = StandardScaler()

# Apply standardization to the numeric feature columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so held-out rows influence the scaling statistics; fitting on the
# training split only would avoid that.
df_train[numeric_columns] = scaler.fit_transform(df_train[numeric_columns])

# Display the first rows of the standardized DataFrame
df_train.head()

In [None]:
# Display the training frame after the preprocessing steps above.
df_train

In [None]:
# Summary statistics after preprocessing, one row per column.
df_train.describe().T

In [None]:
# List the column names currently present in the training frame.
df_train.columns

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Separate the explanatory variables (X) from the target (y).
TARGET = 'utility_agent1'

# Use only columns that also exist in the test set. Any train-only column
# (presumably the match-outcome counts such as num_wins_agent1 /
# num_losses_agent1 - confirm against test.csv) is unavailable at prediction
# time, and training on it leaks the target into the features.
feature_columns = [
    col for col in df_train.columns
    if col != TARGET and col in df_test.columns
]

X = df_train[feature_columns]
y = df_train[TARGET]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Build a 100-tree random forest regressor and fit it on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

 ## Calculate the Mean Squared Error (MSE)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict the held-out rows with the fitted random forest.
y_pred = rf_model.predict(X_test)

# Score the predictions with the mean squared error and report it.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


### Calculate and interpret additional performance metrics, namely the Mean Absolute Error (MAE) and the Coefficient of Determination (R²), alongside the Mean Squared Error (MSE), to evaluate the regression model.

## A : Calculate the Mean Absolute Error (MAE)

In [None]:
from sklearn.metrics import mean_absolute_error

# Compute the mean absolute error of the held-out predictions and report it.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error (MAE): {mae}")


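Interpretation: The MAE is the mean absolute difference between the actual and predicted values.
With an MAE of 0.00020986, the predictions differ from the actual values by about 0.0002 units on average. These are the units of `utility_agent1` as it was passed to the model, which means standard deviations rather than raw utility if the target went through the standardization step above.
Taken at face value this is a very small error. An error this close to zero is also a warning sign: if the features include match-outcome columns such as `num_wins_agent1` and `num_losses_agent1`, which presumably come from the same games the target summarizes, the model may be reconstructing the target rather than predicting it.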

## B: Calculate the Coefficient of Determination (R²)

In [None]:
from sklearn.metrics import r2_score

# Compute the coefficient of determination of the held-out predictions and report it.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Coefficient of Determination (R²): {r2}")

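Interpretation: The R² coefficient measures how much of the variance in the target is explained by the model.
An R² of 0.99997 means that 99.997% of the variance in the held-out target values is explained by the model.
This is a near-perfect score. On real data such a score more often indicates target leakage than a genuinely excellent model: if the features include columns such as `num_wins_agent1` and `num_losses_agent1`, which will not be known when predicting new matches, the score says little about true predictive performance and should be re-checked without those columns.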

## *check if the model performs well on test data and ensure that there is no overfitting*

In [None]:
# Import the necessary libraries
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Data preparation: separate the explanatory variables (X) from the target (y).
TARGET = 'utility_agent1'

# Use only columns that also exist in the test set. Any train-only column
# (presumably the match-outcome counts such as num_wins_agent1 /
# num_losses_agent1 - confirm against test.csv) is unavailable at prediction
# time, and keeping it would make an overfitting check meaningless.
feature_columns = [
    col for col in df_train.columns
    if col != TARGET and col in df_test.columns
]

X = df_train[feature_columns]
y = df_train[TARGET]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Standardize the features: learn the statistics from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Create an (unfitted) ordinary least-squares regressor and show its repr.
model = LinearRegression()
model

In [None]:
# 5-fold cross-validation on the scaled training data.
# scikit-learn reports the negated MSE for this scorer, so flip the sign of
# the fold scores before averaging to obtain a positive mean MSE.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_mean_score = (-cv_scores).mean()

In [None]:
# Report the mean cross-validated MSE computed in the previous cell.
print(f"Validation Croisée - Mean Squared Error (MSE) : {cv_mean_score}")

In [None]:
# Load the sample_submission.csv file provided by Kaggle
submission = pd.read_csv('/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv')

# Generate predictions.
# NOTE(review): these are placeholder random draws, not the output of the
# models trained above. A seeded generator makes the file reproducible.
rng = np.random.default_rng(42)
y_pred = rng.uniform(-1.0, 1.0, size=len(submission))

# Limit the predictions between -1 and 1 to meet the competition requirements
y_pred = np.clip(y_pred, -1.0, 1.0)

# Replace the 'utility_agent1' column with the predictions
submission['utility_agent1'] = y_pred

# Export the predictions as a real Parquet file. Writing CSV text into a file
# named 'submission.parquet' produces something no Parquet reader can open.
# Requires a Parquet engine (pyarrow or fastparquet) to be installed.
submission.to_parquet('submission.parquet', index=False)

print("Submission file successfully exported with the name 'submission.parquet'")