In [1]:
# importation de quelques librairies

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

In [2]:
# chemin de nos données d'entrainement et de test

train_path = '../data/data_reg/wine_train.csv' # replace with your path
test_path = '../data/data_reg/wine_test.csv' # replace with your path

# chargement des données

df_train = pd.read_csv(train_path).drop('wine_ID', axis=1)
df_test = pd.read_csv(test_path)

## Analyse des données

In [3]:
# visualisation des données

df_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,wine_type,target
0,7.2,0.16,0.26,7.1,0.054,41.0,224.0,0.9966,3.38,0.55,10.1,0,5
1,7.3,0.22,0.31,2.3,0.018,45.0,80.0,0.98936,3.06,0.34,12.9,0,7
2,8.9,0.13,0.49,1.0,0.028,6.0,24.0,0.9926,2.91,0.32,9.9,0,5
3,6.0,0.17,0.29,9.7,0.044,33.0,98.0,0.99536,3.12,0.36,9.2,0,6
4,7.5,0.19,0.34,2.6,0.037,33.0,125.0,0.9923,3.1,0.49,11.1,0,7


In [None]:
# informations sur les données

df_train.info()

In [None]:
# description des données

df_train.describe()

In [None]:
# forme des données

print(df_train.shape, df_test.shape)

In [None]:
# Disrtibution des classes

df_train['target'].hist()
plt.title('Quality distribution')
plt.xlabel('Quality')

In [None]:
# Matrice de corrélation

plt.figure(figsize=(10, 10))
sns.heatmap(df_train.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')

In [None]:
# Distribution des variables

fig = plt.figure(figsize=(20, 20))
for i in range(len(df_train.columns)):
    fig.add_subplot(5, 3, i+1)
    sns.histplot(df_train.iloc[:, i], color='green', label=df_train.columns[i])
    # show the mean and median
    plt.axvline(df_train.iloc[:, i].mean(), linestyle='dashed', color='red', label='mean')
    plt.axvline(df_train.iloc[:, i].median(), linestyle='dashed', color='blue', label='median')
    plt.legend()

## Séparation train-test

In [4]:
# Séparation des données en train test

X = df_train.drop('target', axis=1)
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# Entrainement du modèle

model.fit(X_train, y_train)

# Prédiction sur les données de test

y_preds = model.predict(X_test)

# Evaluation du modèle

print(f"Le r2 vaut : {r2_score(y_test, y_preds)}")

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/linear_regression.csv', index=False)

# soumission des résultats

model = LinearRegression()

model.fit(X, y)

y_preds = model.predict(df_test.drop('wine_ID', axis=1))

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/linear_regression.csv', index=False)



## CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=10000, depth=10, learning_rate=0.1, loss_function='RMSE', eval_metric='R2', random_seed=42)

# Entrainement du modèle

model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=0)

# Prédiction sur les données de test

y_preds = model.predict(X_test)

# Evaluation du modèle

print(f'Le r2 vaut : {r2_score(y_test, y_preds)}')

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/catboost.csv', index=False)

# soumission des résultats

model = CatBoostRegressor(iterations=10000, depth=10, learning_rate=0.1, loss_function='RMSE', eval_metric='R2', random_seed=42)

model.fit(X, y, verbose=0)

y_preds = model.predict(df_test.drop('wine_ID', axis=1))

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/catboost.csv', index=False)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Définition de notre espaces de paramètres

params = {'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10, 15, 20],
            'min_samples_leaf': [1, 2, 5, 10, 15]}

# Création de notre modèle

model = RandomForestRegressor(random_state=42)

# Création de notre grille de recherche

grid = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)

# Entrainement de notre modèle

grid.fit(X_train, y_train)

# Affichage des meilleurs paramètres

print(grid.best_params_)
print(grid.best_score_)

# Prédiction sur les données de test

y_preds = grid.predict(X_test)

# Evaluation du modèle

print(f'Le r2 vaut : {r2_score(y_test, y_preds)}')

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/random_forest.csv', index=False)

# soumission des résultats

model = RandomForestRegressor(random_state=42)
model.set_params(**grid.best_params_)

model.fit(X, y)

y_preds = model.predict(df_test.drop('wine_ID', axis=1))

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/random_forest.csv', index=False)

## Neural Network

In [5]:
trainset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
testset = TensorDataset(torch.tensor(X_test.values, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.float32))

trainloader = DataLoader(trainset, batch_size=32, shuffle=True)

fullset = TensorDataset(torch.tensor(X.values, dtype=torch.float32), torch.tensor(y.values, dtype=torch.float32))
fullloader = DataLoader(fullset, batch_size=32, shuffle=True)

full_testset = TensorDataset(torch.tensor(df_test.drop('wine_ID', axis=1).values, dtype=torch.float32))

In [10]:
class NeuralNetworkRegressor(nn.Module):
    def __init__(self):
        super(NeuralNetworkRegressor, self).__init__()
        self.fc1 = nn.Linear(12, 6)
        self.fc2 = nn.Linear(6, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [11]:
model = NeuralNetworkRegressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 1000

for epoch in range(epochs):
    running_loss = 0
    for inputs, labels in trainloader:
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.view(-1), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    else:
        test_loss = 0
        accuracy = 0

    with torch.no_grad():
        model.eval()
        y_preds = []
        for inputs, labels in testset:
            outputs = model(inputs)
            y_preds.append(outputs.item())
    print(f"r2 : {r2_score(y_test, y_preds)}")


new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/neural_network.csv', index=False)

r2 : -6.790782354171641
r2 : -1.3876904013280407
r2 : -0.4537691809879181
r2 : -0.11414973950713247
r2 : 0.008500800546965337
r2 : 0.05780434831522718
r2 : 0.10949598764592106
r2 : 0.13903718976330492
r2 : 0.15699748173055839
r2 : 0.1483217385602147
r2 : 0.1776998590520894
r2 : 0.20958914178555588
r2 : 0.1590275784333739
r2 : 0.22613270562681953
r2 : 0.19869800479642608
r2 : 0.2246869062930812
r2 : 0.19012636080216572
r2 : 0.19183292092352566
r2 : 0.24565077270860025
r2 : 0.21808677621820638
r2 : 0.2386808983225175
r2 : 0.25051748713609456
r2 : 0.25580954008607093
r2 : 0.2546438711415089
r2 : 0.23062522802975882
r2 : 0.2506549427236383
r2 : 0.24285989181585121
r2 : 0.2650106324105176
r2 : 0.2566913490121274
r2 : 0.25041184665544747
r2 : 0.2403094111655527
r2 : 0.2678987926068852
r2 : 0.2633170306858298
r2 : 0.1309500644410806
r2 : 0.27317397908959795
r2 : 0.26388070150804377
r2 : 0.26793958383665684
r2 : 0.23433618236627052
r2 : 0.25866543857079694
r2 : 0.1301889240616111
r2 : 0.162421

In [12]:
model = NeuralNetworkRegressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 200

for epoch in range(epochs):
    running_loss = 0
    for inputs, labels in fullloader:
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.view(-1), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    else:
        test_loss = 0
        accuracy = 0

    print(f"for epoch {epoch} : {running_loss / len(trainloader)}")

with torch.no_grad():
    model.eval()
    y_preds = []
    for inputs in full_testset:
        outputs = model(inputs[0])
        y_preds.append(outputs.item())

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/neural_network.csv', index=False)

for epoch 0 : 226.27088533383665
for epoch 1 : 10.66029664400582
for epoch 2 : 7.673589160509199
for epoch 3 : 6.182035192150936
for epoch 4 : 4.756762839923395
for epoch 5 : 3.8318470003448915
for epoch 6 : 3.10766361138531
for epoch 7 : 2.526102371304949
for epoch 8 : 2.07741824377363
for epoch 9 : 1.7230226508924895
for epoch 10 : 1.4591853691038685
for epoch 11 : 1.2800033228976704
for epoch 12 : 1.1582391766187186
for epoch 13 : 1.0787443357093311
for epoch 14 : 1.025876410653658
for epoch 15 : 0.9907247468689891
for epoch 16 : 0.9700071831172872
for epoch 17 : 0.9523125168319061
for epoch 18 : 0.9490960089959831
for epoch 19 : 0.9260405065300309
for epoch 20 : 0.9140349537412696
for epoch 21 : 0.9182342622324685
for epoch 22 : 0.89075149433844
for epoch 23 : 0.8865977099565702
for epoch 24 : 0.8825172846005341
for epoch 25 : 0.870323657989502
for epoch 26 : 0.8557440256961039
for epoch 27 : 0.848980707821445
for epoch 28 : 0.8534191459695869
for epoch 29 : 0.8269616124507423
for 

## Normalisation des données

In [15]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

df_train_scaled = scaler.fit_transform(df_train.drop('target', axis=1))
df_test_scaled = scaler.transform(df_test.drop('wine_ID', axis=1))

## Linear Regression

In [None]:
model = LinearRegression()

# Entrainement du modèle

model.fit(X_train_scaled, y_train)

# Prédiction sur les données de test

y_preds = model.predict(X_test_scaled)

# Evaluation du modèle

print(f"Le r2 vaut : {r2_score(y_test, y_preds)}")

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/linear_regression_scaled.csv', index=False)

# soumission des résultats

model = LinearRegression()

model.fit(df_train_scaled, y)

y_preds = model.predict(df_test_scaled)

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/linear_regression_scaled.csv', index=False)

## CatBoost Regressor

In [None]:
model = CatBoostRegressor(iterations=10000, depth=10, learning_rate=0.1, loss_function='RMSE', eval_metric='R2', random_seed=42)

# Entrainement du modèle

model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), verbose=0)

# Prédiction sur les données de test

y_preds = model.predict(X_test_scaled)

# Evaluation du modèle

print(f'Le r2 vaut : {r2_score(y_test, y_preds)}')

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/catboost_scaled.csv', index=False)

# soumission des résultats

model = CatBoostRegressor(iterations=10000, depth=10, learning_rate=0.1, loss_function='RMSE', eval_metric='R2', random_seed=42)

model.fit(df_train_scaled, y, verbose=0)

y_preds = model.predict(df_test_scaled)

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/catboost_scaled.csv', index=False)

## Random Forest Regressor

In [None]:
model = RandomForestRegressor(random_state=42)

# Définition de notre espaces de paramètres

params = {'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_samples_split': [2, 5, 10, 15, 20],
            'min_samples_leaf': [1, 2, 5, 10, 15]}

# Création de notre grille de recherche

grid = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)

# Entrainement de notre modèle

grid.fit(X_train_scaled, y_train)

# Affichage des meilleurs paramètres

print(grid.best_params_)
print(grid.best_score_)

# Prédiction sur les données de test

y_preds = grid.predict(X_test_scaled)

# Evaluation du modèle

print(f'Le r2 vaut : {r2_score(y_test, y_preds)}')

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/regression/random_forest_scaled.csv', index=False)

# soumission des résultats

model = RandomForestRegressor(random_state=42)
model.set_params(**grid.best_params_)

model.fit(df_train_scaled, y)

y_preds = model.predict(df_test_scaled)

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds})

submission.to_csv('../soumission/regression/random_forest_scaled.csv', index=False)

## Comme un problème de classification

In [24]:

from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.1,  random_seed=42)

# Entrainement du modèle

model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), verbose=10)

# Prédiction sur les données de test

y_preds = model.predict(X_test_scaled)

# Evaluation du modèle

print(f'Le r2 vaut : {r2_score(y_test, y_preds)}')

# Sauvegarde des résultats pour streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds.ravel()})

new_results.to_csv('../résultats_models/regression/catboost_cla_scaled.csv', index=False)

# soumission des résultats

model = CatBoostClassifier(iterations=500, depth=10, learning_rate=0.1, random_seed=42)

model.fit(df_train_scaled, y, verbose=0)

y_preds = model.predict(df_test_scaled)

submission = pd.DataFrame({'wine_ID': df_test['wine_ID'],
                            'target': y_preds.ravel()})

submission.to_csv('../soumission/regression/catboost_cla_scaled.csv', index=False)

0:	learn: 1.8127186	test: 1.8109653	best: 1.8109653 (0)	total: 26.5ms	remaining: 13.2s
10:	learn: 1.2483650	test: 1.2763464	best: 1.2763464 (10)	total: 259ms	remaining: 11.5s
20:	learn: 1.0474611	test: 1.1227839	best: 1.1227839 (20)	total: 490ms	remaining: 11.2s
30:	learn: 0.9365666	test: 1.0505099	best: 1.0505099 (30)	total: 774ms	remaining: 11.7s
40:	learn: 0.8613896	test: 1.0110389	best: 1.0110389 (40)	total: 1s	remaining: 11.2s
50:	learn: 0.8054991	test: 0.9846950	best: 0.9846950 (50)	total: 1.24s	remaining: 10.9s
60:	learn: 0.7577790	test: 0.9680404	best: 0.9680404 (60)	total: 1.47s	remaining: 10.6s
70:	learn: 0.7157625	test: 0.9557640	best: 0.9557640 (70)	total: 1.71s	remaining: 10.3s
80:	learn: 0.6745259	test: 0.9459211	best: 0.9459211 (80)	total: 1.94s	remaining: 10s
90:	learn: 0.6385105	test: 0.9360519	best: 0.9360519 (90)	total: 2.18s	remaining: 9.8s
100:	learn: 0.6075732	test: 0.9316307	best: 0.9316307 (100)	total: 2.41s	remaining: 9.51s
110:	learn: 0.5815233	test: 0.9257934