In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

### Import des données

In [2]:
# Import des données
df_oz = pd.read_csv('../Data-20241001/ozone_complet.txt', delimiter=';')
df_oz = df_oz.drop("maxO3v", axis=1).dropna()
df_oz.head()

Unnamed: 0,maxO3,T6,T9,T12,T15,T18,Ne6,Ne9,Ne12,Ne15,...,Vvit6,Vdir9,Vvit9,Vdir12,Vvit12,Vdir15,Vvit15,Vdir18,Vvit18,Vx
19950401,47.6,10.1,11.6,13.3,13.6,12.2,8.0,8.0,8.0,8.0,...,2.0,290.0,4.0,300.0,4.0,340.0,4.0,20.0,4.0,-3.4641
19950402,56.2,9.5,9.4,13.8,17.4,16.3,8.0,8.0,7.0,0.0,...,2.0,160.0,2.0,180.0,3.0,110.0,1.0,350.0,2.0,0.0
19950403,61.8,3.6,8.0,16.8,21.5,20.2,4.0,5.0,2.0,2.0,...,2.0,20.0,2.0,340.0,1.0,170.0,2.0,170.0,3.0,-0.342
19950404,50.8,9.5,10.5,11.4,12.2,11.4,8.0,7.0,7.0,7.0,...,3.0,10.0,4.0,350.0,3.0,350.0,3.0,350.0,4.0,-0.5209
19950405,59.8,9.8,10.8,13.8,14.3,13.3,8.0,7.0,8.0,8.0,...,2.0,340.0,2.0,280.0,1.0,320.0,3.0,350.0,4.0,-0.9848


In [3]:
def train_test_split_fr(df, target, sizetrain):
    n = int(len(df)*sizetrain)
    train = df[:n]
    test = df[n:]

    X_train = train.drop(target, axis=1)
    Y_train = train[target].copy()
    X_test = test.drop(target,axis=1)
    Y_test = test[target].copy()
    return X_train, Y_train, X_test, Y_test

In [4]:
X_train, y_train, X_test, y_test = train_test_split_fr(df_oz, "maxO3", 0.8)
X_train.T15

19950401    13.6
19950402    17.4
19950403    21.5
19950404    12.2
19950405    14.3
            ... 
20010615    19.4
20010616    18.0
20010617    16.6
20010618    21.5
20010620    26.9
Name: T15, Length: 1108, dtype: float64

### Création de la classe from Scratch de SVM Régression

In [5]:
# Initialisation de la classe SVM Régression from Scratch
class SVR_from_Scratch:
    def __init__(self, learning_rate, lambda_param, n_iters, epsilon):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.epsilon = epsilon
        self.w = None
        self.b = None

    def fit_fr(self, X, y):
        # Nombre d'échantillons et de caractéristiques
        n_samples, n_features = X.shape

        # Initialise les poids
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # Vérifie si l'échantillon dépasse la marge epsilon
                condition = abs(np.dot(x_i, self.w) - self.b - y[idx]) >= self.epsilon
                if condition:
                    if np.dot(x_i, self.w) - self.b < y[idx]:
                        # Mise à jour des poids et biais pour le cas d'erreur positive
                        self.w -= self.lr * (2 * self.lambda_param * self.w - x_i)
                        self.b -= self.lr * (-1)
                    else:
                        # Mise à jour des poids et biais pour le cas d'erreur négative
                        self.w -= self.lr * (2 * self.lambda_param * self.w + x_i)
                        self.b -= self.lr * (1)

    def predict_fr(self, X):
        return np.dot(X, self.w) + self.b

### Test From Scratch

In [6]:
# Test From Scratch
from sklearn.model_selection import train_test_split

# Récupérer les données
X = df_oz.drop(columns=['maxO3']).values  # Toutes les colonnes sauf 'maxO3'
y = df_oz['maxO3'].values  # Colonne cible 'maxO3'
    
# Split des jeux de données    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

start_time = time.time()
   
# Initialiser et entraîner le modèle SVR
svr_fr = SVR_from_Scratch(learning_rate=0.0001, lambda_param=0.001, n_iters=500, epsilon=0.001)

# Entraîner le modèle sur l'ensemble des features
svr_fr.fit_fr(X_train, y_train)

tot_time_fr = time.time() - start_time

# Prédictions sur l'ensemble de test
y_pred_fr = svr_fr.predict_fr(X_test)

### Test Scikit-Learn

In [7]:
# Comparaison avec scikit-learn
from sklearn.svm import SVR 
from sklearn.preprocessing import StandardScaler

# Normaliser
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)

start_time = time.time()
# create an SVR model with a linear kernel 
svr_sl = SVR(kernel="linear") 

# train the model on the data 
svr_sl.fit(X_train_scaler, y_train)

tot_time_sl = time.time() - start_time

# make predictions on the data 
y_pred_sl = svr_sl.predict(X_test_scaler)

### Evaluations

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

# Calcul de la MAE, MSE, RMSE pour From Scratch et Scikit
mae_fr = mean_absolute_error(y_test, y_pred_fr)
mse_fr = mean_squared_error(y_test, y_pred_fr)
rmse_fr = math.sqrt(mse_fr)

# Calcul de la MAE, MSE, RMSE pour Scikit
mae_sl = mean_absolute_error(y_test, y_pred_sl)
mse_sl = mean_squared_error(y_test, y_pred_sl)
rmse_sl = math.sqrt(mse_sl)

# Affichage des résulats performances et temps d'exécution
print(f"Temps d'execution scikit-learn : {tot_time_sl:.4f}s - MAE scikit-learn : {mae_sl:.4f} - MSE scikit-learn : {mse_sl:.4f} - RMSE scikit-learn : {rmse_sl:.4f}")
print(f"Temps d'execution From Scratch : {tot_time_fr:.4f}s - MAE from Scratch: {mae_fr:.4f} - MSE from Scratch: {mse_fr:.4f} - RMSE from Scratch: {rmse_fr:.4f}")

Temps d'execution scikit-learn : 0.0769s - MAE scikit-learn : 13.0513 - MSE scikit-learn : 283.4976 - RMSE scikit-learn : 16.8374
Temps d'execution From Scratch : 5.7005s - MAE from Scratch: 14.3970 - MSE from Scratch: 356.1972 - RMSE from Scratch: 18.8732
