In [120]:
# importation de quelques librairies

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np

# importation des librairies pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [92]:
# Locations of the training and test CSV files (replace with your own paths)
test_path = '../data/data_cla/stars_test_new.csv'
train_path = '../data/data_cla/stars_train_new.csv'

# Load both files. The obj_ID column is removed from the training frame only;
# the test frame keeps it.
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(train_path).drop(columns='obj_ID')

## Analyse des données

In [None]:
# Preview of the first rows of the training data

df_train.head()

In [None]:
# Summary information about the training data

df_train.info()

In [None]:
# Descriptive statistics of the training data

df_train.describe()

In [None]:
# Shapes of the training and test data

print(df_train.shape, df_test.shape)

In [None]:
# Class distribution: histogram of the target column

class_labels = df_train['label']
class_labels.hist()
plt.xlabel('Class')
plt.title('Star Class')
plt.xticks([0, 1, 2])

In [None]:
# Correlation matrix of all columns of the training data

corr_matrix = df_train.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')

In [None]:
# Distribution of every column, with its mean and median marked

fig = plt.figure(figsize=(20, 20))

# Three plots per row. At least five rows are kept so the layout is the same
# 5x3 grid as before; more rows are added when the frame has more than 15
# columns, because add_subplot rejects an index beyond rows * cols.
n_grid_cols = 3
n_grid_rows = max(5, (len(df_train.columns) + n_grid_cols - 1) // n_grid_cols)

for i, column_name in enumerate(df_train.columns):
    values = df_train.iloc[:, i]
    fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(values, color='green', label=column_name)
    # show the mean and median
    plt.axvline(values.mean(), linestyle='dashed', color='red', label='mean')
    plt.axvline(values.median(), linestyle='dashed', color='blue', label='median')
    plt.legend()

## Séparation train-test

In [140]:
# Split the labelled data into a training part and a held-out part

y = df_train['label']
X = df_train.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid explored by the search
params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 5-fold grid search around a default random forest, fitted on the training split
rf = RandomForestClassifier()
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print(f"paramètres optimaux : {grid.best_params_}")

print(f"meilleur score : {grid.best_score_}")

# Predictions on the held-out split
y_preds = grid.predict(X_test)

# Confusion matrix of the held-out predictions
holdout_confusion = confusion_matrix(y_test, y_preds)
sns.heatmap(holdout_confusion, annot=True, cmap='coolwarm')

# Save the held-out results for streamlit
new_results = pd.DataFrame({'y_test': y_test, 'y_preds': y_preds})
new_results.to_csv('../résultats_models/classif/random_forest.csv', index=False)

# Submission: a new forest with the best parameters found, fitted on all the
# labelled data, then applied to the test file
rf = RandomForestClassifier(**grid.best_params_)
rf.fit(df_train.drop(columns='label'), df_train['label'])

y_preds = rf.predict(df_test.drop(columns='obj_ID'))

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'], 'label': y_preds})
submission.to_csv('../soumission/classif/random_forest.csv', index=False)

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

# Hyper-parameters shared by the evaluation model and the submission model
cat_settings = {'n_estimators': 3000, 'max_depth': 10, 'learning_rate': 0.01}

# Model fitted on the training split
cat = CatBoostClassifier(**cat_settings)
cat.fit(X_train, y_train, verbose=1)

# Predictions on the held-out split
y_preds = cat.predict(X_test)

# Confusion matrix of the held-out predictions
holdout_confusion = confusion_matrix(y_test, y_preds)
sns.heatmap(holdout_confusion, annot=True, cmap='coolwarm')

# Save the held-out results for streamlit (predictions flattened to 1-D)
new_results = pd.DataFrame({'y_test': y_test, 'y_preds': y_preds.ravel()})
new_results.to_csv('../résultats_models/classif/catboost.csv', index=False)

# Submission: a new model with the same settings, fitted on all the labelled
# data, then applied to the test file
cat = CatBoostClassifier(**cat_settings)
cat.fit(df_train.drop(columns='label'), df_train['label'], verbose=1)

y_preds = cat.predict(df_test.drop(columns='obj_ID'))

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'], 'label': y_preds.ravel()})
submission.to_csv('../soumission/classif/catboost.csv', index=False)


## Neural Network

In [167]:
# Held-out experiment: float features and integer (long) labels from the split
train_features = torch.from_numpy(X_train.values).float()
train_targets = torch.from_numpy(y_train.values).long()
trainset = TensorDataset(train_features, train_targets)

holdout_features = torch.from_numpy(X_test.values).float()
holdout_targets = torch.from_numpy(y_test.values).long()
testset = TensorDataset(holdout_features, holdout_targets)

# Mini-batches of 32; only the training batches are shuffled
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
testloader = DataLoader(testset, batch_size=32, shuffle=False)

# Submission experiment: all labelled rows for training, and the test file
# (without obj_ID, and without labels) for prediction
all_features = torch.from_numpy(df_train.drop(columns='label').values).float()
all_targets = torch.from_numpy(df_train['label'].values).long()
fullset = TensorDataset(all_features, all_targets)
fullset_loder = DataLoader(fullset, batch_size=32, shuffle=True)

submission_features = torch.from_numpy(df_test.drop(columns='obj_ID').values).float()
full_testset = TensorDataset(submission_features)

In [168]:
class NeuralNetClassifier(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input goes through a linear layer, a ReLU and a second linear layer.
    The output is the raw score of each class (no softmax is applied).

    Args:
        input_size: number of input features.
        hidden_size: number of units in the hidden layer.
        num_classes: number of output scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the class scores computed for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    

# Network (8 inputs, 5 hidden units, 3 classes), loss and optimiser
criterion = nn.CrossEntropyLoss()
model = NeuralNetClassifier(input_size=8, hidden_size=5, num_classes=3)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

In [169]:
# Train for 15 epochs on the training split and report, after every epoch,
# the accuracy obtained on the held-out split.
for epoch in range(15):
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

    # Evaluation pass: no gradients are needed here
    n_correct, n_samples = 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            predicted = model(inputs).argmax(dim=1)
            n_correct += (predicted == labels).sum().item()
            n_samples += labels.size(0)
    acc = 100.0 * n_correct / n_samples
    # The loss shown is the one of the last training batch of the epoch
    print(f'epoch {epoch+1}/{15}, loss={loss.item():.4f} accuracy = {acc}')

# Predictions on the held-out split, one sample at a time
with torch.no_grad():
    y_preds = [model(features).argmax(dim=0).item() for features, _ in testset]

# Save the held-out results
new_results = pd.DataFrame({'y_test': y_test, 'y_preds': y_preds})
new_results.to_csv('../résultats_models/classif/neural_net.csv', index=False)



epoch 1/15, loss=0.4296 accuracy = 74.55254703992657
epoch 2/15, loss=0.5321 accuracy = 74.39192290041304
epoch 3/15, loss=0.5732 accuracy = 75.47039926571821
epoch 4/15, loss=0.4148 accuracy = 77.89505889551782
epoch 5/15, loss=0.4128 accuracy = 86.87471317117944
epoch 6/15, loss=0.4916 accuracy = 89.76594768242313
epoch 7/15, loss=0.1054 accuracy = 91.00504818724185
epoch 8/15, loss=0.3282 accuracy = 93.4220590484932
epoch 9/15, loss=0.2149 accuracy = 93.55208811381368
epoch 10/15, loss=0.2386 accuracy = 94.00336545816124
epoch 11/15, loss=0.1587 accuracy = 93.90393146703381
epoch 12/15, loss=0.0789 accuracy = 94.70705216460149
epoch 13/15, loss=0.2625 accuracy = 94.85237876701851
epoch 14/15, loss=0.1731 accuracy = 94.72234970169802
epoch 15/15, loss=0.2286 accuracy = 95.12773443475601


In [171]:
# Final model for the submission: a fresh network trained on all labelled data.

# Single source of truth for the number of epochs: it drives both the loop and
# the progress message (the message used to print a fixed "/15" while the loop
# ran 30 epochs).
num_epochs = 30

# The network, loss and optimiser are created once (they used to be created
# twice in a row, the first set being discarded immediately).
model = NeuralNetClassifier(input_size=8, hidden_size=5, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(fullset_loder):
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # NOTE: testloader holds rows of the train/test split, which are also part
    # of the full training set used above, so this accuracy is a progress
    # indicator and not a held-out score.
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for inputs, labels in testloader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()
        acc = 100.0 * n_correct / n_samples
        # The loss shown is the one of the last training batch of the epoch
        print(f'epoch {epoch+1}/{num_epochs}, loss={loss.item():.4f} accuracy = {acc}')

# Predictions for the submission file, one sample at a time
# (each item of full_testset is a 1-tuple holding the feature vector)
with torch.no_grad():
    y_preds = []
    for inputs in full_testset:
        outputs = model(inputs[0])
        _, predicted = torch.max(outputs.data, 0)
        y_preds.append(predicted.item())

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'],
                            'label': y_preds})

submission.to_csv('../soumission/classif/neural_net.csv', index=False)

epoch 1/15, loss=0.8666 accuracy = 72.29616031818877
epoch 2/15, loss=0.9498 accuracy = 74.7590637907297
epoch 3/15, loss=0.4636 accuracy = 78.65228698179592
epoch 4/15, loss=0.3638 accuracy = 89.29172403243078
epoch 5/15, loss=0.2998 accuracy = 88.66452501147316
epoch 6/15, loss=0.1403 accuracy = 90.33960532354291
epoch 7/15, loss=0.1673 accuracy = 89.15404619856203
epoch 8/15, loss=0.3662 accuracy = 93.17729845494875
epoch 9/15, loss=0.5687 accuracy = 90.91326296466269
epoch 10/15, loss=0.0840 accuracy = 93.80449747590637
epoch 11/15, loss=0.2790 accuracy = 94.27107235735046
epoch 12/15, loss=0.3162 accuracy = 92.65718219366681
epoch 13/15, loss=0.2782 accuracy = 93.94217530977512
epoch 14/15, loss=0.2695 accuracy = 93.34557136301055
epoch 15/15, loss=0.1629 accuracy = 92.74896741624599
epoch 16/15, loss=0.4664 accuracy = 94.42404772831574
epoch 17/15, loss=0.0621 accuracy = 94.24047728315742
epoch 18/15, loss=0.7192 accuracy = 93.8427413186477
epoch 19/15, loss=0.0985 accuracy = 94.

## Normalisation des données

In [94]:
scaler = StandardScaler()

# Held-out experiment: the scaler is fitted on the training split and then
# applied to the held-out split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Submission experiment: the same scaler object is fitted again, this time on
# the features of all labelled rows, and applied to the test file (obj_ID removed).
labelled_features = df_train.drop(columns='label')
unlabelled_features = df_test.drop(columns='obj_ID')
df_train_scaled = scaler.fit_transform(labelled_features)
df_test_scaled = scaler.transform(unlabelled_features)

## Random Forest Classifier (données normalisées)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Hyper-parameter grid explored by the search

params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

# 5-fold grid search, fitted on the scaled training split
grid = GridSearchCV(rf, params, cv=5, n_jobs=-1, verbose=1)

grid.fit(X_train_scaled, y_train)

print(f"paramètres optimaux : {grid.best_params_}")

print(f"meilleur score : {grid.best_score_}")

# Predictions on the scaled held-out split

y_preds = grid.predict(X_test_scaled)

# Confusion matrix of the held-out predictions

sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, cmap='coolwarm')

# Save the held-out results for streamlit

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds})

new_results.to_csv('../résultats_models/classif/random_forest_scaled.csv', index=False)

# Submission: a new forest with the best parameters found, fitted on all the
# labelled (scaled) data.

rf = RandomForestClassifier(**grid.best_params_)

# df_train_scaled / df_test_scaled are the arrays returned by the scaler. They
# were built from the feature columns only ('label' and 'obj_ID' were dropped
# before scaling), so they are used as-is: calling .drop() or indexing them by
# column name would raise. Labels and ids come from the original frames.
rf.fit(df_train_scaled, df_train['label'])

y_preds = rf.predict(df_test_scaled)

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'],
                            'label': y_preds})

submission.to_csv('../soumission/classif/random_forest_scaled.csv', index=False)

In [None]:
from catboost import CatBoostClassifier

# Model definition

cat = CatBoostClassifier(n_estimators=3000, max_depth=10, learning_rate=0.01)

# Training on the scaled training split

cat.fit(X_train_scaled, y_train, verbose=1)

# Predictions on the scaled held-out split

y_preds = cat.predict(X_test_scaled)

# Confusion matrix of the held-out predictions

sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, cmap='coolwarm')

# Save the held-out results for streamlit (predictions flattened to 1-D)

new_results = pd.DataFrame({'y_test': y_test,
                            'y_preds': y_preds.ravel()})

new_results.to_csv('../résultats_models/classif/catboost_scaled.csv', index=False)

# Submission: a new model with the same settings, fitted on all the labelled
# (scaled) data.

cat = CatBoostClassifier(n_estimators=3000, max_depth=10, learning_rate=0.01)

# df_train_scaled / df_test_scaled are the arrays returned by the scaler. They
# were built from the feature columns only ('label' and 'obj_ID' were dropped
# before scaling), so they are used as-is: calling .drop() or indexing them by
# column name would raise. Labels and ids come from the original frames.
cat.fit(df_train_scaled, df_train['label'], verbose=1)

y_preds = cat.predict(df_test_scaled)

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'],
                            'label': y_preds.ravel()})

submission.to_csv('../soumission/classif/catboost_scaled.csv', index=False)


## Neural Network (données normalisées)

In [103]:
# Held-out experiment: scaled features (float) with integer (long) labels
train_features = torch.from_numpy(X_train_scaled).float()
train_targets = torch.from_numpy(y_train.values).long()
trainset = TensorDataset(train_features, train_targets)

holdout_features = torch.from_numpy(X_test_scaled).float()
holdout_targets = torch.from_numpy(y_test.values).long()
testset = TensorDataset(holdout_features, holdout_targets)

# Mini-batches of 32; only the training batches are shuffled
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
testloader = DataLoader(testset, batch_size=32, shuffle=False)

# Submission experiment: all labelled rows (scaled) for training, and the
# scaled test file (no labels) for prediction
all_features = torch.from_numpy(df_train_scaled).float()
all_targets = torch.from_numpy(df_train['label'].values).long()
fullset = TensorDataset(all_features, all_targets)
fullset_loder = DataLoader(fullset, batch_size=32, shuffle=True)

full_testset = TensorDataset(torch.from_numpy(df_test_scaled).float())

In [100]:
class NeuralNetClassifier(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input goes through a linear layer, a ReLU and a second linear layer.
    The output is the raw score of each class (no softmax is applied).

    Args:
        input_size: number of input features.
        hidden_size: number of units in the hidden layer.
        num_classes: number of output scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the class scores computed for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    

# Network (8 inputs, 5 hidden units, 3 classes), loss and optimiser
criterion = nn.CrossEntropyLoss()
model = NeuralNetClassifier(input_size=8, hidden_size=5, num_classes=3)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

In [118]:
# Train for 15 epochs on the scaled training split and report, after every
# epoch, the accuracy obtained on the scaled held-out split.
for epoch in range(15):
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

    # Evaluation pass: no gradients are needed here
    n_correct, n_samples = 0, 0
    with torch.no_grad():
        for inputs, labels in testloader:
            predicted = model(inputs).argmax(dim=1)
            n_correct += (predicted == labels).sum().item()
            n_samples += labels.size(0)
    acc = 100.0 * n_correct / n_samples
    # The loss shown is the one of the last training batch of the epoch
    print(f'epoch {epoch+1}/{15}, loss={loss.item():.4f} accuracy = {acc}')

# Predictions on the held-out split, one sample at a time
with torch.no_grad():
    y_preds = [model(features).argmax(dim=0).item() for features, _ in testset]

# Save the held-out results
new_results = pd.DataFrame({'y_test': y_test, 'y_preds': y_preds})
new_results.to_csv('../résultats_models/classif/neural_net_scaled.csv', index=False)



epoch 1/14, loss=0.2835 accuracy = 96.40507878231605
epoch 2/14, loss=0.0610 accuracy = 96.15266942022335
epoch 3/14, loss=0.0199 accuracy = 96.30564479118861
epoch 4/14, loss=0.0802 accuracy = 96.2674009484473
epoch 5/14, loss=0.0935 accuracy = 96.38213247667126
epoch 6/14, loss=0.2010 accuracy = 96.34388863392994
epoch 7/14, loss=0.2855 accuracy = 96.47391769925042
epoch 8/14, loss=0.1567 accuracy = 96.44332262505736
epoch 9/14, loss=0.1994 accuracy = 96.20621080006119
epoch 10/14, loss=0.0296 accuracy = 96.31329355973688
epoch 11/14, loss=0.1331 accuracy = 96.31329355973688
epoch 12/14, loss=0.1300 accuracy = 96.48921523634695
epoch 13/14, loss=0.0975 accuracy = 96.16796695731988
epoch 14/14, loss=0.0357 accuracy = 96.48921523634695
epoch 15/14, loss=0.1554 accuracy = 96.3285910968334


In [119]:
# Final model for the submission: a fresh network trained on all labelled
# (scaled) data.

# Single source of truth for the number of epochs: it drives both the loop and
# the progress message, so the two cannot drift apart.
num_epochs = 15

# The network, loss and optimiser are created once (they used to be created
# twice in a row, the first set being discarded immediately).
model = NeuralNetClassifier(input_size=8, hidden_size=5, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(fullset_loder):
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Accuracy on testloader, reported after each epoch as a progress indicator
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for inputs, labels in testloader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()
        acc = 100.0 * n_correct / n_samples
        # The loss shown is the one of the last training batch of the epoch
        print(f'epoch {epoch+1}/{num_epochs}, loss={loss.item():.4f} accuracy = {acc}')

# Predictions for the submission file, one sample at a time
# (each item of full_testset is a 1-tuple holding the feature vector)
with torch.no_grad():
    y_preds = []
    for inputs in full_testset:
        outputs = model(inputs[0])
        _, predicted = torch.max(outputs.data, 0)
        y_preds.append(predicted.item())

submission = pd.DataFrame({'obj_ID': df_test['obj_ID'],
                            'label': y_preds})

submission.to_csv('../soumission/classif/neural_net_scaled.csv', index=False)

epoch 1/100, loss=0.8471 accuracy = 80.79394217530978
epoch 2/100, loss=0.2325 accuracy = 91.90760287593697
epoch 3/100, loss=0.1077 accuracy = 93.38381520575187
epoch 4/100, loss=0.2807 accuracy = 94.18693590331956
epoch 5/100, loss=0.2731 accuracy = 94.50053541379837
epoch 6/100, loss=1.1107 accuracy = 95.02065167508032
epoch 7/100, loss=0.0281 accuracy = 95.42603640813829
epoch 8/100, loss=0.6470 accuracy = 95.4872265565244
epoch 9/100, loss=0.3604 accuracy = 95.45663148233135
epoch 10/100, loss=0.2084 accuracy = 95.61725562184488
epoch 11/100, loss=1.6747 accuracy = 95.60960685329663
epoch 12/100, loss=0.0129 accuracy = 95.51782163071745
epoch 13/100, loss=0.0660 accuracy = 95.57901177910357
epoch 14/100, loss=0.0382 accuracy = 95.78552852990668
epoch 15/100, loss=0.0585 accuracy = 95.5713630105553
