In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import IsolationForest, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score, confusion_matrix

In [2]:
# 1️⃣ Chargement des données
df = pd.read_csv("creditcard.csv")

  df = pd.read_csv("creditcard.csv")


In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
df.shape

(136742, 31)

In [5]:
# 2️⃣ Exploration des données
df.info() # Vérifier les types de données et les valeurs manquantes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136742 entries, 0 to 136741
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    136742 non-null  int64  
 1   V1      136742 non-null  float64
 2   V2      136742 non-null  float64
 3   V3      136742 non-null  float64
 4   V4      136742 non-null  float64
 5   V5      136742 non-null  float64
 6   V6      136742 non-null  float64
 7   V7      136742 non-null  float64
 8   V8      136742 non-null  float64
 9   V9      136742 non-null  float64
 10  V10     136742 non-null  float64
 11  V11     136742 non-null  float64
 12  V12     136742 non-null  float64
 13  V13     136742 non-null  float64
 14  V14     136742 non-null  float64
 15  V15     136742 non-null  float64
 16  V16     136742 non-null  float64
 17  V17     136742 non-null  float64
 18  V18     136742 non-null  float64
 19  V19     136742 non-null  float64
 20  V20     136742 non-null  float64
 21  V21     13

In [6]:
# 3️⃣ Prétraitement des données

#doublons
df.duplicated().sum()

509

In [7]:
# Supprimer les doublons
df = df.drop_duplicates()

In [8]:
#doublons
df.duplicated().sum()

0

In [9]:
#valeurs manquantes
df.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [10]:
# 3️⃣ Normalisation des variables Time et Amount
scaler = StandardScaler()
df[['Time', 'Amount']] = scaler.fit_transform(df[['Time', 'Amount']])

In [14]:
df = df.dropna(subset=['Class'])

In [15]:
# 4️⃣ Séparation des features et de la cible
X = df.drop(columns=["Class"])
y = df["Class"]

In [16]:
# 5️⃣ Gestion du déséquilibre des classes avec SMOTE
X_resampled, y_resampled = SMOTE(sampling_strategy=0.2, random_state=42).fit_resample(X, y)

In [17]:
from collections import Counter
# Vérifier la répartition après SMOTE
print("Répartition avant SMOTE :", Counter(y))
print("Répartition après SMOTE :", Counter(y_resampled))

Répartition avant SMOTE : Counter({0.0: 135974, 1.0: 258})
Répartition après SMOTE : Counter({0.0: 135974, 1.0: 27194})


In [18]:
# 6️⃣ Séparation Train/Test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [19]:
# 7️⃣ Initialisation des modèles
models = {
    'Isolation Forest': IsolationForest(n_estimators=100, contamination=0.01, random_state=42),
    'Logistic Regression': LogisticRegression(),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [26]:
# 8️⃣ Entraînement et Évaluation des Modèles
from sklearn.metrics import accuracy_score
results = []

for name, model in models.items():
    if name == "Isolation Forest":
        model.fit(X_train)
        y_pred = model.predict(X_test)
        y_pred = np.where(y_pred == -1, 1, 0)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Précision du modèle {name} : {accuracy:.2f}')
        print(classification_report(y_test, y_pred))
    else:
        X_train['V28'] = pd.to_numeric(X_train['V28'], errors='coerce')
        X_test['V28'] = pd.to_numeric(X_test['V28'], errors='coerce')

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Précision du modèle {name} : {accuracy:.2f}')
        print(classification_report(y_test, y_pred))

Précision du modèle Isolation Forest : 0.84
              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     27153
         1.0       0.96      0.05      0.10      5481

    accuracy                           0.84     32634
   macro avg       0.90      0.52      0.50     32634
weighted avg       0.86      0.84      0.78     32634

Précision du modèle Logistic Regression : 0.97
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     27153
         1.0       0.97      0.87      0.92      5481

    accuracy                           0.97     32634
   macro avg       0.97      0.93      0.95     32634
weighted avg       0.97      0.97      0.97     32634

Précision du modèle HistGradientBoosting : 1.00
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     27153
         1.0       1.00      1.00      1.00      5481

    accuracy                           1.00

Parameters: { "use_label_encoder" } are not used.



Précision du modèle XGBoost : 1.00
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     27153
         1.0       1.00      1.00      1.00      5481

    accuracy                           1.00     32634
   macro avg       1.00      1.00      1.00     32634
weighted avg       1.00      1.00      1.00     32634

