In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [2]:
# 1️⃣ Chargement des données
df = pd.read_csv("fraud_oracle.csv")

In [3]:
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [4]:
df.shape

(15420, 33)

In [5]:
# 2️⃣ Exploration des données
df.info() # Vérifier les types de données et les valeurs manquantes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [6]:
df.describe() # Statistiques générales

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0
mean,2.788586,2.693969,39.855707,0.059857,7710.5,8.483268,407.70428,2.487808,1994.866472
std,1.287585,1.259115,13.492377,0.23723,4451.514911,4.599948,43.950998,1.119453,0.803313
min,1.0,1.0,0.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,0.0,3855.75,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,0.0,7710.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,0.0,11565.25,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,1.0,15420.0,16.0,700.0,4.0,1996.0


In [7]:
# 3️⃣ Prétraitement des données

#doublons
df.duplicated().sum()

0

In [8]:
#valeurs manquantes
df.isnull().sum()

Unnamed: 0,0
Month,0
WeekOfMonth,0
DayOfWeek,0
Make,0
AccidentArea,0
DayOfWeekClaimed,0
MonthClaimed,0
WeekOfMonthClaimed,0
Sex,0
MaritalStatus,0


In [9]:
# Liste des variables catégorielles
categorical_cols = [
    "Month", "DayOfWeek", "Make", "AccidentArea", "DayOfWeekClaimed",
    "MonthClaimed", "Sex", "MaritalStatus", "Fault", "PolicyType",
    "VehicleCategory", "VehiclePrice", "Days_Policy_Accident",
    "Days_Policy_Claim", "PastNumberOfClaims", "AgeOfVehicle",
    "AgeOfPolicyHolder", "PoliceReportFiled", "WitnessPresent",
    "AgentType", "NumberOfSuppliments", "AddressChange_Claim",
    "NumberOfCars", "BasePolicy"
]

# One-Hot Encoding pour les variables nominales
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [10]:
df.head()

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Year,Month_Aug,...,AddressChange_Claim_2 to 3 years,AddressChange_Claim_4 to 8 years,AddressChange_Claim_no change,AddressChange_Claim_under 6 months,NumberOfCars_2 vehicles,NumberOfCars_3 to 4,NumberOfCars_5 to 8,NumberOfCars_more than 8,BasePolicy_Collision,BasePolicy_Liability
0,5,1,21,0,1,12,300,1,1994,False,...,False,False,False,False,False,True,False,False,False,True
1,3,4,34,0,2,15,400,4,1994,False,...,False,False,True,False,False,False,False,False,True,False
2,5,2,47,0,3,7,400,3,1994,False,...,False,False,True,False,False,False,False,False,True,False
3,2,1,65,0,4,4,400,2,1994,False,...,False,False,True,False,False,False,False,False,False,True
4,5,2,27,0,5,3,400,1,1994,False,...,False,False,True,False,False,False,False,False,True,False


In [11]:
# 4️⃣ Séparation des features et de la cible
X = df.drop(columns=["FraudFound_P"])
y = df["FraudFound_P"]

In [12]:
from imblearn.over_sampling import SMOTE
# 5️⃣ Gestion du déséquilibre des classes avec SMOTE
X_resampled, y_resampled = SMOTE(sampling_strategy=0.2, random_state=42).fit_resample(X, y)

In [13]:
from collections import Counter
# Vérifier la répartition après SMOTE
print("Répartition avant SMOTE :", Counter(y))
print("Répartition après SMOTE :", Counter(y_resampled))

Répartition avant SMOTE : Counter({0: 14497, 1: 923})
Répartition après SMOTE : Counter({0: 14497, 1: 2899})


In [14]:
# Standardisation des variables (nécessaire pour KNN et SVM)
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

In [15]:
# 6️⃣ Séparation Train/Test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [22]:
# Import des modèles
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Définition des modèles
algos = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVM': SVC(probability=True),
    'XGBClassifier': XGBClassifier(),
    'Adaboost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB()
}

In [23]:
# Entraînement et évaluation des modèles
results = {}


for name, model in algos.items():
    print(f"\n🔹 Entraînement du modèle : {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Calcul des métriques
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    report = classification_report(y_test, y_pred)
    results[name] = accuracy  # Store accuracy in the results dictionary
    print(report)


🔹 Entraînement du modèle : LogisticRegression
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      2906
           1       0.98      0.65      0.78       574

    accuracy                           0.94      3480
   macro avg       0.96      0.82      0.87      3480
weighted avg       0.94      0.94      0.93      3480


🔹 Entraînement du modèle : RandomForestClassifier
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      2906
           1       1.00      0.64      0.78       574

    accuracy                           0.94      3480
   macro avg       0.97      0.82      0.87      3480
weighted avg       0.94      0.94      0.93      3480


🔹 Entraînement du modèle : GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      2906
           1       0.99      0.64      0.77       574

    accuracy                 

In [26]:
sorted_results = dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
print("\nAccuracy des modèles (du plus précis au moins précis):")
results_df = pd.DataFrame(list(sorted_results.items()), columns=['Model', 'Accuracy'])
results_df



Accuracy des modèles (du plus précis au moins précis):


Unnamed: 0,Model,Accuracy
0,XGBClassifier,0.95546
1,ExtraTreesClassifier,0.940805
2,SVM,0.94023
3,LogisticRegression,0.939943
4,RandomForestClassifier,0.939943
5,GradientBoostingClassifier,0.938793
6,Adaboost,0.909195
7,KNeighborsClassifier,0.89569
8,NaiveBayes,0.304885
