In [17]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


# 1. Chargement des fichiers

In [18]:
transactions_train = pd.read_csv("transactions_train.csv")
cards_data = pd.read_csv("cards_data.csv")
users_data = pd.read_csv("users_data.csv")

with open("train_fraud_labels.json", "r") as f:
    labels_json = json.load(f)
labels_dict = labels_json.get("target", labels_json)
labels_df = pd.DataFrame(list(labels_dict.items()), columns=["transaction_id", "fraud_label"])

with open("mcc_codes.json", "r") as f:
    mcc_codes = json.load(f)
mcc_df = pd.DataFrame(list(mcc_codes.items()), columns=["mcc", "mcc_description"])




# 2. Nettoyage de base

In [19]:
def clean_dollar(x):
    if isinstance(x, str):
        return float(x.replace("$", "").replace(",", ""))
    return x

# Transactions
transactions_train["amount"] = transactions_train["amount"].apply(clean_dollar)
transactions_train["zip"] = transactions_train["zip"].fillna(0).astype(int)
transactions_train["mcc"] = transactions_train["mcc"].astype(str)

# Labels
labels_df["transaction_id"] = labels_df["transaction_id"].astype(int)
labels_df["fraud_label"] = labels_df["fraud_label"].map({"Yes": 1, "No": 0})

# Cartes
cards_data["credit_limit"] = cards_data["credit_limit"].apply(clean_dollar)
cards_data["acct_open_date"] = pd.to_datetime(cards_data["acct_open_date"], errors="coerce")

# Utilisateurs
users_data["yearly_income"] = users_data["yearly_income"].apply(clean_dollar)
users_data["total_debt"] = users_data["total_debt"].apply(clean_dollar)
users_data["per_capita_income"] = users_data["per_capita_income"].apply(clean_dollar)


  cards_data["acct_open_date"] = pd.to_datetime(cards_data["acct_open_date"], errors="coerce")


# 3. Fusion des datasets

In [21]:
train_merged = transactions_train.merge(labels_df, on="transaction_id", how="left")

merged = (
    train_merged
    .merge(cards_data, left_on="card_id", right_on="id", how="left", suffixes=("", "_card"))
    .merge(users_data, left_on="client_id", right_on="id", how="left", suffixes=("", "_user"))
    .merge(mcc_df, on="mcc", how="left")
)

print("Fusion réussie :", merged.shape)

Fusion réussie : (210000, 41)


# 4. Préparation pour Machine Learning

In [22]:
# On supprime les colonnes inutiles ou non exploitables
drop_cols = [
    "transaction_id", "date", "merchant_city", "merchant_state",
    "address", "card_number", "acct_open_date", "id", "id_card", "id_user"
]
data = merged.drop(columns=[c for c in drop_cols if c in merged.columns], errors="ignore")

# Encodage des variables catégorielles
cat_cols = data.select_dtypes(include=["object"]).columns
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# Suppression des lignes sans label
data = data.dropna(subset=["fraud_label"])

# Séparation X / y
X = data.drop(columns=["fraud_label"])
y = data["fraud_label"].astype(int)

# Normalisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Dataset prêt pour entraînement")
print("X_train :", X_train.shape, "| y_train :", y_train.shape)
print("Taux de fraude dans le train :", round(y_train.mean()*100, 5), "%")


✅ Dataset prêt pour entraînement
X_train : (168000, 31) | y_train : (168000,)
Taux de fraude dans le train : 0.15 %


## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Entraînement ---
log_reg = LogisticRegression(
    class_weight='balanced',  # corrige le déséquilibre des classes
    max_iter=500,
    solver='liblinear',       # bon pour datasets déséquilibrés
    random_state=42
)

log_reg.fit(X_train, y_train)

# --- Prédictions ---
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# --- Évaluation ---
print("✅ Évaluation du modèle Logistique")
print("Matrice de confusion :\n", confusion_matrix(y_test, y_pred))
print("\nRapport de classification :\n", classification_report(y_test, y_pred, digits=4))
print("ROC-AUC :", round(roc_auc_score(y_test, y_proba), 5))

✅ Évaluation du modèle Logistique
Matrice de confusion :
 [[36301  5636]
 [   17    46]]

Rapport de classification :
               precision    recall  f1-score   support

           0     0.9995    0.8656    0.9278     41937
           1     0.0081    0.7302    0.0160        63

    accuracy                         0.8654     42000
   macro avg     0.5038    0.7979    0.4719     42000
weighted avg     0.9980    0.8654    0.9264     42000

ROC-AUC : 0.88729


## XGBoost Classifier

In [25]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Entraînement ---
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=50,  # car très peu de fraudes (~0.15%)
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# --- Évaluation ---
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("✅ Évaluation du modèle")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))
print("ROC-AUC :", round(roc_auc_score(y_test, y_proba), 5))


✅ Évaluation du modèle
[[41918    19]
 [   37    26]]
              precision    recall  f1-score   support

           0     0.9991    0.9995    0.9993     41937
           1     0.5778    0.4127    0.4815        63

    accuracy                         0.9987     42000
   macro avg     0.7884    0.7061    0.7404     42000
weighted avg     0.9985    0.9987    0.9986     42000

ROC-AUC : 0.96362


## CatBoost Classifier

In [26]:
#!pip install catboost

In [24]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Entraînement ---
cat_model = CatBoostClassifier(
    iterations=1000,            # nombre d'arbres
    depth=8,                    # profondeur de chaque arbre
    learning_rate=0.05,         # taux d'apprentissage
    loss_function='Logloss',
    eval_metric='AUC',
    class_weights=[1, 500],     # fort déséquilibre de classes (~0.15%)
    random_seed=42,
    verbose=200,                # affiche la progression
    task_type="CPU"             # passe à "GPU" si tu en as un
)

cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# --- Évaluation ---
y_pred = cat_model.predict(X_test)
y_proba = cat_model.predict_proba(X_test)[:, 1]

print("\n✅ Évaluation du modèle CatBoost")
print("Matrice de confusion :\n", confusion_matrix(y_test, y_pred))
print("\nRapport de classification :\n", classification_report(y_test, y_pred, digits=4))
print("ROC-AUC :", round(roc_auc_score(y_test, y_proba), 5))


0:	test: 0.9141318	best: 0.9141318 (0)	total: 81.6ms	remaining: 1m 21s
200:	test: 0.9664096	best: 0.9667975 (187)	total: 19.7s	remaining: 1m 18s
400:	test: 0.9710212	best: 0.9710352 (352)	total: 35.9s	remaining: 53.6s
600:	test: 0.9710212	best: 0.9710352 (352)	total: 49s	remaining: 32.5s
800:	test: 0.9710212	best: 0.9710352 (352)	total: 1m 2s	remaining: 15.5s
999:	test: 0.9710212	best: 0.9710352 (352)	total: 1m 15s	remaining: 0us

bestTest = 0.971035162
bestIteration = 352

Shrink model to first 353 iterations.

✅ Évaluation du modèle CatBoost
Matrice de confusion :
 [[41880    57]
 [   36    27]]

Rapport de classification :
               precision    recall  f1-score   support

           0     0.9991    0.9986    0.9989     41937
           1     0.3214    0.4286    0.3673        63

    accuracy                         0.9978     42000
   macro avg     0.6603    0.7136    0.6831     42000
weighted avg     0.9981    0.9978    0.9979     42000

ROC-AUC : 0.97104
