# In [None]:
# ===============================================================
# 🎯 Projet : Analyse marketing et segmentation client
# Auteur : Aïdan Bouaïcha
# Objectif : Reproduire le nettoyage d’Omar + analyse complète
# ===============================================================

# --- 1. Importation des librairies nécessaires -----------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Display configuration: never truncate DataFrame columns, use a 10x5 default
# figure size, and apply seaborn's "whitegrid" style.
pd.options.display.max_columns = None
plt.rcParams["figure.figsize"] = (10, 5)
sns.set(style="whitegrid")

# --- 2. Load the raw data and drop unused columns --------------
print("üì• Chargement du fichier brut...")
df = pd.read_csv("Camp_Market_Brut.csv", sep=";")

# Remove the columns the analysis does not use; names that are absent from
# the file are skipped instead of raising.
cols_to_drop = ["Z_CostContact", "Z_Revenue"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# Parse the customer date column day-first; values that cannot be parsed
# become NaT because of errors="coerce".
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], errors="coerce", dayfirst=True)

# --- Derived features -------------------------------------------
# Fixed reference date shared by every time-based feature below.
today = datetime(2025, 10, 16)

# Age uses the year of the shared reference date rather than a second
# hard-coded literal, so it cannot drift from DaySinceCustomer.
df["Age"] = today.year - df["Year_Birth"]

# Total amount spent across all product categories.
spend_columns = [
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds",
]
df["TotalSpent"] = df[spend_columns].sum(axis=1)

# Total number of purchases across all channels.
purchase_columns = [
    "NumDealsPurchases", "NumWebPurchases",
    "NumCatalogPurchases", "NumStorePurchases",
]
df["TotalPurchases"] = df[purchase_columns].sum(axis=1)

df["TotalKids"] = df["Kidhome"] + df["Teenhome"]

# Number of earlier campaigns (1 to 5) the customer accepted.
campaign_columns = [
    "AcceptedCmp1", "AcceptedCmp2",
    "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5",
]
df["AcceptedCmp_total"] = df[campaign_columns].sum(axis=1)

# Average spend per purchase, computed column-wise instead of with a
# row-by-row apply(). Rows without any purchase get a NaN denominator
# (so no division by zero happens) and are then set to 0, matching the
# original "else 0" branch. Unlike apply(axis=1), this also yields a proper
# column when the frame is empty.
purchase_counts = df["TotalPurchases"]
df["AvgSpentPerPurchase"] = (
    df["TotalSpent"] / purchase_counts.where(purchase_counts > 0)
).fillna(0)

# Whole days between the customer date and the reference date
# (NaN where the date could not be parsed).
df["DaySinceCustomer"] = (today - df["Dt_Customer"]).dt.days

# Sanity check: preview the first rows, then report the shape and column names.
print("‚úÖ Donn√©es nettoy√©es comme Omar :")
display(df.head())

dataset_shape = df.shape
print("\nüìè Dimensions du dataset :", dataset_shape)

column_names = df.columns.tolist()
print("\nüß± Colonnes disponibles :", column_names, sep="\n")

# --- 3. Exploratory data analysis -------------------------------
print("\nüìä Statistiques g√©n√©rales :")
display(df.describe())

# One histogram with a KDE overlay per entry:
# (column, bar colour, figure title, x-axis label).
histogram_specs = [
    ("Income", "skyblue", "Distribution du revenu des clients", "Revenu (‚Ç¨)"),
    ("Age", "orange", "Distribution de l'√¢ge des clients", "√Çge"),
]
for hist_column, hist_colour, hist_title, hist_xlabel in histogram_specs:
    sns.histplot(df[hist_column], bins=30, kde=True, color=hist_colour)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel("Nombre de clients")
    plt.show()

# Annotated correlation heatmap restricted to the numeric columns.
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    center=0,
    annot=True,
    fmt=".2f",
)
plt.title("Corr√©lation entre les variables num√©riques")
plt.show()

# --- 4. Campaign response analysis ------------------------------
# The whole section is skipped when the data has no "Response" column.
if "Response" in df.columns:
    print("\nüéØ Analyse de la r√©ponse aux campagnes :")

    # Mean of the response column expressed as a percentage.
    response_rate = 100 * df["Response"].mean()
    print(f"Taux de r√©ponse global : {response_rate:.2f}%")

    # One boxplot per entry: (y-axis column, colour palette, figure title).
    # hue mirrors x so the palette is applied per group; the legend is hidden.
    boxplot_specs = [
        ("Income", "Set2", "Revenu selon la r√©ponse √† la campagne"),
        ("Age", "Set3", "√Çge selon la r√©ponse √† la campagne"),
    ]
    for box_column, box_palette, box_title in boxplot_specs:
        sns.boxplot(
            data=df,
            x="Response",
            y=box_column,
            hue="Response",
            palette=box_palette,
            legend=False,
        )
        plt.title(box_title)
        plt.show()

# --- 5. Additional derived variables ----------------------------
print("\nüßÆ Cr√©ation de variables suppl√©mentaires :")

# Web purchases relative to web + store purchases. The +1 in the denominator
# avoids a zero denominator when both counts are 0.
if {"NumWebPurchases", "NumStorePurchases"}.issubset(df.columns):
    web_purchases = df["NumWebPurchases"]
    store_purchases = df["NumStorePurchases"]
    df["WebPurchaseRatio"] = web_purchases / (web_purchases + store_purchases + 1)

# NOTE: this confirmation is printed whether or not the guard above passed.
print("‚úÖ Variables ajout√©es : ['WebPurchaseRatio']")

# --- 6. Customer segmentation (K-Means clustering) --------------
print("\nüë• Segmentation client (K-Means) :")

features = ["Income", "Age", "Recency", "TotalSpent", "NumWebPurchases", "NumStorePurchases"]
# Only rows with a value for every feature are clustered, so X can have
# fewer rows than df.
X = df[features].dropna()

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia of a fitted model for each candidate k.
k_values = range(2, 8)
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

plt.plot(k_values, inertia, marker='o')
plt.title("M√©thode du coude pour d√©terminer le nombre optimal de clusters")
plt.xlabel("Nombre de clusters")
plt.ylabel("Inertie")
plt.show()

# Fit the final model with 4 clusters.
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# fit_predict returns one label per row of X. Assigning that array straight
# to df raises a length-mismatch ValueError as soon as dropna() removed a row.
# Wrapping the labels in a Series indexed like X lets pandas align them:
# rows that were not clustered get <NA>, and the nullable "Int64" dtype
# keeps the labels integral despite the gaps.
df["Cluster"] = pd.Series(cluster_labels, index=X.index, dtype="Int64")

# Plot only the rows that received a label, with plain integer labels for
# the hue.
clustered_rows = df.loc[X.index].astype({"Cluster": int})
sns.pairplot(clustered_rows, hue="Cluster", vars=["Income", "Age", "TotalSpent", "Recency"])
plt.suptitle("Visualisation des clusters clients", y=1.02)
plt.show()

# --- 7. Predictive modelling: customer response -----------------
print("\nü§ñ Mod√©lisation pr√©dictive :")

# Every numeric column is a feature, except the target itself and the
# cluster label (dropped only if present).
X = df.select_dtypes(include=[np.number]).drop(columns=["Response", "Cluster"], errors='ignore')
y = df["Response"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The feature matrix can contain NaN: DaySinceCustomer is NaN for coerced
# dates, and the clustering step had to dropna() its own features.
# scikit-learn releases without native NaN support in random forests reject
# such input, so fill the gaps explicitly. Medians come from the training
# rows only, so the held-out rows do not influence the imputation. X is
# filled as well because it is scored in full later in the script.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\nüìà Matrice de confusion :")
print(confusion_matrix(y_test, y_pred))

print("\nüìä Rapport de classification :")
print(classification_report(y_test, y_pred))

# --- 8. Export the results --------------------------------------
print("\nüíæ Exportation des r√©sultats :")

# Full dataset plus the model's prediction for every row of X
# (this includes the rows the model was trained on).
all_row_predictions = model.predict(X)
df["Predicted_Response"] = all_row_predictions
df.to_csv("Camp_Market_Resultats.csv", index=False)
print("‚úÖ Fichier 'Camp_Market_Resultats.csv' export√© avec succ√®s.")

# Per-cluster means of the profiling columns; the cluster label is kept as
# the CSV index column.
profile_columns = ["Income", "Age", "TotalSpent", "Recency"]
cluster_summary = df.groupby("Cluster")[profile_columns].mean()
cluster_summary.to_csv("Cluster_Stats.csv")
print("‚úÖ Fichier 'Cluster_Stats.csv' export√© avec succ√®s.")

# Held-out targets next to their predictions; the row index is not written.
results = pd.DataFrame(
    {
        "y_test": y_test,
        "y_pred": y_pred,
    }
)
results.to_csv("Predictions.csv", index=False)
print("‚úÖ Fichier 'Predictions.csv' export√© avec succ√®s.")

# --- 9. Conclusion ----------------------------------------------
# Fixed summary text; it is not computed from the results above.
conclusion_text = """
- Les campagnes marketing performent mieux chez les clients r√©cents et √† revenu moyen/√©lev√©.
- Le clustering a permis d'identifier plusieurs segments distincts selon le revenu et les habitudes d'achat.
- Le mod√®le pr√©dictif permet d'anticiper la r√©ponse client avec une pr√©cision satisfaisante.
- Ces insights peuvent orienter de futures campagnes cibl√©es.
"""
print("\nüß† Conclusion :")
print(conclusion_text)

