# In [None]:
# ===============================================================
# 🎯 Project: Marketing analysis and customer segmentation
# Authors: Aïdan Bouaïcha & Omar [Data Cleaning]
# Goal: Clean and analyse the marketing data
# ===============================================================

# --- 1. Import the required libraries ---------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence every warning category for the rest of the session. This also hides
# any deprecation notice a library may emit in the cells below.
warnings.simplefilter("ignore")

# Show all columns whenever a DataFrame is displayed.
pd.options.display.max_columns = None
# Default matplotlib figure size, (width, height) in inches.
plt.rc("figure", figsize=(10, 5))
sns.set(style="whitegrid")

# --- 2. Load the raw data ---------------------------------------
# Comma-separated file, resolved relative to the current working directory.
df = pd.read_csv("Camp_Market_Brut.csv", delimiter=",")
print("‚úÖ Donn√©es brutes charg√©es :", df.shape)
# `display` is not imported in this file; it is expected to come from the
# IPython/Jupyter session.
display(df.head(n=5))

# --- 3. Data cleaning (simplified version of Omar's code) -------
print("\nüßπ Nettoyage des donn√©es en cours...")

# Remove exact duplicate rows, keeping the first occurrence.
df = df.drop_duplicates(keep="first")

# Fill missing incomes with the median of the observed incomes.
income_median = df["Income"].median()
df["Income"] = df["Income"].fillna(income_median)

# Derive an 'Age' column from the birth year, relative to the fixed
# reference year 2025.
df["Age"] = 2025 - df["Year_Birth"]

# Discard inconsistent rows: keep only strictly positive incomes and ages
# strictly between 18 and 100 (both bounds excluded).
income_ok = df["Income"].gt(0)
age_ok = df["Age"].gt(18) & df["Age"].lt(100)
df = df.loc[income_ok & age_ok]

# Total amount spent per customer: row-wise sum over every column whose
# name contains "Mnt".
depenses = [name for name in df.columns if "Mnt" in name]
df["TotalSpent"] = df.loc[:, depenses].sum(axis="columns")

# Drop the columns considered useless for the analysis.
# errors="ignore" already skips any listed name that is absent from the frame.
cols_to_drop = ["Z_CostContact", "Z_Revenue", "Dt_Customer"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# Harmonise the categorical labels.
# Education: "2n Cycle" -> "Graduate"; "PhD" and "Master" -> "Postgraduate";
# "Basic" maps to itself.
# NOTE(review): any label not listed here (e.g. "Graduation", if the data
# contains it) is left unchanged and would coexist with "Graduate" — confirm
# this is intended.
education_labels = {
    "2n Cycle": "Graduate",
    "Basic": "Basic",
    "PhD": "Postgraduate",
    "Master": "Postgraduate",
}
df["Education"] = df["Education"].replace(education_labels)

# Marital status: "Together" counts as "Married"; the listed one-person
# labels are folded into "Single". Other labels are left unchanged.
marital_labels = {
    "Together": "Married",
    **dict.fromkeys(["Alone", "Absurd", "YOLO", "Widow"], "Single"),
}
df["Marital_Status"] = df["Marital_Status"].replace(marital_labels)

# Drop every row that still contains at least one missing value.
df = df.dropna(how="any")

# Export the cleaned data set (without the index column).
df.to_csv("Camp_Market_cleaned_omar.csv", index=False)
print("‚úÖ Donn√©es nettoy√©es et sauvegard√©es dans 'Camp_Market_cleaned_omar.csv'")
display(df.head(n=5))

# --- 4. Exploratory analysis and visualisation ------------------
print("\nüìä Analyse exploratoire :")

print("\nüìè Dimensions du dataset :", df.shape)
# Summary statistics of the cleaned frame.
display(df.describe())

# Income distribution: 30-bin histogram with a KDE curve overlaid.
ax = sns.histplot(df["Income"], bins=30, kde=True, color="skyblue")
ax.set_title("Distribution du revenu des clients")
ax.set_xlabel("Revenu (‚Ç¨)")
ax.set_ylabel("Nombre de clients")
plt.show()

# Age distribution, drawn the same way.
ax = sns.histplot(df["Age"], bins=30, kde=True, color="orange")
ax.set_title("Distribution de l'√¢ge des clients")
ax.set_xlabel("√Çge")
ax.set_ylabel("Nombre de clients")
plt.show()

# Correlation heatmap over the numeric columns only; the colour scale is
# centred on 0 and each cell is annotated with two decimals.
plt.figure(figsize=(12, 8))
corr_matrix = df.corr(numeric_only=True)
ax = sns.heatmap(corr_matrix, cmap="coolwarm", center=0, annot=True, fmt=".2f")
ax.set_title("Corr√©lation entre les variables num√©riques")
plt.show()

# --- 5. Campaign response analysis ------------------------------
# The whole section is skipped when the data has no "Response" column.
if "Response" in df:
    print("\nüéØ Analyse de la r√©ponse aux campagnes :")

    # Mean of the "Response" column, expressed as a percentage.
    response_rate = 100 * df["Response"].mean()
    print(f"Taux de r√©ponse global : {response_rate:.2f}%")

    # One boxplot per variable, split by response value: (column, palette, title).
    for column, palette, title in (
        ("Income", "Set2", "Revenu selon la r√©ponse √† la campagne"),
        ("Age", "Set3", "√Çge selon la r√©ponse √† la campagne"),
    ):
        sns.boxplot(
            x="Response",
            y=column,
            hue="Response",
            data=df,
            palette=palette,
            legend=False,
        )
        plt.title(title)
        plt.show()

# --- 6. Customer segmentation (clustering) ----------------------
print("\nüë• Segmentation client (K-Means) :")

# Columns fed to K-Means. Rows with a missing value in any of them are left
# out of the fit.
features = ["Income", "Age", "Recency", "TotalSpent", "NumWebPurchases", "NumStorePurchases"]
X = df[features].dropna()

# Standardise the features so that none dominates the distance computation
# merely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia for k = 2..7.
# n_init is set explicitly because scikit-learn's default changed between
# releases (10, then "auto" from 1.4); pinning it keeps results comparable
# across versions.
k_values = range(2, 8)
inertia = []
for k in k_values:
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(X_scaled)
    inertia.append(model.inertia_)

plt.plot(k_values, inertia, marker='o')
plt.title("M√©thode du coude pour d√©terminer le nombre de clusters")
plt.xlabel("Nombre de clusters")
plt.ylabel("Inertie")
plt.show()

# Final segmentation with a fixed number of clusters (4).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# Attach the labels through X's index instead of by position: the assignment
# stays correct (unlabelled rows get NaN) even if dropna() above removed rows,
# where a positional assignment would fail with a length mismatch.
df["Cluster"] = pd.Series(kmeans.fit_predict(X_scaled), index=X.index)

sns.pairplot(df, hue="Cluster", vars=["Income", "Age", "TotalSpent", "Recency"])
plt.suptitle("Visualisation des clusters clients", y=1.02)
plt.show()

# --- 7. Predictive modelling ------------------------------------
print("\nü§ñ Mod√©lisation pr√©dictive :")

# Predictors: every numeric column except the target ("Response") and the
# K-Means label ("Cluster").
# "ID" is also excluded when present: it is assumed to be a row identifier,
# which carries no generalisable signal — TODO confirm against the data set.
# errors='ignore' makes each of these drops a no-op if the column is absent.
X = df.select_dtypes(include=[np.number]).drop(
    columns=["Response", "Cluster", "ID"], errors='ignore'
)
y = df["Response"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each response class the same in the train and test parts, so the test
# metrics are not distorted by an unlucky split of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\nüìà Matrice de confusion :")
print(confusion_matrix(y_test, y_pred))

print("\nüìä Rapport de classification :")
print(classification_report(y_test, y_pred))

# --- 8. Export the results --------------------------------------
print("\nüíæ Exportation des r√©sultats :")

# Score every row of X (this includes the rows the model was trained on)
# and store the prediction next to the original data before exporting.
predicted_response = model.predict(X)
df["Predicted_Response"] = predicted_response
df.to_csv("Camp_Market_Resultats.csv", index=False)
print("‚úÖ 'Camp_Market_Resultats.csv' export√© avec succ√®s.")

# --- 9. Conclusion ----------------------------------------------
# The summary below is fixed text; it is not computed from the results above.
conclusion_text = """
- Le nettoyage des donn√©es a permis d‚Äôobtenir un jeu coh√©rent et pr√™t √† l‚Äôanalyse.
- Les clients √† revenu moyen/√©lev√© r√©pondent davantage aux campagnes.
- Le clustering distingue plusieurs segments de client√®le selon √¢ge, revenu et d√©penses.
- Le mod√®le pr√©dictif aide √† cibler efficacement les futurs clients.
"""
print("\nüß† Conclusion :")
print(conclusion_text)
