In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score
from sklearn.metrics import roc_curve, roc_auc_score,auc, confusion_matrix
import xgboost as xgb

In [None]:
# Read the training and test CSV files from the local data/ directory.
data_train = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/test.csv")

In [None]:
# Work on an independent (deep) copy so that data_test itself is left untouched.
X_test = data_test.copy(deep=True)

In [None]:
# Display the column labels of the training frame.
data_train.columns

In [None]:
# Columns used for the correlation matrix and the IQR outlier counts below.
# The target 'Exited' is part of the list.
numeric_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'Exited',
]

In [None]:
# Display the dtype pandas inferred for every column of the training frame.
data_train.dtypes

In [None]:
# Columns whose distinct values are printed below.
columns_of_interest = [
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'Age',
]

# Print the distinct values observed in each of those columns.
for column in columns_of_interest:
    unique_values = pd.unique(data_train[column])
    print(f"Unique values in '{column}': {unique_values}")

In [None]:
# Remove the variables that are not needed.
# Note: this rebinds data_train, so re-running the cell raises KeyError (columns already gone).
data_train = data_train.drop(["id", "CustomerId", "Surname"], axis=1)


In [None]:
# Same column removal as for the training frame, applied to the test features.
X_test = X_test.drop(["id", "CustomerId", "Surname"], axis=1)

# Correlation

In [None]:
# Pairwise correlations between the columns listed in numeric_cols (target included).
corr_matrix = data_train[numeric_cols].corr()

# Hide the upper triangle (diagonal included) by POSITION.
# Building the mask from the correlation values themselves would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
upper_triangle_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, cmap='Greens', linewidths=1, mask=upper_triangle_mask, fmt='.2f', annot=True)
plt.title('Matrice de Corrélation')
plt.show()

In [None]:
# Variables ranked by their correlation with the target, highest value first.
top_corr_with_target = (
    corr_matrix["Exited"]
    .sort_values(ascending=False)
    .to_frame()
)
# Skip the first row: presumably 'Exited' itself (self-correlation of 1).
top_corr_with_target.iloc[1:]

# Doublons

In [None]:
# Check for duplicates: flag every row that repeats an earlier row, then count the flags.
doublons = data_train.duplicated(keep="first")
nb_doublons = doublons.sum()
print(f"Nombre de doublons : {nb_doublons}")

# NA

In [None]:
# True as soon as a single cell of the training frame is missing.
has_na = data_train.isna().to_numpy().any()
print("Any NA values in the dataset:", has_na)

Nous divisons notre base de données en ensembles d'entraînement (train) et de test avant de réaliser toutes les manipulations, afin d'éviter de biaiser les résultats sur l'ensemble de test.
(page 274 cours ML: Model Selection (hold out a validation set))

In [None]:
# Hold out 20% of the labelled rows as a validation set; the seed is fixed for reproducibility.
train_selection, test_selection = train_test_split(
    data_train,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Outliers

In [None]:
# For the training set: count, per column, the values falling outside the
# 1.5 * IQR fences.
for col in numeric_cols:
    values = train_selection[col]

    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    outside_fences = (values < lower_bound) | (values > upper_bound)
    number_outliers = int(outside_fences.sum())

    print("Le nombre des outliers de la variable", col, "est", number_outliers)


In [None]:
# Distribution of CreditScore for each value of the target.
ax = sns.boxplot(data=train_selection, x="Exited", y="CreditScore")
ax.set_title('CreditScore')

In [None]:
# List the training rows whose CreditScore lies outside the 1.5 * IQR fences.
train_credit_scores = train_selection["CreditScore"]
Q1, Q3 = train_credit_scores.quantile(0.25), train_credit_scores.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outliers = train_selection.loc[
    (train_credit_scores < lower_bound) | (train_credit_scores > upper_bound)
]
print(f"Outliers for CreditScore:")
print(outliers[["CreditScore"]])  # only the column of interest, for clarity
print("\n" + "-" * 50 + "\n")  # visual separator

Given that credit scores typically don't exceed 850, the value 4818 is clearly an anomaly (an aberrant point, "point aberrant").

Scores in the 431–439 range are typically valid but represent very poor creditworthiness. They should usually be kept in the dataset unless there is a specific reason to filter them out.

In [None]:
# Remove aberrant CreditScore values: keep only the training rows with a score of at most 1000.
train_selection = train_selection.loc[train_selection['CreditScore'].le(1000)]

In [None]:
# For the test set: keep every row. The submission built at the end of the notebook
# pairs the predictions with all ids of data_test, so dropping rows here would leave
# the prediction vector shorter than the id column.
# Aberrant credit scores (> 1000) are instead replaced with the median of the
# training split.
credit_score_is_aberrant = X_test['CreditScore'] > 1000
X_test = X_test.assign(
    CreditScore=X_test['CreditScore'].mask(
        credit_score_is_aberrant,
        train_selection['CreditScore'].median(),
    )
)

In [None]:
# For the validation set: list the rows whose CreditScore lies outside the
# 1.5 * IQR fences computed on the validation split itself.
val_credit_scores = test_selection["CreditScore"]
Q1, Q3 = val_credit_scores.quantile(0.25), val_credit_scores.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
outliers = test_selection.loc[
    (val_credit_scores < lower_bound) | (val_credit_scores > upper_bound)
]
print(f"Outliers for CreditScore:")
print(outliers[["CreditScore"]])  # only the column of interest, for clarity
print("\n" + "-" * 50 + "\n")  # visual separator

On ne voit pas de problème ici.

# Encoding

In [None]:
# One-hot encode the categorical variables of the training and validation splits.
# The membership filter makes the encoding a no-op when the raw columns are already gone.
categorical_columns = [col for col in ('Geography', 'Gender') if col in train_selection]
train_selection = pd.get_dummies(train_selection, columns=categorical_columns, drop_first=True)
test_selection = pd.get_dummies(test_selection, columns=categorical_columns, drop_first=True)

# Cast the generated indicator columns to integers.
binary_columns = ['Geography_Germany','Geography_Spain', 'Gender_Male']
to_int = {col: int for col in binary_columns}
train_selection = train_selection.astype(to_int)
test_selection = test_selection.astype(to_int)

In [None]:
# One-hot encode the categorical variables of the test features.
categorical_columns = [col for col in ('Geography', 'Gender') if col in X_test]
X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

# Cast the indicator columns to integers (binary_columns comes from the previous cell).
X_test = X_test.astype({col: int for col in binary_columns})


In [None]:
# Target (1 = churn, 0 = no churn) and explanatory variables for the training split.
y_train = train_selection['Exited']
X_train = train_selection.drop('Exited', axis=1)

# Same separation for the validation split.
y_val = test_selection['Exited']
X_val = test_selection.drop('Exited', axis=1)


# Normalisation

In [None]:
from sklearn.preprocessing import RobustScaler

binary_columns = ['Geography_Germany','Geography_Spain', 'Gender_Male']
# Every column that is not one of the indicator columns gets scaled.
continuous_columns = X_train.columns.drop(binary_columns)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = RobustScaler()
scaler.fit(X_train[continuous_columns])

train_continuous_scaled = pd.DataFrame(
    scaler.transform(X_train[continuous_columns]),
    columns=continuous_columns,
    index=X_train.index,
)
val_continuous_scaled = pd.DataFrame(
    scaler.transform(X_val[continuous_columns]),
    columns=continuous_columns,
    index=X_val.index,
)

# Put the untouched indicator columns back next to the scaled continuous ones.
X_train_scaled = pd.concat([train_continuous_scaled, X_train[binary_columns]], axis=1)
X_val_scaled = pd.concat([val_continuous_scaled, X_val[binary_columns]], axis=1)


In [None]:
binary_columns = ['Geography_Germany','Geography_Spain', 'Gender_Male']
continuous_columns = X_test.columns.drop(binary_columns)

# Reuse the scaler fitted on X_train in the previous cell: only transform here.
# Fitting a second scaler on the test set would scale the test features with
# different statistics than the training features, and would also overwrite the
# training `scaler` variable.
X_test_scaled = scaler.transform(X_test[continuous_columns])

X_test_scaled = pd.DataFrame(X_test_scaled, columns=continuous_columns, index=X_test.index)

# Put the untouched indicator columns back next to the scaled continuous ones.
X_test_scaled = pd.concat([X_test_scaled, X_test[binary_columns]], axis=1)


In [None]:
# Display the dtypes of the scaled training frame.
X_train_scaled.dtypes

# Feature selection

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calcul_vif(X):
    """
    Compute the VIF (Variance Inflation Factor) of every column of X.

    X.values is handed to statsmodels as-is (no extra column is added here).
    Returns a DataFrame with one row per column of X and the columns
    "feature" and "VIF".
    """
    matrice = X.values
    return pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(matrice, position) for position in range(X.shape[1])],
    })

def calcul_correlation(X, y):
    """
    Compute the correlation between each column of X and the target y.

    Returns a DataFrame with one row per column of X and the columns
    "feature" and "correlation".
    """
    lignes = [(nom, colonne.corr(y)) for nom, colonne in X.items()]
    return pd.DataFrame(lignes, columns=["feature", "correlation"])

def remove_highvif_lowcorrelation(X, y, seuil_vif=5):
    """
    Iteratively drop the column with the highest VIF until every remaining
    column has a VIF at or below `seuil_vif`.

    When several columns share the highest VIF, the one with the weakest
    correlation with y in ABSOLUTE value is dropped first, so that a strongly
    negative (hence informative) correlation is not mistaken for a weak one.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : target, passed to calcul_correlation.
    seuil_vif : VIF threshold above which a column is eligible for removal.

    Returns
    -------
    (X, donnees_combinees) : the reduced feature frame and the VIF / correlation
    table ("feature", "VIF", "correlation") of the columns that were kept.
    """
    # VIF and correlation of the current columns, merged on the feature name.
    donnees_vif = calcul_vif(X)
    donnees_correlation = calcul_correlation(X, y)
    donnees_combinees = pd.merge(donnees_vif, donnees_correlation, on="feature")

    while donnees_combinees['VIF'].max() > seuil_vif:
        # Highest VIF first; ties broken by the smallest |correlation|.
        # `key` is applied to each sort column independently, so only the
        # correlation column is replaced by its absolute value for the ordering.
        donnees_combinees = donnees_combinees.sort_values(
            by=['VIF', 'correlation'],
            ascending=[False, True],
            key=lambda colonne: colonne.abs() if colonne.name == 'correlation' else colonne,
        )
        var_a_eliminer = donnees_combinees.iloc[0]['feature']  # column to drop
        print(f"On élimine '{var_a_eliminer}' avec un VIF de: {donnees_combinees.iloc[0]['VIF']} et une corrélation de: {donnees_combinees.iloc[0]['correlation']}")

        # Drop the column, then recompute both diagnostics on what is left.
        X = X.drop(columns=[var_a_eliminer])
        donnees_vif = calcul_vif(X)
        donnees_correlation = calcul_correlation(X, y)
        donnees_combinees = pd.merge(donnees_vif, donnees_correlation, on="feature")

    return X, donnees_combinees

# Prune the scaled training features; X_train_scaled is rebound to the reduced frame.
X_train_scaled, vif_correlation_data = remove_highvif_lowcorrelation(
    X=X_train_scaled,
    y=y_train,
    seuil_vif=5,
)



In [None]:
# Keep, in the same order, only the columns that survived feature selection on the training frame.
X_val_scaled = X_val_scaled.loc[:, X_train_scaled.columns]

0.9318

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Parameter grid for CatBoost
# NOTE(review): 3*3*3*2*2*3*4*4 = 5184 combinations; with the 5-fold CV below that
# is 25,920 model fits, so this cell is very expensive to re-run.
# NOTE(review): CatBoost documents 'subsample' and 'bagging_temperature' under
# different bootstrap types — verify that the installed version accepts both
# together with the default bootstrap_type.
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bylevel': [0.8, 1.0],
    'scale_pos_weight': [1, 5, 10],
    'l2_leaf_reg': [1, 3, 5, 10],
    'bagging_temperature': [0.5, 1, 2, 5]
}

# Cross-validation settings: 5 stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# CatBoost model (silent training, AUC as CatBoost's own evaluation metric)
model = CatBoostClassifier(boosting_type='Plain', eval_metric='AUC', verbose=0)

# Grid search scored by ROC AUC, using all available cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)

# NOTE(review): the search is fitted on X_train (and evaluated on X_val below), not on
# X_train_scaled / X_val_scaled, so the scaling and VIF-based feature selection done
# earlier are not used by this model — confirm this is intended.
grid_search.fit(X_train, y_train)

# Best model and evaluation on the held-out validation split
best_model = grid_search.best_estimator_
y_pred_proba_best = best_model.predict_proba(X_val)[:, 1]
auc_best = roc_auc_score(y_val, y_pred_proba_best)
print("Best AUC after hyperparameter tuning:", auc_best)

In [None]:
# Probability of the positive class for every row of the test features.
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

# Build the submission with the columns 'id' and 'Exited'.
# The predictions are attached through X_test's index rather than by position, so each
# probability lands on the id of the row it was computed from, even if X_test no longer
# has exactly the same rows as data_test.
sample_submission_test_N = data_test[['id']].copy()
sample_submission_test_N["Exited"] = pd.Series(y_pred_proba_best, index=X_test.index)

# Make missing predictions visible instead of silently exporting them.
nb_missing_predictions = int(sample_submission_test_N["Exited"].isna().sum())
if nb_missing_predictions:
    print(f"Warning: {nb_missing_predictions} test row(s) have no prediction (rows removed from X_test before prediction).")

# Optional: export the submission as CSV
sample_submission_test_N.to_csv("sample_submission_test_catboost.csv", index=False)
