# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# ### graphical plotly basics
# import matplotlib.pyplot as plt
import scikitplot as skplt
# import plotly.graph_objects as go
# import plotly.express as px
# # for jupyter notebook display management
# import plotly.io as pio
# pio.renderers.default = "notebook"

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw churn dataset registered under the 'churn_data' configuration key.
df_churn_raw = dfc.load_dataset_from_config('churn_data', sep=',')

# Explore only when the loader returned a DataFrame (isinstance also rejects None).
if isinstance(df_churn_raw, pd.DataFrame):
    display(df_churn_raw.head())
    dfc.log_general_info(df_churn_raw)
    # NOTE(review): presumably the two counts differ when duplicated rows
    # exist — confirm against smartcheck.dataframe_common.
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(
        df_churn_raw
    )
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_churn_raw))
    # Working frame with normalized column names; later cells rely on it.
    df_churn = dfc.normalize_column_names(df_churn_raw)
    display(df_churn.head())

In [None]:
# Restrict to numeric columns once, then reuse the selection for both summaries.
numeric_columns = df_churn.select_dtypes(include=np.number)

# Descriptive statistics of the numeric variables.
df_churn_desc = numeric_columns.describe()
display(df_churn_desc)

# Pairwise correlation matrix of the numeric variables.
df_churn_cr = numeric_columns.corr()
display(df_churn_cr)

## 2.2 Data quality refinement

In [None]:
# Keep an untouched deep copy of the frame, then drop duplicated rows
# (the first occurrence of each duplicate is kept).
df_churn_orig = df_churn.copy(deep=True)
df_churn = df_churn.drop_duplicates(keep='first')

In [None]:
# Inspect the target column with the project helper before recoding it.
dfc.display_variable_info(df_churn['churn'])

# Recode the textual label ('True.' -> True, anything else -> False) into a
# boolean target. The dtype guard keeps this cell idempotent: once the column
# is boolean, comparing it to 'True.' again would silently turn every label
# into False when the cell is re-run.
if not pd.api.types.is_bool_dtype(df_churn['churn']):
    df_churn['churn'] = df_churn['churn'].eq('True.').astype(bool)

In [None]:
# One-hot encode the two plan columns and append the indicator columns to the
# working frame; each source column gets its own prefix.
for plan_column, dummy_prefix in (
    ('int_l_plan', 'international'),
    ('vmail_plan', 'voicemail'),
):
    plan_dummies = pd.get_dummies(df_churn[plan_column], prefix=dummy_prefix)
    df_churn = df_churn.join(plan_dummies)

# 3. Data Classification

## 3.1 General analysis: feature/target separation

In [None]:
# Split the frame into the explanatory variables (features) and the variable
# to predict (target).
excluded_columns = ['int_l_plan', 'vmail_plan', 'state', 'area_code', 'phone', 'churn']
target = df_churn['churn']
data = df_churn.drop(columns=excluded_columns)

In [None]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed seed keeps the split reproducible.
split_parts = train_test_split(data, target, random_state=12, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

## 3.2 Random Forest
- Bagging of decision trees, with randomness injected at two levels:
    - Data selected (rows): each tree is trained on a bootstrap sample
    - Variables selected (columns): a random subset is considered at each split
- Reduces variance
- Efficient with a high number of dimensions
- Robust to outliers
- Limits overfitting in general; out-of-bag scoring is not required

In [None]:
# Define the model, then train it on the training split.
# n_jobs=-1 uses all available cores; the fixed seed makes the forest reproducible.
rf_settings = {'n_jobs': -1, 'random_state': 321}
clfRFC = RandomForestClassifier(**rf_settings)
clfRFC.fit(X_train, y_train)

In [None]:
# Model predictions on the test data
y_pred = clfRFC.predict(X_test)

In [None]:
# Confusion matrix and model evaluation.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same counts as a labelled table (rows: actual class, columns: predicted class).
actual_labels = y_test.to_numpy().ravel()
df_cm = pd.crosstab(
    actual_labels,
    y_pred,
    rownames=['real'],
    colnames=['predicted'],
)
display(df_cm)

# Accuracy rebuilt by hand: the diagonal holds the correct predictions, so its
# sum divided by the total number of predictions gives the score.
score = np.trace(cm) / cm.sum()
print("Score reconstruit manuellement:", score)
print("Score calculé par le modèle:", clfRFC.score(X_test, y_test))

full_report = classification_report(y_test, y_pred)
print("Rapport de classification complet:\n", full_report)

In [None]:
# [Optional] Retrieve the class probabilities (and, if needed, rework the class-assignment rule: by default the predicted class is the one with the highest probability)
y_probs = clfRFC.predict_proba(X_test)
print(y_probs)
# Cumulative gain chart built from the true labels and the predicted probabilities.
skplt.metrics.plot_cumulative_gain(y_test,y_probs,figsize=(12,8))