In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import row
output_notebook()
import seaborn as sns

from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [2]:
def _decimal_comma_to_float(serie):
    """Convert a column written with decimal commas (e.g. ``'12,5'``) to float.

    A column that is already numeric is simply cast to float, because the
    ``.str`` accessor is only available on text columns.
    """
    if pd.api.types.is_numeric_dtype(serie):
        return serie.astype(float)
    return serie.str.replace(',', '.', regex=False).astype(float)


def traitement_types(df):
    """Return a copy of ``df`` with cleaned-up column types.

    Steps, in order:
      * every cell equal to the literal string ``\\N`` becomes NaN
        (presumably the NULL marker of the export -- confirm with the data owner);
      * columns whose name contains ``crd_`` plus ``CRD``, ``IMPAYES_DEBUT``,
        ``age`` and ``adulte_foyer`` are parsed as numbers, unparseable
        values becoming NaN;
      * ``moy_eco_jour`` and ``RAV_UC`` are converted from decimal-comma text
        to float;
      * ``year`` and ``month`` are merged into a datetime ``Date`` column and
        then dropped.

    The input frame is not modified.
    """
    # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
    df_ = df.copy().replace('\\N', np.nan)

    colonnes_numeriques = ('CRD', 'IMPAYES_DEBUT', 'age', 'adulte_foyer')
    for c in df_.columns:
        if 'crd_' in c or c in colonnes_numeriques:
            df_[c] = pd.to_numeric(df_[c], errors='coerce')

    for c in ('moy_eco_jour', 'RAV_UC'):
        df_[c] = _decimal_comma_to_float(df_[c])

    df_['Date'] = pd.to_datetime(
        df_['year'].astype(str) + '-' + df_['month'].astype(str),
        format='%Y-%m',
    )

    return df_.drop(columns=['year', 'month'])


In [3]:
def trancheAge(x):
    """Return the age-bracket label for a record exposing an ``age`` attribute.

    Brackets are half-open intervals ([25, 35), [35, 45), ...), so fractional
    ages such as 34.5 fall into a bracket instead of slipping between two
    integer bounds. Integer ages get the same label as before.

    Returns None when no comparison holds, which is the case for a NaN age.
    """
    age = x.age
    if age < 25:
        return '<25ans'

    # (exclusive upper bound, label) pairs, scanned in increasing order.
    bornes = (
        (35, '25-34ans'),
        (45, '35-44ans'),
        (55, '45-54ans'),
        (65, '55-64ans'),
        (75, '65-74ans'),
    )
    for borne_sup, libelle in bornes:
        if age < borne_sup:
            return libelle

    if age >= 75:
        return '>75ans'

    # Only reached when every comparison is False (NaN age).
    return None
    
    
def traitement_na(df):
    """Return a copy of ``df`` with missing values filled or dropped.

    Steps, in order:
      * the text ``'Non Renseigne'`` is treated as a missing value;
      * missing values in columns whose name contains ``crd_`` become 0;
      * a missing ``age`` is replaced by the median age of the rows sharing
        the same ``PROF`` (it stays missing when that group has no known age);
      * ``tranche_age`` is recomputed from ``age`` with ``trancheAge``;
      * ``IMPAYES_DEBUT`` is dropped;
      * missing ``gain_mediation`` becomes 0 and the column is cast to int;
      * missing ``cat_impayes`` becomes ``'Inconnu'``;
      * every row that still contains a missing value is dropped.

    The input frame is not modified.
    """
    # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
    df_ = df.copy().replace('Non Renseigne', np.nan)

    for c in df_.columns:
        if 'crd_' in c:
            df_[c] = df_[c].fillna(0)

    for p in df_['PROF'].unique():
        # A NaN profession never compares equal, so its mask is empty and the
        # corresponding rows are left untouched.
        masque = df_['PROF'] == p
        mediane = df_.loc[masque, 'age'].median()
        df_.loc[masque, 'age'] = df_.loc[masque, 'age'].fillna(mediane)

    # Bracket assignment: attribute assignment would set a plain Python
    # attribute, not a column, if 'tranche_age' were absent from the frame.
    df_['tranche_age'] = df_.apply(trancheAge, axis=1)

    df_ = df_.drop(columns=['IMPAYES_DEBUT'])

    df_['gain_mediation'] = df_['gain_mediation'].fillna(0).astype(int)

    df_['cat_impayes'] = df_['cat_impayes'].fillna('Inconnu')

    return df_.dropna()

In [4]:
def mega_traitement(df, dummies=False):
    """Run the whole preprocessing chain on a raw frame.

    The frame goes through ``traitement_types`` then ``traitement_na``; every
    remaining ``object`` column is then converted to an unordered categorical
    whose categories are the values present in that column, in order of first
    appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as read from the CSV file. It is not modified.
    dummies : bool, default False
        When True, add one indicator column per ``ORIENTATION`` value, named
        after that value.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.

    Notes
    -----
    Categories are computed separately on each call, so two frames processed
    separately can end up with different category sets for the same column.
    """
    # No extra df.copy() needed: traitement_types starts by copying its input.
    df_ = traitement_na(traitement_types(df))

    cols_categorielles = [c for c, t in df_.dtypes.items() if t == 'object']

    for cc in cols_categorielles:
        cat_dtype = pd.api.types.CategoricalDtype(
            categories=df_[cc].unique().tolist(), ordered=False
        )
        df_[cc] = df_[cc].astype(cat_dtype)

    if dummies:
        or_dummies = pd.get_dummies(df_['ORIENTATION'])
        # Attach each indicator under its own label rather than relying on
        # unique() and get_dummies() listing the values in the same order.
        for modalite in or_dummies.columns:
            df_[modalite] = or_dummies[modalite]

    return df_

In [13]:
# Read each raw CSV split and pass it straight through the preprocessing chain.
train = mega_traitement(pd.read_csv('data/train.csv'), dummies=False)

test = mega_traitement(pd.read_csv('data/test.csv'), dummies=False)

In [14]:
# Count the missing values left in each column of the preprocessed test set.
test.isna().sum()

id                         0
STRUCTURE PRESCRIPTRICE    0
PLATEFORME                 0
region                     0
NATURE_DIFF                0
age                        0
tranche_age                0
situation                  0
adulte_foyer               0
pers_a_charge              0
PROF                       0
LOGEMENT                   0
REVENUS                    0
cat_rev                    0
CHARGES                    0
cat_charges                0
CREDITS                    0
cat_credit                 0
cat_impayes                0
RAV_ouverture              0
cat_RAV_ouverture          0
RAV_UC                     0
cat_RAV_UC                 0
nb_credits                 0
CRD                        0
gain_mediation             0
nb_amort                   0
crd_amort                  0
nb_renouv                  0
crd_renouv                 0
nb_immo                    0
crd_immo                   0
nb_rac                     0
crd_rac                    0
nb_autres     

# I - Edit train

Nous pouvons définir chaque type d'orientation comme une colonne afin de distinguer, dans l'analyse de corrélation, quelles features influent sur chaque type d'orientation.

In [6]:
# Number of rows per ORIENTATION value in the preprocessed training set.
train.ORIENTATION.value_counts()

Surendettement                   1525
Accompagnement                   1491
Aucune                            320
Mediation                         311
Autres Procédures Collectives      26
Microcredit                         4
Name: ORIENTATION, dtype: int64

In [7]:
def expelliarmus(df, toDrop, dummies=False, scaler=None, pca=None, fit=True):
    """Standardise the numeric features of ``df`` and project them with PCA.

    Categorical columns, ``Date``, ``id`` and the columns listed in ``toDrop``
    are removed; what is left is standardised and then projected.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed frame; must contain ``Date`` and ``id``.
    toDrop : list
        Extra column names to exclude from the features.
    dummies : bool, default False
        Unused; kept so that existing calls keep working.
    scaler : StandardScaler, optional
        Scaler to use. A new one is created when omitted.
    pca : PCA, optional
        PCA to use. A new ``PCA(svd_solver='full')`` is created when omitted.
    fit : bool, default True
        When True the scaler and the PCA are fitted on ``df`` before
        transforming it. Pass False together with an already fitted ``scaler``
        and ``pca`` to project another frame with the same transformation.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index), one integer-named column per
        principal component.

    Raises
    ------
    ValueError
        If ``fit`` is False and ``scaler`` or ``pca`` is missing.
    """
    if not fit and (scaler is None or pca is None):
        raise ValueError("fit=False requires an already fitted scaler and pca")

    cols_categorielles = df.select_dtypes(include='category').columns.tolist()
    features = (
        df.drop(columns=cols_categorielles + ['Date'])
          .drop(columns=['id'])
          .drop(columns=toDrop)
    )

    if scaler is None:
        scaler = StandardScaler()
    if pca is None:
        pca = PCA(svd_solver='full')

    if fit:
        Z = scaler.fit_transform(features)
        coord = pca.fit_transform(Z)
    else:
        Z = scaler.transform(features)
        coord = pca.transform(Z)

    # Keep df's index: rows may have been dropped upstream, and a fresh
    # 0..n-1 index would misalign any later index-based assignment onto df.
    return pd.DataFrame(coord, index=features.index)

In [8]:
# Every column whose name contains 'crd_' is left out of the PCA features.
TO_DROP = [nom for nom in train.columns if 'crd_' in nom]

# NOTE(review): each call fits its own StandardScaler and PCA, so the train and
# test coordinates are not expressed in the same projection.
mf_train = expelliarmus(train, TO_DROP, dummies=False)
mf_test = expelliarmus(test, TO_DROP, dummies=False)




  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
# Cluster on the PCA coordinates only. Excluding 'kmeans' keeps this cell
# re-runnable: on a second run the label column added below would otherwise be
# fed back to KMeans as a feature.
colonnes_acp = [c for c in mf_train.columns if c != 'kmeans']

kmeans = KMeans(n_clusters=4, random_state=0).fit(mf_train[colonnes_acp])
mf_train['kmeans'] = kmeans.labels_

# Assign each test row to the nearest cluster centre learnt on train.
mf_test['kmeans'] = kmeans.predict(mf_test[colonnes_acp])

In [9]:
# Attach the cluster label of each row as the 'groupe' column.
# The labels are assigned by position (.values): mf_train / mf_test have one row
# per row of train / test in the same order, but their index may not match,
# since rows were dropped during preprocessing. An index-aligned assignment
# would put labels on the wrong rows or leave NaN. The former
# `set_index(train.index)` did not change the index, so it is removed.
train = train.assign(groupe=mf_train['kmeans'].values)

test = test.assign(groupe=mf_test['kmeans'].values)

In [15]:
#for gg in train.groupe.unique():
#    print(f'#> Groupe {gg}\n')
#    fig, ax = plt.subplots()
#    ax.set_xlim([0,3000])
#    train.loc[train.groupe == gg,'ORIENTATION'].value_counts().plot(kind='barh')
#    plt.show()
#    print('\n')