In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [47]:
projet = pd.read_csv('creditBanc.csv')

In [48]:
projet.head(100)

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,LP001326,Male,No,0.0,Graduate,,6782,0.0,,360.0,,Urban,N
96,LP001327,Female,Yes,0.0,Graduate,No,2484,2302.0,137.0,360.0,1.0,Semiurban,Y
97,LP001333,Male,Yes,0.0,Graduate,No,1977,997.0,50.0,360.0,1.0,Semiurban,Y
98,LP001334,Male,Yes,0.0,Not Graduate,No,4188,0.0,115.0,180.0,1.0,Semiurban,Y


# Identification des colonnes ayant des valeurs manquantes

In [49]:
projet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Client_ID                614 non-null    object 
 1   Sexe                     601 non-null    object 
 2   Situation_familiale      611 non-null    object 
 3   Personnes_charge         599 non-null    float64
 4   Diplome                  614 non-null    object 
 5   Travailleur_independant  582 non-null    object 
 6   Revenu_imposable         614 non-null    int64  
 7   Revenu_forfitaire        614 non-null    float64
 8   Montant_pret             592 non-null    float64
 9   Duree_pret               600 non-null    float64
 10  Hitorique_credit         564 non-null    float64
 11  Residence                614 non-null    object 
 12  Decision                 614 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 62.5+ KB


# Obtenir une liste des colonnes avec au moins une donnée manquante

In [50]:
projet.columns[projet.isnull().any()]

Index(['Sexe', 'Situation_familiale', 'Personnes_charge',
       'Travailleur_independant', 'Montant_pret', 'Duree_pret',
       'Hitorique_credit'],
      dtype='object')

# Nombre de données manquantes par collone

In [51]:
projet.isnull().sum()

Client_ID                   0
Sexe                       13
Situation_familiale         3
Personnes_charge           15
Diplome                     0
Travailleur_independant    32
Revenu_imposable            0
Revenu_forfitaire           0
Montant_pret               22
Duree_pret                 14
Hitorique_credit           50
Residence                   0
Decision                    0
dtype: int64

# Supprimer les lignes avec des valeurs manquantes dans les colonnes de variables qualitatives 

In [52]:
projet.dropna(subset = ['Sexe', 'Situation_familiale', 'Travailleur_independant'], inplace = True)

In [53]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0.0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3.0,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1.0,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2.0,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# Remplacer les valeurs manquantes des colonnes numériques par la valeur précedente de la même colonne

In [54]:
projet.fillna(method = 'bfill', inplace = True)

  projet.fillna(method = 'bfill', inplace = True)


In [55]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0.0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3.0,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1.0,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2.0,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [56]:
projet.isnull().sum().any()

False

# Remplacement des mots "Male" et "Female" respectivement par 1 et 0

In [57]:
projet['Sexe'].replace(['Male','Female'], [1,0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Sexe'].replace(['Male','Female'], [1,0], inplace = True)
  projet['Sexe'].replace(['Male','Female'], [1,0], inplace = True)


In [58]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,1,No,0.0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,1,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,1,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,No,0.0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,1,Yes,3.0,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,1,Yes,1.0,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,1,Yes,2.0,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# Remplacement des mots "Yes" et "No" dans les colonnes "Situation_familiale" et "Travailleur_independant", respectivement par 1 et 0

In [59]:
projet['Situation_familiale'].replace(['Yes','No'], [1,0], inplace = True)
projet['Travailleur_independant'].replace(['Yes','No'], [1,0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Situation_familiale'].replace(['Yes','No'], [1,0], inplace = True)
  projet['Situation_familiale'].replace(['Yes','No'], [1,0], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Travailleur_independant'].replace(['Yes','No'], [1,0], inplace = True)
 

In [60]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,1,0,0.0,Graduate,0,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,1,1,1.0,Graduate,0,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,1,0.0,Graduate,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,1,0.0,Not Graduate,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,1,0,0.0,Graduate,0,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0.0,Graduate,0,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,1,1,3.0,Graduate,0,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,1,1,1.0,Graduate,0,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,1,1,2.0,Graduate,0,7583,0.0,187.0,360.0,1.0,Urban,Y


# Remplacement des mots "Graduate" et "Not Graduate" de la colonne Diplome respectivement par 1 et 0

In [61]:
projet['Diplome'].replace(['Graduate','Not Graduate'], [1,0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Diplome'].replace(['Graduate','Not Graduate'], [1,0], inplace = True)
  projet['Diplome'].replace(['Graduate','Not Graduate'], [1,0], inplace = True)


In [17]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,Urban,Y


# Remplacement des mots "Urban", "Semiurban" et "Rural" de la colonne Residence respectivement par 2,1 et 0

In [62]:
projet['Residence'].replace(['Urban','Semiurban','Rural'], [2,1,0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Residence'].replace(['Urban','Semiurban','Rural'], [2,1,0], inplace = True)
  projet['Residence'].replace(['Urban','Semiurban','Rural'], [2,1,0], inplace = True)


In [19]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,2,Y
1,LP001003,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,2,Y
3,LP001006,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,LP001008,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,0,Y
610,LP002979,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,0,Y
611,LP002983,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,2,Y
612,LP002984,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,2,Y


# Repmlacement de "Y" et "N" de la colonne Decision respectivement par 1 et 0

In [63]:
projet['Decision'].replace(['Y','N'], [1,0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  projet['Decision'].replace(['Y','N'], [1,0], inplace = True)
  projet['Decision'].replace(['Y','N'], [1,0], inplace = True)


In [21]:
projet

Unnamed: 0,Client_ID,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,LP001002,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,2,1
1,LP001003,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,LP002979,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,LP002983,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,LP002984,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,2,1


# Suppression de la colonne "Client_ID"

In [64]:
projet.drop('Client_ID', axis=1, inplace = True)

In [65]:
projet

Unnamed: 0,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence,Decision
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,2,1


# Selection des colonnes des variables explicatives et de la variable à expliquer (à prédire)

In [66]:
projet.columns

Index(['Sexe', 'Situation_familiale', 'Personnes_charge', 'Diplome',
       'Travailleur_independant', 'Revenu_imposable', 'Revenu_forfitaire',
       'Montant_pret', 'Duree_pret', 'Hitorique_credit', 'Residence',
       'Decision'],
      dtype='object')

In [67]:
X = projet[['Sexe', 'Situation_familiale', 'Personnes_charge', 'Diplome',
       'Travailleur_independant', 'Revenu_imposable', 'Revenu_forfitaire',
       'Montant_pret', 'Duree_pret', 'Hitorique_credit', 'Residence']]

In [68]:
X.head(50)

Unnamed: 0,Sexe,Situation_familiale,Personnes_charge,Diplome,Travailleur_independant,Revenu_imposable,Revenu_forfitaire,Montant_pret,Duree_pret,Hitorique_credit,Residence
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,2
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2.0,1,1,5417,4196.0,267.0,360.0,1.0,2
6,1,1,0.0,0,0,2333,1516.0,95.0,360.0,1.0,2
7,1,1,3.0,1,0,3036,2504.0,158.0,360.0,0.0,1
8,1,1,2.0,1,0,4006,1526.0,168.0,360.0,1.0,2
9,1,1,1.0,1,0,12841,10968.0,349.0,360.0,1.0,1


In [69]:
y = projet['Decision']

In [70]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Decision, Length: 566, dtype: int64

# Décomposition des données en données d'entrainement et données de test 

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 101)
print('Données d\'entrainement:',X_train.shape )
print('Données de test:',X_test.shape)

Données d'entrainement: (424, 11)
Données de test: (142, 11)


# Création du modèle (instanciation) de type KNeighborsClassifier

In [72]:
model = KNeighborsClassifier()

In [73]:
model.fit(X_train, y_train)

# Prédiction

In [74]:
predictions = model.predict(X_test)

# Performance du modèle sur la base des données d'entrainement

In [75]:
model.score(X_train, y_train)

0.7405660377358491

# Performance du modèle sur la base des données de test

In [76]:
model.score(X_test, y_test)

0.6690140845070423

# Optimisation du modèle grace à GridSearchCV

In [77]:
param_grid = {'n_neighbors':np.arange(1,100),
              'weights' : ['uniform', 'distance'],
              'metric' : ['euclidean', 'manhattan', 'minkowski']}

In [78]:
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 10)

In [79]:
grid.fit(X_train, y_train)

# Meilleur score

In [80]:
grid.best_score_

0.6957364341085273

# Mailleurs paramètres

In [81]:
grid.best_params_

{'metric': 'euclidean', 'n_neighbors': 27, 'weights': 'uniform'}

# Sauvgarde du modèle

In [82]:
model = grid.best_estimator_

In [83]:
model

In [84]:
model.score(X_test, y_test)

0.6549295774647887

In [85]:
model.predict([[1,1,2,1,0,7800,2300,350,480,1,2]])



array([1], dtype=int64)