In [1]:
# importer les packages
import pandas as pd
import numpy as np
import random
# import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.graph_objs as gbs
import plotly.express as px

import pickle as pck

In [2]:
# lire la base de donnée
df  = pd.read_csv("/content/train_u6lujuX_CVtuZ9i.csv")

In [3]:
df["Loan_Status"].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [4]:
# afficher toute les lignes du Dataset
pd.set_option("display.max_rows", df.shape[0] + 1)

In [5]:
# Afficher les 5 premieres et les 5 dernieres lignes
pd.set_option("display.max_rows",10)
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
# Visualiser les valeurs manquantes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
# Une autre façon pour bien visualiser les valeurs manquantes
df.isnull().sum().sort_values(ascending=False)

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
                     ..
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
Length: 13, dtype: int64

On remarque qu'il y'a pas mal de valeurs manquantes dans notre dataset
Exemple : 50 sur la colonnes de l'historiques des crédits

In [8]:
# Une déscription statistique sur les variables numériques
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [9]:
# Une déscription statistique sur les variables catégoriques
df.describe(include='O')

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,601,611,599,614,582,614,614
unique,614,2,2,4,2,2,3,2
top,LP001002,Male,Yes,0,Graduate,No,Semiurban,Y
freq,1,489,398,345,480,500,233,422


In [10]:
# Renseigner les valeurs manquantes

# On va appliquer deux méthodes différentes pour renseigner
# les valeurs manquantes selon le type des variables (Numériques et Catégoriques)
data_cat_var = []
data_num_var = []
for i,c in enumerate(df.dtypes):
  if c == object:
    data_cat_var.append(df.iloc[:, i])
  else:
    data_num_var.append(df.iloc[:, i])

# Transformer les listes en dataframes
data_cat_var = pd.DataFrame(data_cat_var).transpose()
data_num_var = pd.DataFrame(data_num_var).transpose()

In [11]:
# Le dataframe des variables numériques
data_num_var

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849.0,0.0,,360.0,1.0
1,4583.0,1508.0,128.0,360.0,1.0
2,3000.0,0.0,66.0,360.0,1.0
3,2583.0,2358.0,120.0,360.0,1.0
4,6000.0,0.0,141.0,360.0,1.0
...,...,...,...,...,...
609,2900.0,0.0,71.0,360.0,1.0
610,4106.0,0.0,40.0,180.0,1.0
611,8072.0,240.0,253.0,360.0,1.0
612,7583.0,0.0,187.0,360.0,1.0


In [12]:
# On va remplacer les valeures manquantes des variables
# catégoriques par la valeurs qui a une fréquence plus grande
data_cat_var = data_cat_var.apply(lambda x:x.fillna(x.value_counts().index[0]))
data_cat_var.isnull().any()

Loan_ID          False
Gender           False
Married          False
Dependents       False
Education        False
Self_Employed    False
Property_Area    False
Loan_Status      False
dtype: bool

In [13]:
data_num_var1 = data_num_var.copy()

In [14]:
# On va remplacer les valeures manquantes des variables
# numériques par la moyennes des valeurs
data_num_var = data_num_var.apply(lambda x:x.fillna(x.mean()))
data_num_var.isnull().sum().any()

False

In [15]:
# On va remplacer les valeures manquantes des variables
# numériques par la valeurs qui la précéde des valeurs
data_num_var1.fillna(method='bfill', inplace=True)
data_num_var1.isnull().sum().any()

False

In [16]:
# transformer la colone target
label_value = {'Y':1, 'N':0}
label = data_cat_var['Loan_Status']
data_cat_var.drop('Loan_Status', axis=1, inplace=True)
label = label.map(label_value)

In [17]:
# Effectuer la meme chose sur toute les variables catégoriques
lbl = LabelEncoder()
for i in data_cat_var:
  data_cat_var[i] = lbl.fit_transform(data_cat_var[i])
data_cat_var

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,0,1,0,0,0,0,2
1,1,1,1,1,0,0,0
2,2,1,1,0,0,1,2
3,3,1,1,0,1,0,2
4,4,1,0,0,0,0,2
...,...,...,...,...,...,...,...
609,609,0,0,0,0,0,0
610,610,1,1,3,0,0,0
611,611,1,1,1,0,0,2
612,612,1,1,2,0,0,2


In [18]:
# Suprimmer la colonne de Loan_ID
data_cat_var.drop('Loan_ID', axis=1, inplace=True)

In [19]:
data_cat_var

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,1,0,0,0,0,2
1,1,1,1,0,0,0
2,1,1,0,0,1,2
3,1,1,0,1,0,2
4,1,0,0,0,0,2
...,...,...,...,...,...,...
609,0,0,0,0,0,0
610,1,1,3,0,0,0
611,1,1,1,0,0,2
612,1,1,2,0,0,2


In [20]:
# Concatener data_cat_var et data_num_var
X = pd.concat([data_cat_var, data_num_var], axis=1)
y = label

In [21]:
X1 = pd.concat([data_cat_var, data_num_var1], axis=1)

In [22]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,2,5849.0,0.0,146.412162,360.0,1.0
1,1,1,1,0,0,0,4583.0,1508.0,128.000000,360.0,1.0
2,1,1,0,0,1,2,3000.0,0.0,66.000000,360.0,1.0
3,1,1,0,1,0,2,2583.0,2358.0,120.000000,360.0,1.0
4,1,0,0,0,0,2,6000.0,0.0,141.000000,360.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,2900.0,0.0,71.000000,360.0,1.0
610,1,1,3,0,0,0,4106.0,0.0,40.000000,180.0,1.0
611,1,1,1,0,0,2,8072.0,240.0,253.000000,360.0,1.0
612,1,1,2,0,0,2,7583.0,0.0,187.000000,360.0,1.0


In [23]:
X1

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,2,5849.0,0.0,128.0,360.0,1.0
1,1,1,1,0,0,0,4583.0,1508.0,128.0,360.0,1.0
2,1,1,0,0,1,2,3000.0,0.0,66.0,360.0,1.0
3,1,1,0,1,0,2,2583.0,2358.0,120.0,360.0,1.0
4,1,0,0,0,0,2,6000.0,0.0,141.0,360.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,2900.0,0.0,71.0,360.0,1.0
610,1,1,3,0,0,0,4106.0,0.0,40.0,180.0,1.0
611,1,1,1,0,0,2,8072.0,240.0,253.0,360.0,1.0
612,1,1,2,0,0,2,7583.0,0.0,187.0,360.0,1.0


In [24]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

In [25]:
# lire les données de test
#df_test  = pd.read_csv("/content/test_Y3wMUE5_7gLdaTN.csv")
#df_test

In [26]:
# Loan_status & Property_Area
trace = gbs.Bar(x=data_cat_var['Property_Area'], y=y)
layout = gbs.Layout(title={
        'text': 'Loan_status/Property_Area',
        'x': 0.5  # Position horizontale centrée (50%)
    }, xaxis=dict(title='Property_Area'), yaxis=dict(title='Loan_status'))

fig = gbs.Figure(data=[trace], layout=layout)
fig.show()


In [27]:
# Loan_status & Gender
trace1 = gbs.Bar(x=data_cat_var['Gender'], y=y)
layout1 = gbs.Layout(title={
        'text': 'Loan_status/Gender',
        'x': 0.5  # Position horizontale centrée (50%)
    },xaxis=dict(title='Gender'), yaxis=dict(title='Loan_status'))

fig1 = gbs.Figure(data=[trace1], layout=layout1)
fig1.show()


#### Ici on va appliquer trois algorithme de ML qui sont :
#### -> Logistic Regression
#### -> KNN algorithm
#### -> DescisionTree algorithm

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

In [29]:
# Diviser les données en deux parties une pour les données de test et l'autre pour les donnéees d'entrainement
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train, test in sss.split(X,y):
  X_train, X_test = X.iloc[train], X.iloc[test]
  y_train, y_test = y.iloc[train], y.iloc[test]
print("X_train taille : ", X_train.shape)
print("X_test taille : ", X_test.shape)
print("y_train taille : ", y_train.shape)
print("y_test taille : ", y_test.shape)

X_train taille :  (491, 11)
X_test taille :  (123, 11)
y_train taille :  (491,)
y_test taille :  (123,)


In [30]:
from pandas.core.common import random_state
models = {'LogisticRegression': LogisticRegression(random_state=42),
          'KNeighborsClassifier': KNeighborsClassifier(),
          'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=1, random_state=42)}


In [31]:
# Définir la fonction cout
def accu(y_init, y_predit, retu=False):
  acc = accuracy_score(y_init, y_predit)
  if retu:
    return acc
  else:
    print(f'La précision du modèle est : {acc}')

In [32]:
# Entrainement des modèles
def train_test_evaluation(models, X_train, y_train, X_test, y_test):
  for name,model in models.items():
    print(name,':')
    model.fit(X_train, y_train)
    accu(y_test, model.predict(X_test))
    print('-'*30)
train_test_evaluation(models, X_train, y_train, X_test, y_test)

LogisticRegression :
La précision du modèle est : 0.8617886178861789
------------------------------
KNeighborsClassifier :
La précision du modèle est : 0.6504065040650406
------------------------------
DecisionTreeClassifier :
La précision du modèle est : 0.8536585365853658
------------------------------



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [33]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [34]:
# Définir le modèle de réseaux de neuronne
modele = keras.Sequential([
    layers.InputLayer(input_shape=(11,)),
    layers.Dense(100, activation='sigmoid'),
    layers.Dense(1, activation='relu')
                           ])

In [35]:
modele.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               1200      
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,301
Trainable params: 1,301
Non-trainable params: 0
_________________________________________________________________


In [36]:
#opt = keras.optimizers.Adam(learning_rate=0.01)
opt = keras.optimizers.SGD(learning_rate=0.01)

modele.compile(optimizer=opt, loss="mse")

In [37]:
from re import VERBOSE
batch_size = 64
epochs = 10
data_train, data_validation, target_train, target_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
history = modele.fit(data_train, target_train,
                     epochs = epochs,
                     batch_size = batch_size,
                     verbose = 1,
                     validation_data = (data_validation,target_validation)
                     )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
X_2 = X[['Credit_History', 'Gender', 'Married', 'CoapplicantIncome', 'ApplicantIncome']]

In [39]:
X_2.head()

Unnamed: 0,Credit_History,Gender,Married,CoapplicantIncome,ApplicantIncome
0,1.0,1,0,0.0,5849.0
1,1.0,1,1,1508.0,4583.0
2,1.0,1,1,0.0,3000.0
3,1.0,1,1,2358.0,2583.0
4,1.0,1,0,0.0,6000.0


In [40]:
# Diviser les données en deux parties une pour les données de test et l'autre pour les donnéees d'entrainement
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train, test in sss.split(X_2,y):
  X_train1, X_test1 = X_2.iloc[train], X_2.iloc[test]
  y_train1, y_test1 = y.iloc[train], y.iloc[test]
print("X_train taille : ", X_train.shape)
print("X_test taille : ", X_test.shape)
print("y_train taille : ", y_train.shape)
print("y_test taille : ", y_test.shape)
train_test_evaluation(models, X_train1, y_train1, X_test1, y_test1)

X_train taille :  (491, 11)
X_test taille :  (123, 11)
y_train taille :  (491,)
y_test taille :  (123,)
LogisticRegression :
La précision du modèle est : 0.8617886178861789
------------------------------
KNeighborsClassifier :
La précision du modèle est : 0.6585365853658537
------------------------------
DecisionTreeClassifier :
La précision du modèle est : 0.8536585365853658
------------------------------


In [41]:
# On remarque que le modèle de la regression logistic est celui qui donne des résultats plus performantes donc on va l'appliquer tout seul a notre nouvelle base de données
Classifier = LogisticRegression()
Classifier.fit(X_2, y)

In [42]:
# Enregistrement du modele en utilisant pickle
pck.dump(Classifier, open('model.pkl', 'wb'))