# Projet DataBeez

In [257]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

In [258]:
data = pd.read_csv('german_credit_data.csv')

In [259]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [260]:
data.shape

(1000, 11)

In [261]:
(data.isnull().sum()/data.shape[0])*100

Unnamed: 0           0.0
Age                  0.0
Sex                  0.0
Job                  0.0
Housing              0.0
Saving accounts     18.3
Checking account    39.4
Credit amount        0.0
Duration             0.0
Purpose              0.0
Risk                 0.0
dtype: float64

In [262]:
data.drop(data.columns[0],axis=1,inplace=  True) # suppression de la colonne Unnamed:0

In [263]:
data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [264]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [265]:
data['Age'].dtype

dtype('int64')

In [266]:
liste1  = [col for col in data.columns if data[col].dtypes == 'object']
dictionaire = {}
for dict in liste1:
    dictionaire[dict] = data[dict].values
#print(dictionaire)

Variables_categoriques = pd.DataFrame(dictionaire)

In [267]:
Variables_categoriques

Unnamed: 0,Sex,Housing,Saving accounts,Checking account,Purpose,Risk
0,male,own,,little,radio/TV,good
1,female,own,little,moderate,radio/TV,bad
2,male,own,little,,education,good
3,male,free,little,little,furniture/equipment,good
4,male,free,little,little,car,bad
...,...,...,...,...,...,...
995,female,own,little,,furniture/equipment,good
996,male,own,little,little,car,good
997,male,own,little,,radio/TV,good
998,male,free,little,little,radio/TV,bad


In [268]:
liste1  = [col for col in data.columns if data[col].dtypes == 'int64']
dictionaire1 = {}
for dict in liste1:
    dictionaire1[dict] = data[dict].values
#print(dictionaire)

Variables_numerique = pd.DataFrame(dictionaire1)

In [269]:
print(Variables_numerique)

     Age  Job  Credit amount  Duration
0     67    2           1169         6
1     22    2           5951        48
2     49    1           2096        12
3     45    2           7882        42
4     53    2           4870        24
..   ...  ...            ...       ...
995   31    1           1736        12
996   40    3           3857        30
997   38    2            804        12
998   23    2           1845        45
999   27    2           4576        45

[1000 rows x 4 columns]


In [270]:
bon, mal = 0,0
for i in data[data['Housing'] == 'own']['Risk'].values:
    if i == 'good':
        bon +=1
    else:
        mal += 1

print(f"pas de risque a :{(bon/data.shape[0])*100} %,selon la sa sution domiciale ")
print(f"le risque est a : {(mal/data.shape[0]*100)} %, selon sa sutiation domiciale ")

pas de risque a :52.7 %,selon la sa sution domiciale 
le risque est a : 18.6 %, selon sa sutiation domiciale 


In [271]:
bon, mal = 0,0
for i in data[data['Checking account'] == 'moderate']['Risk'].values:
    if i == 'good':
        bon +=1
    else:
        mal += 1

print(f"pas de risque a :{(bon/data.shape[0])*100} %,selon la sa sution domiciale ")
print(f"le risque est a : {(mal/data.shape[0]*100)} %, selon sa sutiation domiciale ")

pas de risque a :16.400000000000002 %,selon la sa sution domiciale 
le risque est a : 10.5 %, selon sa sutiation domiciale 


## Creation de pipeline

In [272]:
Variables_categoriques = Variables_categoriques.drop('Risk',axis=1)
Variables_categoriques.head()

Unnamed: 0,Sex,Housing,Saving accounts,Checking account,Purpose
0,male,own,,little,radio/TV
1,female,own,little,moderate,radio/TV
2,male,own,little,,education
3,male,free,little,little,furniture/equipment
4,male,free,little,little,car


In [273]:
num_var = list(Variables_numerique.columns)
cat_var = list(Variables_categoriques.columns)

In [274]:
pipeline_numerique = make_pipeline(SimpleImputer(strategy='mean'),StandardScaler())

pipeline_categorique = make_pipeline(SimpleImputer(strategy = 'most_frequent'),
                                     OneHotEncoder(handle_unknown='ignore'))

In [275]:
transformer = make_column_transformer((pipeline_numerique,num_var),
                                      (pipeline_categorique,cat_var))

In [276]:
X = data.drop('Risk',axis= 1)
Targets = (data[['Risk']])

In [277]:
X.shape

(1000, 9)

In [278]:
Targets.shape

(1000, 1)

In [279]:
# Diviser les données en ensembles d'entraînement et de test

X_train, X_test, y_train, y_test = train_test_split(X, Targets, test_size=0.2, random_state=42)

# Liste des modèles à tester
list_models = [RandomForestClassifier(), DecisionTreeClassifier(), KNeighborsClassifier(),LogisticRegression(),SVC()]

# Transformer pour normaliser les données
#transformer1 = StandardScaler()

# Boucle sur les modèles
for model in list_models:
    # Créer le pipeline
    pipelinefinal = make_pipeline(transformer, model)
    
    # E
    # ntraîner le pipeline
    pipelinefinal.fit(X_train, y_train)
    
    # Évaluer la précision sur les données de test
    score = pipelinefinal.score(X_test, y_test)
    
    # Afficher le score du modèle
    print(f"Modèle : {model.__class__.__name__}")
    print(f"Score : {score}")
    print("************************")

  return fit_method(estimator, *args, **kwargs)


Modèle : RandomForestClassifier
Score : 0.695
************************
Modèle : DecisionTreeClassifier
Score : 0.635
************************
Modèle : KNeighborsClassifier
Score : 0.665
************************
Modèle : LogisticRegression
Score : 0.7
************************


  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Modèle : SVC
Score : 0.705
************************
