# Machine Learning

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
import os
from sklearn.linear_model import LogisticRegression

In [2]:
df_train=pd.read_csv("./../data/application_train.csv",encoding="utf-8",index_col=0)
df_test=pd.read_csv("./../data/application_test.csv",encoding="utf-8",index_col=0)
df = pd.concat([df_train,df_test])

In [3]:
# comparaison entre df et df sans les nan

df_salvage_drop=df.dropna()
' '.join(['ratio:' , str(df_salvage_drop.shape[0]) ,'/' , str(df.shape[0]), '=' , str(round(df_salvage_drop.shape[0]/df.shape[0]*100,2)), '%'])

'ratio: 8602 / 356255 = 2.41 %'

In [4]:
# liste les valeurs par default de chaque clmn

def one_line_try(serie):
    try:
        return float(serie.mean())
    except:
        return float(np.nan)
    
default_value_per_clmn = {clmn : one_line_try(df_salvage_drop[clmn]) for clmn in df_salvage_drop}

default_value_per_clmn

{'TARGET': 0.06114857009997675,
 'NAME_CONTRACT_TYPE': nan,
 'CODE_GENDER': nan,
 'FLAG_OWN_CAR': nan,
 'FLAG_OWN_REALTY': nan,
 'CNT_CHILDREN': 0.6025342943501512,
 'AMT_INCOME_TOTAL': 222872.41443850266,
 'AMT_CREDIT': 699998.7016391536,
 'AMT_ANNUITY': 31558.948325970705,
 'AMT_GOODS_PRICE': 633833.7415717276,
 'NAME_TYPE_SUITE': nan,
 'NAME_INCOME_TYPE': nan,
 'NAME_EDUCATION_TYPE': nan,
 'NAME_FAMILY_STATUS': nan,
 'NAME_HOUSING_TYPE': nan,
 'REGION_POPULATION_RELATIVE': 0.023542625668449196,
 'DAYS_BIRTH': -14189.009416414787,
 'DAYS_EMPLOYED': -2299.091374099047,
 'DAYS_REGISTRATION': -4276.979190885841,
 'DAYS_ID_PUBLISH': -2975.926296210184,
 'OWN_CAR_AGE': 11.224133922343642,
 'FLAG_MOBIL': 1.0,
 'FLAG_EMP_PHONE': 0.9997674959311788,
 'FLAG_WORK_PHONE': 0.21111369448965356,
 'FLAG_CONT_MOBILE': 0.9968611950709138,
 'FLAG_PHONE': 0.3093466635666124,
 'FLAG_EMAIL': 0.12566844919786097,
 'OCCUPATION_TYPE': nan,
 'CNT_FAM_MEMBERS': 2.401418274819809,
 'REGION_RATING_CLIENT': 2.00

In [5]:
# applique ces valeur aux clmn

for clmn in df:
    df[clmn]= [A if str(A) != 'nan' else default_value_per_clmn[clmn] for A in df[clmn]]
df_cute_drop = df.dropna()
' '.join(['ratio:' , str(df_cute_drop.shape[0]) ,'/' , str(df.shape[0]), '=' ,str(round(df_cute_drop.shape[0]/df.shape[0]*100,2)), '%'])



'ratio: 76257 / 356255 = 21.41 %'

In [6]:
del df_cute_drop
del df_salvage_drop
df = df.dropna()

In [7]:
colonnes=[colonne for colonne in df]
types_colonnes=list(map(lambda x : {x : list(set(df[x].map(lambda x : str(type(x)))))} , colonnes))

In [8]:
colonnes_str=list(filter(lambda x : x[list(x)[0]]==["<class 'str'>"], types_colonnes))

In [9]:
def vectorisation(col):
    voc=list(set(list(col)))
    val={voc[i]:i for i in range(len(voc))}
    return col.map(lambda x : val[x])



for i in [list(i)[0] for i in colonnes_str]:
    df[i]=vectorisation(df[i])

In [10]:
df

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1.000000,0,0,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000
100003,0.000000,0,2,0,0,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
100017,0.000000,0,0,1,0,1,225000.0,918468.0,28966.5,697500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000
100018,0.000000,0,2,0,1,0,189000.0,773680.5,32778.0,679500.0,...,0,0,0,0,0.005813,0.005929,0.039061,0.368984,0.25808,1.757615
100022,0.000000,1,2,0,1,0,112500.0,157500.0,7875.0,157500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455962,0.061149,0,0,0,1,0,112500.0,156384.0,16551.0,135000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
456007,0.061149,0,0,0,1,0,135000.0,318645.0,16398.0,216000.0,...,0,0,0,0,0.005813,0.005929,0.039061,0.368984,0.25808,1.757615
456010,0.061149,0,2,0,1,1,112500.0,690313.5,35374.5,549000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
456111,0.061149,0,2,0,1,1,112500.0,514710.0,17707.5,387000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


In [11]:
# matrice de correlation

total_corr = df.corr()
total_corr

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
TARGET,1.000000,-0.035657,-0.039148,-0.026669,-0.002435,0.002472,-0.030196,-0.035761,-0.025404,-0.044127,...,-0.005482,-0.002635,-0.006345,0.002856,0.002221,-0.000385,0.000526,-0.007918,-0.002330,0.023533
NAME_CONTRACT_TYPE,-0.035657,1.000000,0.001520,0.004779,0.080169,0.020061,-0.010172,-0.216338,-0.244626,-0.180204,...,-0.006361,-0.005942,-0.009065,0.057280,0.003289,-0.000511,-0.003331,-0.006479,-0.044146,-0.063741
CODE_GENDER,-0.039148,0.001520,1.000000,-0.312183,0.041115,-0.003561,-0.168367,0.004023,-0.056979,0.001507,...,-0.020284,-0.004415,-0.001221,-0.024290,0.004752,0.003089,0.000586,-0.010693,0.001663,0.013639
FLAG_OWN_CAR,-0.026669,0.004779,-0.312183,1.000000,0.004513,0.074580,0.200152,0.102224,0.130681,0.108375,...,-0.004653,-0.002119,0.004555,0.007292,-0.000067,0.002919,0.004172,0.014719,-0.004944,-0.035069
FLAG_OWN_REALTY,-0.002435,0.080169,0.041115,0.004513,1.000000,0.017318,0.011796,-0.041280,0.003032,-0.049967,...,-0.093550,-0.015375,-0.029138,-0.001105,-0.003708,-0.013641,0.003726,-0.005638,0.014125,0.061147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,-0.000385,-0.000511,0.003089,0.002919,-0.013641,-0.001991,0.005739,0.007857,0.000932,0.008805,...,0.022220,-0.001473,-0.001721,-0.000996,0.227124,1.000000,0.203822,-0.006314,-0.008609,-0.005733
AMT_REQ_CREDIT_BUREAU_WEEK,0.000526,-0.003331,0.000586,0.004172,0.003726,0.003964,0.001566,-0.001255,0.003242,-0.000718,...,0.000770,-0.000926,-0.002061,-0.002556,0.006853,0.203822,1.000000,-0.009622,-0.025851,0.017829
AMT_REQ_CREDIT_BUREAU_MON,-0.007918,-0.006479,-0.010693,0.014719,-0.005638,-0.014412,0.039717,0.055058,0.023875,0.053419,...,-0.001303,-0.003055,-0.000640,-0.003034,0.001974,-0.006314,-0.009622,1.000000,-0.033454,-0.005032
AMT_REQ_CREDIT_BUREAU_QRT,-0.002330,-0.044146,0.001663,-0.004944,0.014125,-0.004811,0.014132,0.001748,0.018253,0.001467,...,-0.009800,-0.002342,-0.007210,-0.003213,-0.005860,-0.008609,-0.025851,-0.033454,1.000000,0.089852


In [12]:
# liste les meuilleurs correlations entre les clmns

def get_coor(mat, corr = .75):
    clmn_lst = mat.columns
    exeption = [] 
    val = 0
    for clmn in mat:
        cmd = lambda idx, val: [print("{} -> {} : ".format(clmn, clmn_lst[idx]).ljust(80, ' ') + str(val)),  exeption.append(val)]
        [cmd(idx, val) for idx, val in enumerate(mat[clmn]) if val > corr and val != 1 and val not in exeption]

        

get_coor(total_corr, .99)

APARTMENTS_AVG -> APARTMENTS_MEDI :                                             0.9932438593211869
BASEMENTAREA_AVG -> BASEMENTAREA_MEDI :                                         0.9927247449440134
YEARS_BUILD_AVG -> YEARS_BUILD_MEDI :                                           0.9985788408274076
COMMONAREA_AVG -> COMMONAREA_MEDI :                                             0.9962747620011316
ELEVATORS_AVG -> ELEVATORS_MEDI :                                               0.9951388143651847
ENTRANCES_AVG -> ENTRANCES_MEDI :                                               0.995988870097683
FLOORSMAX_AVG -> FLOORSMAX_MEDI :                                               0.9965775523651844
FLOORSMIN_AVG -> FLOORSMIN_MEDI :                                               0.9972900790581647
LIVINGAPARTMENTS_AVG -> LIVINGAPARTMENTS_MEDI :                                 0.9931787240171308
LIVINGAREA_AVG -> LIVINGAREA_MEDI :                                             0.9938965517102964
OBS_30_CNT_

In [13]:
# liste les meuilleurs corelations par rapport a une clmn

def get_clmn_corr(mat, clmn, corr = 0.1):
    serie = mat[clmn]
    clmn_lst = list(serie.keys())
    cmd = lambda idx, val: print("{} -> {} : ".format(clmn, clmn_lst[idx]).ljust(80, ' ') + str(val))
    [cmd(idx, val) for idx, val in enumerate(mat[clmn]) if abs(val) > corr and val != 1]
    
get_clmn_corr(total_corr, 'TARGET', 0.05)

TARGET -> NAME_EDUCATION_TYPE :                                                 0.05641227873085527
TARGET -> DAYS_BIRTH :                                                          0.05494816738908064
TARGET -> DAYS_EMPLOYED :                                                       0.06657249024742615
TARGET -> REGION_RATING_CLIENT :                                                0.05003362632505174
TARGET -> REGION_RATING_CLIENT_W_CITY :                                         0.05293702444177632
TARGET -> EXT_SOURCE_1 :                                                        -0.09936737411717524
TARGET -> EXT_SOURCE_2 :                                                        -0.1455506138095144
TARGET -> EXT_SOURCE_3 :                                                        -0.14250226945392638


In [14]:
#set(df_train['FONDKAPREMONT_MODE'].fillna("").map(type))

In [15]:
#sorted(list(df.corr()['TARGET']))

In [16]:
from machinelearning import prediction


file=open("machinelearning.py","r",encoding="utf-8")
print(file.read())
file.close()
del file




# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 11:54:10 2020

@author: minimilien
"""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

def score_personnalisé(prediction,realite):
    VP_FP_FN=list(filter(lambda x : (x[0]==0 and x[1]==0)==False ,zip(prediction,realite)))
    #print("Taille de la liste :",len(realite))
    #print("Taille de la liste sans les vrais négatifs:",len(VP_FP_FN))
    VP=len(list(filter(lambda x : x[0]==x[1] ,VP_FP_FN)))
    return round(VP*100/len(VP_FP_FN),2)

def prediction(dataframe,
               target='TARGET',
               models=[LinearSVC(),
                       RandomForestClassifier(n_estimators=750),
                       GradientBoostingClassifier(),
                       LogisticRegression()
                      ]
       

In [17]:
# harmonise les types
df = df.applymap(float)
df['TARGET'] = df['TARGET'].apply(int)

In [None]:
prediction(df)

création des échantillons
Apprentissage des modèles



Liblinear failed to converge, increase the number of iterations.



In [None]:
100 - (df['TARGET'].sum()/df.shape[0])*100