# Machine Learning

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
import os
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
df_train=pd.read_csv("./../data/application_train.csv",encoding="utf-8",index_col=0)
df_test=pd.read_csv("./../data/application_test.csv",encoding="utf-8",index_col=0)
df = pd.concat([df_train,df_test])

In [3]:
# comparaison entre df et df sans les nan

df_salvage_drop=df.dropna()
' '.join(['ratio:' , str(df_salvage_drop.shape[0]) ,'/' , str(df.shape[0]), '=' , str(round(df_salvage_drop.shape[0]/df.shape[0]*100,2)), '%'])

'ratio: 8602 / 356255 = 2.41 %'

In [4]:
# liste les valeurs par default de chaque clmn

def one_line_try(serie):
    try:
        return float(serie.mean())
    except:
        return float(np.nan)
    
default_value_per_clmn = {clmn : one_line_try(df_salvage_drop[clmn]) for clmn in df_salvage_drop}

default_value_per_clmn

{'TARGET': 0.06114857009997675,
 'NAME_CONTRACT_TYPE': nan,
 'CODE_GENDER': nan,
 'FLAG_OWN_CAR': nan,
 'FLAG_OWN_REALTY': nan,
 'CNT_CHILDREN': 0.6025342943501512,
 'AMT_INCOME_TOTAL': 222872.41443850266,
 'AMT_CREDIT': 699998.7016391536,
 'AMT_ANNUITY': 31558.948325970705,
 'AMT_GOODS_PRICE': 633833.7415717276,
 'NAME_TYPE_SUITE': nan,
 'NAME_INCOME_TYPE': nan,
 'NAME_EDUCATION_TYPE': nan,
 'NAME_FAMILY_STATUS': nan,
 'NAME_HOUSING_TYPE': nan,
 'REGION_POPULATION_RELATIVE': 0.023542625668449196,
 'DAYS_BIRTH': -14189.009416414787,
 'DAYS_EMPLOYED': -2299.091374099047,
 'DAYS_REGISTRATION': -4276.979190885841,
 'DAYS_ID_PUBLISH': -2975.926296210184,
 'OWN_CAR_AGE': 11.224133922343642,
 'FLAG_MOBIL': 1.0,
 'FLAG_EMP_PHONE': 0.9997674959311788,
 'FLAG_WORK_PHONE': 0.21111369448965356,
 'FLAG_CONT_MOBILE': 0.9968611950709138,
 'FLAG_PHONE': 0.3093466635666124,
 'FLAG_EMAIL': 0.12566844919786097,
 'OCCUPATION_TYPE': nan,
 'CNT_FAM_MEMBERS': 2.401418274819809,
 'REGION_RATING_CLIENT': 2.00

In [5]:
# applique ces valeur aux clmn

for clmn in df:
    df[clmn]= [A if str(A) != 'nan' else default_value_per_clmn[clmn] for A in df[clmn]]
df_cute_drop = df.dropna()
' '.join(['ratio:' , str(df_cute_drop.shape[0]) ,'/' , str(df.shape[0]), '=' ,str(round(df_cute_drop.shape[0]/df.shape[0]*100,2)), '%'])



'ratio: 76257 / 356255 = 21.41 %'

In [6]:
del df_cute_drop
del df_salvage_drop
df = df.dropna()

In [7]:
df_true = df[df['TARGET'] == 1]
df_false = df[df['TARGET'] != 1]
maxi_val = min([df_false.shape[0], df_true.shape[0]])
print(' '.join(['loss: ',str(100 - round(2*maxi_val/df.shape[0]*100,2)), '%']))
df = pd.concat([df_false.iloc[:maxi_val],df_true.iloc[:maxi_val]])

loss:  86.92 %


In [8]:
colonnes=[colonne for colonne in df]
types_colonnes=list(map(lambda x : {x : list(set(df[x].map(lambda x : str(type(x)))))} , colonnes))

In [9]:
colonnes_str=list(filter(lambda x : x[list(x)[0]]==["<class 'str'>"], types_colonnes))

In [10]:
def vectorisation(col):
    voc=list(set(list(col)))
    val={voc[i]:i for i in range(len(voc))}
    return col.map(lambda x : val[x])



for i in [list(i)[0] for i in colonnes_str]:
    df[i]=vectorisation(df[i])

In [11]:
df

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100003,0.0,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
100017,0.0,0,1,1,0,1,225000.0,918468.0,28966.5,697500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000
100018,0.0,0,0,0,1,0,189000.0,773680.5,32778.0,679500.0,...,0,0,0,0,0.005813,0.005929,0.039061,0.368984,0.25808,1.757615
100022,0.0,1,0,0,1,0,112500.0,157500.0,7875.0,157500.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
100024,0.0,1,1,1,1,0,135000.0,427500.0,21375.0,427500.0,...,0,0,0,0,0.005813,0.005929,0.039061,0.368984,0.25808,1.757615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456140,1.0,0,0,1,1,1,261000.0,711454.5,47673.0,643500.0,...,0,0,0,0,0.000000,0.000000,0.000000,1.000000,0.00000,1.000000
456176,1.0,0,1,0,1,0,171000.0,1436850.0,42142.5,1125000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,1.00000,2.000000
456184,1.0,0,1,0,0,0,270000.0,900000.0,40671.0,900000.0,...,0,0,0,0,0.005813,0.005929,0.039061,0.368984,0.25808,1.757615
456186,1.0,0,1,0,0,1,207000.0,450000.0,32746.5,450000.0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.00000,2.000000


In [12]:
# matrice de correlation

total_corr = df.corr()
total_corr

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
TARGET,1.000000,-0.076368,0.083076,-0.045207,-0.003037,0.008166,-0.065584,-0.089962,-0.062395,-0.109504,...,-0.010269,-3.183394e-16,-0.010015,-2.151853e-17,-0.003664,0.013607,-0.009777,-0.028425,0.000195,0.053050
NAME_CONTRACT_TYPE,-0.076368,1.000000,-0.023958,-0.016032,0.086474,0.018623,-0.038181,-0.241676,-0.280797,-0.203955,...,0.018823,-6.335111e-03,-0.006335,7.757107e-02,-0.012302,-0.006991,-0.013452,-0.001101,-0.024344,-0.067508
CODE_GENDER,0.083076,-0.023958,1.000000,0.311111,-0.038137,-0.027431,0.164867,-0.002558,0.064968,-0.003640,...,0.010392,3.810472e-03,-0.006384,2.964151e-02,0.005548,0.009882,-0.009074,0.017880,-0.009070,-0.010620
FLAG_OWN_CAR,-0.045207,-0.016032,0.311111,1.000000,0.007561,0.060699,0.200192,0.109487,0.140540,0.118180,...,-0.017089,-3.911865e-03,0.006639,8.131844e-03,0.004752,0.017278,0.013739,0.008165,0.006615,-0.029415
FLAG_OWN_REALTY,-0.003037,0.086474,-0.038137,0.007561,1.000000,0.036736,0.010766,-0.048269,-0.010102,-0.054938,...,-0.098408,-8.258858e-03,-0.029925,-1.011601e-02,-0.003508,-0.015120,-0.001548,0.007308,0.013388,0.057511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,0.013607,-0.006991,0.009882,0.017278,-0.015120,-0.005123,0.003018,0.028402,0.019600,0.028599,...,-0.006650,-1.488258e-03,-0.001488,-1.524181e-03,0.189249,1.000000,0.145615,-0.014631,-0.009983,-0.010331
AMT_REQ_CREDIT_BUREAU_WEEK,-0.009777,-0.013452,-0.009074,0.013739,-0.001548,0.004504,0.010109,0.015039,0.022610,0.017071,...,-0.006119,-3.854823e-03,-0.003855,-3.871510e-03,0.017105,0.145615,1.000000,-0.019023,-0.008147,0.015091
AMT_REQ_CREDIT_BUREAU_MON,-0.028425,-0.001101,0.017880,0.008165,0.007308,-0.014753,0.054454,0.060553,0.031912,0.060188,...,0.006661,-6.818680e-03,-0.006819,-6.724951e-03,-0.008424,-0.014631,-0.019023,1.000000,-0.009332,-0.007068
AMT_REQ_CREDIT_BUREAU_QRT,0.000195,-0.024344,-0.009070,0.006615,0.013388,-0.035051,0.008259,0.004844,0.006417,0.009166,...,-0.001571,8.186572e-03,0.016808,-9.276451e-03,-0.002873,-0.009983,-0.008147,-0.009332,1.000000,0.098200


In [13]:
# liste les meuilleurs correlations entre les clmns

def get_coor(mat, corr = .75):
    clmn_lst = mat.columns
    exeption = [] 
    val = 0
    for clmn in mat:
        cmd = lambda idx, val: [print("{} -> {} : ".format(clmn, clmn_lst[idx]).ljust(80, ' ') + str(val)),  exeption.append(val)]
        [cmd(idx, val) for idx, val in enumerate(mat[clmn]) if val > corr and val != 1 and val not in exeption]

        

get_coor(total_corr, .99)

APARTMENTS_AVG -> APARTMENTS_MEDI :                                             0.9923374126876903
BASEMENTAREA_AVG -> BASEMENTAREA_MEDI :                                         0.9979732480176327
YEARS_BUILD_AVG -> YEARS_BUILD_MEDI :                                           0.9981402495466494
COMMONAREA_AVG -> COMMONAREA_MEDI :                                             0.9918264904546211
ELEVATORS_AVG -> ELEVATORS_MEDI :                                               0.9951236464376788
ENTRANCES_AVG -> ENTRANCES_MEDI :                                               0.9960679143905282
FLOORSMAX_AVG -> FLOORSMAX_MEDI :                                               0.9963645514588235
FLOORSMIN_AVG -> FLOORSMIN_MEDI :                                               0.9972851076606262
LIVINGAPARTMENTS_AVG -> LIVINGAPARTMENTS_MEDI :                                 0.9980351667196324
LIVINGAREA_AVG -> LIVINGAREA_MEDI :                                             0.9932679423982816
NONLIVINGA

In [14]:
# liste les meuilleurs corelations par rapport a une clmn

def get_clmn_corr(mat, clmn, corr = 0.1):
    serie = mat[clmn]
    clmn_lst = list(serie.keys())
    cmd = lambda idx, val: print("{} -> {} : ".format(clmn, clmn_lst[idx]).ljust(80, ' ') + str(val))
    [cmd(idx, val) for idx, val in enumerate(mat[clmn]) if abs(val) > corr and val != 1]
    
get_clmn_corr(total_corr, 'TARGET', 0.05)

TARGET -> NAME_CONTRACT_TYPE :                                                  -0.07636751580205459
TARGET -> CODE_GENDER :                                                         0.08307585314842651
TARGET -> AMT_INCOME_TOTAL :                                                    -0.06558425201674299
TARGET -> AMT_CREDIT :                                                          -0.08996211842134605
TARGET -> AMT_ANNUITY :                                                         -0.06239487082565294
TARGET -> AMT_GOODS_PRICE :                                                     -0.10950448169476901
TARGET -> NAME_INCOME_TYPE :                                                    0.08514864460693324
TARGET -> NAME_EDUCATION_TYPE :                                                 -0.11305836445747836
TARGET -> REGION_POPULATION_RELATIVE :                                          -0.07190882677726608
TARGET -> DAYS_BIRTH :                                                          0.13225816160

In [15]:
#set(df_train['FONDKAPREMONT_MODE'].fillna("").map(type))

In [16]:
#sorted(list(df.corr()['TARGET']))

In [17]:
from machinelearning import prediction


file=open("machinelearning.py","r",encoding="utf-8")
#print(file.read())
file.close()
del file

In [18]:
# harmonise les types
df = df.applymap(float)
df['TARGET'] = df['TARGET'].apply(int)

  and should_run_async(code)


In [19]:
prediction(df,models=[{"modèle":LinearSVC,"paramètres":{"random_state":44}},
                       {"modèle":RandomForestClassifier,"paramètres":{"n_estimators":750,"random_state":44}},
                       {"modèle":GradientBoostingClassifier,"paramètres":{"random_state":44}},
                       {"modèle":LogisticRegression,"paramètres":{"random_state":44}},
                      {"modèle":XGBClassifier,"paramètres":{}}
                     ])

création des échantillons
Apprentissage des modèles




La précision du modèle LinearSVC(random_state=44) est : 59.3%
La précision du modèle RandomForestClassifier(n_estimators=750, random_state=44) est : 77.59%
La précision du modèle GradientBoostingClassifier(random_state=44) est : 73.94%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


La précision du modèle LogisticRegression(random_state=44) est : 57.66%
La précision du modèle XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) est : 76.86%
