# Machine Learning

In [1]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

In [2]:
# Read both application files (first CSV column becomes the index),
# then stack the train rows on top of the test rows.
df_train = pd.read_csv(
    "./../data/application_train.csv",
    encoding="utf-8",
    index_col=0,
)
df_test = pd.read_csv(
    "./../data/application_test.csv",
    encoding="utf-8",
    index_col=0,
)
df = pd.concat([df_train, df_test], axis=0)

In [3]:
# Compare the full frame with the subset of rows that contain no NaN at all

df_salvage_drop = df.dropna()
f"ratio: {df_salvage_drop.shape[0]} / {df.shape[0]} = {round(df_salvage_drop.shape[0] / df.shape[0] * 100, 2)} %"

'ratio: 8602 / 356255 = 2.41 %'

In [4]:
# Compute a default (fill) value for each column

def one_line_try(serie):
    """Return the mean of ``serie`` as a float, or NaN when no mean can be computed.

    Parameters
    ----------
    serie : pandas.Series
        Column to summarise.

    Returns
    -------
    float
        ``float(serie.mean())``; ``nan`` when taking the mean or converting it
        to float raises ``TypeError`` or ``ValueError`` (e.g. a column of strings).
    """
    try:
        return float(serie.mean())
    except (TypeError, ValueError):
        # Only "this column has no numeric mean" is swallowed; anything else
        # (KeyboardInterrupt, programming errors, ...) still propagates.
        return float(np.nan)
    
# Map every column name of the NaN-free frame to its default value
default_value_per_clmn = {
    name: one_line_try(values)
    for name, values in df_salvage_drop.items()
}

default_value_per_clmn

{'TARGET': 0.06114857009997675,
 'NAME_CONTRACT_TYPE': nan,
 'CODE_GENDER': nan,
 'FLAG_OWN_CAR': nan,
 'FLAG_OWN_REALTY': nan,
 'CNT_CHILDREN': 0.6025342943501512,
 'AMT_INCOME_TOTAL': 222872.41443850266,
 'AMT_CREDIT': 699998.7016391536,
 'AMT_ANNUITY': 31558.948325970705,
 'AMT_GOODS_PRICE': 633833.7415717276,
 'NAME_TYPE_SUITE': nan,
 'NAME_INCOME_TYPE': nan,
 'NAME_EDUCATION_TYPE': nan,
 'NAME_FAMILY_STATUS': nan,
 'NAME_HOUSING_TYPE': nan,
 'REGION_POPULATION_RELATIVE': 0.023542625668449196,
 'DAYS_BIRTH': -14189.009416414787,
 'DAYS_EMPLOYED': -2299.091374099047,
 'DAYS_REGISTRATION': -4276.979190885841,
 'DAYS_ID_PUBLISH': -2975.926296210184,
 'OWN_CAR_AGE': 11.224133922343642,
 'FLAG_MOBIL': 1.0,
 'FLAG_EMP_PHONE': 0.9997674959311788,
 'FLAG_WORK_PHONE': 0.21111369448965356,
 'FLAG_CONT_MOBILE': 0.9968611950709138,
 'FLAG_PHONE': 0.3093466635666124,
 'FLAG_EMAIL': 0.12566844919786097,
 'OCCUPATION_TYPE': nan,
 'CNT_FAM_MEMBERS': 2.401418274819809,
 'REGION_RATING_CLIENT': 2.00

In [None]:
# Apply the per-column defaults to the missing cells.
# Columns whose default is itself NaN (no numeric mean) are left untouched.
# NOTE(review): 'TARGET' has a default too, so rows with a missing TARGET
# receive the mean label instead of staying unlabeled - confirm this is intended.

usable_defaults = {
    clmn: value
    for clmn, value in default_value_per_clmn.items()
    if pd.notna(value)
}
# Vectorised replacement for the per-cell Python loop and its str(A) != 'nan' check.
df = df.fillna(usable_defaults)
df_cute_drop = df.dropna()
f"ratio: {df_cute_drop.shape[0]} / {df.shape[0]} = {round(df_cute_drop.shape[0] / df.shape[0] * 100, 2)} %"



In [None]:
# Release the two intermediate frames, then keep only the rows without any NaN.
del df_cute_drop, df_salvage_drop
df = df.dropna(axis=0, how="any")

In [None]:
# Split the rows by label and truncate both groups to the size of the smaller one.
df_true = df.loc[df['TARGET'] == 1]
df_false = df.loc[df['TARGET'] != 1]
maxi_val = min(df_false.shape[0], df_true.shape[0])
# Share of rows discarded by the truncation
print(f"loss:  {100 - round(2 * maxi_val / df.shape[0] * 100, 2)} %")
df = pd.concat([df_false.head(maxi_val), df_true.head(maxi_val)])

In [None]:
# For every column, record the distinct Python types found among its values.
colonnes = list(df.columns)
types_colonnes = [
    {name: list(set(df[name].map(lambda cell: str(type(cell)))))}
    for name in colonnes
]

In [None]:
# Keep the entries whose only observed value type is str.
colonnes_str = [
    entry
    for entry in types_colonnes
    if next(iter(entry.values())) == ["<class 'str'>"]
]

In [None]:
def vectorisation(col):
    """Replace each distinct value of ``col`` by an integer code.

    Codes are assigned in order of first appearance (0, 1, 2, ...), so the
    encoding is the same on every run. Building the vocabulary from a ``set``
    made the codes depend on hash ordering, which is not stable for strings.

    Parameters
    ----------
    col : pandas.Series
        Column of hashable values to encode.

    Returns
    -------
    pandas.Series
        Same index as ``col``, each value replaced by its integer code.
    """
    # dict.fromkeys keeps one entry per distinct value, in insertion order.
    codes = {value: code for code, value in enumerate(dict.fromkeys(col))}
    return col.map(lambda x: codes[x])



# Integer-encode every column that was identified as holding only strings.
for entry in colonnes_str:
    name = next(iter(entry))
    df[name] = vectorisation(df[name])

In [None]:
# Show the current state of df as the cell output.
df

In [None]:
# Correlation matrix over all columns

total_corr = df.corr(method="pearson")
total_corr

In [None]:
# List the strongest correlations between pairs of columns

def get_coor(mat, corr = .75):
    """Print every distinct pair of columns whose correlation exceeds ``corr``.

    Each unordered pair is reported once, as "<earlier column> -> <later column>",
    by looking only at the rows located after the column's own position. This
    skips the diagonal and the symmetric duplicate by position rather than by
    value, so two different pairs sharing the same coefficient are both listed.

    Parameters
    ----------
    mat : pandas.DataFrame
        Square correlation matrix; rows are expected to follow the same order
        as the columns (as produced by ``DataFrame.corr()``).
    corr : float, default 0.75
        Threshold; only coefficients strictly greater than it are printed.
        Negative correlations are therefore never reported.

    Returns
    -------
    None
        Results are printed, one pair per line.
    """
    clmn_lst = list(mat.columns)
    for position, clmn in enumerate(clmn_lst):
        values = mat[clmn].tolist()
        for idx in range(position + 1, len(values)):
            val = values[idx]
            if val > corr:
                print("{} -> {} : ".format(clmn, clmn_lst[idx]).ljust(80, ' ') + str(val))

        

# Report only the near-duplicate column pairs (correlation above 0.99).
get_coor(total_corr, corr=0.99)

In [None]:
# List the strongest correlations with respect to one column

def get_clmn_corr(mat, clmn, corr = 0.1):
    """Print the entries of column ``clmn`` of ``mat`` that are strongly correlated.

    An entry is printed when its absolute value is strictly greater than
    ``corr`` and the value is not exactly 1.

    Parameters
    ----------
    mat : pandas.DataFrame
        Correlation matrix.
    clmn : label
        Column of ``mat`` to inspect.
    corr : float, default 0.1
        Threshold applied to the absolute value of each coefficient.

    Returns
    -------
    None
        Results are printed, one line per retained entry.
    """
    for other, coefficient in mat[clmn].items():
        if abs(coefficient) > corr and coefficient != 1:
            label = "{} -> {} : ".format(clmn, other)
            print(label.ljust(80, ' ') + str(coefficient))
    
# Columns whose correlation with the label exceeds 0.05 in absolute value.
get_clmn_corr(total_corr, clmn='TARGET', corr=0.05)

In [None]:
#set(df_train['FONDKAPREMONT_MODE'].fillna("").map(type))

In [None]:
#sorted(list(df.corr()['TARGET']))

In [None]:
from machinelearning import prediction


# Open and immediately close the module source; printing it is left disabled.
with open("machinelearning.py", "r", encoding="utf-8") as file:
    #print(file.read())
    pass
del file

In [None]:
# Harmonise the dtypes: every column as float, then the label back to int.
# astype is the vectorised replacement for the deprecated element-wise applymap.
df = df.astype(float)
df['TARGET'] = df['TARGET'].astype(int)

In [None]:
# Evaluate the candidate classifiers, all seeded with the same random_state.
prediction(
    df,
    models=[
        {"modèle": estimator, "paramètres": params}
        for estimator, params in (
            (LinearSVC, {"random_state": 44}),
            (RandomForestClassifier, {"n_estimators": 750, "random_state": 44}),
            (GradientBoostingClassifier, {"random_state": 44}),
            (LogisticRegression, {"random_state": 44}),
        )
    ],
)

In [None]:
# Percentage of rows whose TARGET is not counted in the sum
100 - 100 * (df['TARGET'].sum() / len(df))

In [None]:
# Count the rows per TARGET value; dropna=False also counts missing labels.
df['TARGET'].value_counts(dropna=False)

In [None]:
# Hold out a test set from the prepared frame so this cell runs on a fresh kernel.
# NOTE(review): the split settings (default test size, seed 44 as used for the
# other models) are an assumption - align them with what `prediction` uses.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['TARGET']),
    df['TARGET'],
    random_state=44,
)

model = XGBClassifier()
model.fit(X_train, y_train)
print(model)

preds = model.predict(X_test)
# RMSE between the true labels and the predicted labels
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))