In [17]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
train = pd.read_csv("./home-credit-default-risk/application_train.csv")
test = pd.read_csv("./home-credit-default-risk/application_test.csv")

target = train['TARGET']
features_name = test.columns

In [19]:
submitID = test[['SK_ID_CURR']]

In [20]:
df = train.append(test)

In [21]:
df = df[df['CODE_GENDER'] != 'XNA']
df = df[df['NAME_FAMILY_STATUS'] != 'Unknown']
df = df[df['NAME_INCOME_TYPE'] != 'Maternity leave']

In [22]:
df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
df['DAYS_EMPLOYED_ABNORMAL'] = (df['DAYS_EMPLOYED'] == np.nan)

In [23]:
df['RATIO_DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['RATIO_INCOME_CREDIT'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['RATIO_INCOME_FAM_MEMBERS'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['RATIO_ANNUITY_INCOME'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['RATIO_ANNUITY_CREDIT'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

In [24]:
dfd = pd.get_dummies(df)

In [25]:
train = dfd[dfd['TARGET'].isnull() == False]
test = dfd[dfd['TARGET'].isnull()]
target = train['TARGET']
train = train.drop(columns=['TARGET'])
test = test.drop(columns=['TARGET'])
features_name = test.columns

In [26]:
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

In [27]:
pipe_ada = Pipeline([('imputer', Imputer(strategy = 'median')),
                    ('scaler', StandardScaler()),
                    ('clf', AdaBoostClassifier(n_estimators=100))
                    ])

In [30]:
pipe_ada.fit(train, target)

Pipeline(memory=None,
         steps=[('imputer',
                 Imputer(axis=0, copy=True, missing_values='NaN',
                         strategy='median', verbose=0)),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=100,
                                    random_state=None))],
         verbose=False)

In [31]:
pipe_ada.predict_proba(test)

array([[0.50883821, 0.49116179],
       [0.50569818, 0.49430182],
       [0.5112577 , 0.4887423 ],
       ...,
       [0.5082899 , 0.4917101 ],
       [0.50701893, 0.49298107],
       [0.50322683, 0.49677317]])

In [11]:
scaler = StandardScaler()
imputer = Imputer(strategy = 'median')

In [12]:
train = imputer.fit_transform(train)
test = imputer.transform(test)

train = scaler.fit_transform(train)
test = scaler.transform(test)

In [13]:
target.shape

(307500,)

In [14]:
lr = LogisticRegression()
lr.fit(train, target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:

submit = submitID
submit['TARGET'] = lr.predict_proba(test)[:,1]
submit.to_csv('submit_baseline_lr.csv', index=False)

In [18]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 100, max_depth=15, n_jobs = -1)
random_forest.fit(train, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [19]:
feature_importance = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features_name, 'importance': feature_importance})

In [20]:
feature_importances = feature_importances.set_index('feature')

In [21]:
feature_importances.sort_values(by='importance', ascending=False)[:20]

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
EXT_SOURCE_3,0.084673
EXT_SOURCE_2,0.082805
EXT_SOURCE_1,0.034511
DAYS_EMPLOYED,0.026239
DAYS_BIRTH,0.025634
RATIO_ANNUITY_CREDIT,0.023067
RATIO_DAYS_EMPLOYED,0.022278
DAYS_ID_PUBLISH,0.021069
DAYS_LAST_PHONE_CHANGE,0.020169
DAYS_REGISTRATION,0.018892


In [22]:
submit = submitID
submit['TARGET'] = random_forest.predict_proba(test)[:,1]

In [23]:
submit.to_csv('submit_baseline.csv', index=False)

In [24]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(train, target)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

In [26]:
submit['TARGET'] = clf.predict_proba(test)[:,1]

In [27]:
submit.to_csv('submit_baseline_ada.csv', index=False)

In [2]:
test

NameError: name 'test' is not defined