# Import packages

In [1]:
import os
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression

# Get the main training and testing data

In [2]:
data_loc = r"D:\git_personal\Home-Credit-Default-Risk\data/"
app_train = pd.read_csv(data_loc + 'application_train.csv')
app_test = pd.read_csv(data_loc + 'application_test.csv')

# EDA

## Check data size

In [25]:
print('Size of test dataset is ', app_train.shape)
app_train.head()

Size of test dataset is  (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
print('Size of test dataset is ', app_test.shape)
app_test.head()

Size of test dataset is  (48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


## Check distribution of response variable

In [28]:
app_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

## Check missing values

In [39]:
train_missing_counts = app_train.isnull().sum()
print('In training dataset, there are total of', len(train_missing_counts[train_missing_counts!=0]), 
      'out of', app_train.shape[1], 'columns having missing values')

In training dataset, there are total of 67 out of 122 columns having missing values


In [40]:
test_missing_counts = app_test.isnull().sum()
print('In test dataset, there are total of', len(test_missing_counts[test_missing_counts!=0]), 
      'out of', app_test.shape[1], 'columns having missing values')

In test dataset, there are total of 64 out of 121 columns having missing values


In [54]:
print('Check value type of categorical variables to see if there are any ordinal variables')
categorical_vals = app_train.dtypes[app_train.dtypes=='object'].index.tolist()
for val in categorical_vals:
    print(app_train[val].value_counts())

Check value type of categorical variables to see if there are any ordinal variables
Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64
F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64
N    202924
Y    104587
Name: FLAG_OWN_CAR, dtype: int64
Y    213312
N     94199
Name: FLAG_OWN_REALTY, dtype: int64
Unaccompanied      248526
Family              40149
Spouse, partner     11370
Children             3267
Other_B              1770
Other_A               866
Group of people       271
Name: NAME_TYPE_SUITE, dtype: int64
Working                 158774
Commercial associate     71617
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: NAME_INCOME_TYPE, dtype: int64
Secondary / secondary special    218391
Higher education                  74863
Incomplete higher                 10277
Lower secon

## Create dummies for categorical variables

In [3]:
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

In [57]:
print('Training data after creating dummies:', app_train.shape)
print('Test data after creating dummies:', app_test.shape)

Training data after creating dummies: (307511, 246)
Test data after creating dummies: (48744, 242)


### Align Train and Test data so they have the same set of features

In [4]:
# take target out
response = app_train['TARGET']
# align
app_train, app_test = app_train.align(app_test, join='inner', axis = 1)
# add target back to train
app_train['TARGET'] = response

In [65]:
print('Training data after aligning:', app_train.shape)
print('Test data after aligning:', app_test.shape)

Training data after aligning: (307511, 243)
Test data after aligning: (48744, 242)


## Check on correlation

In [74]:
correlations = app_train.corr()['TARGET'].abs().sort_values(ascending=False)
correlations.head(10)

TARGET                                  1.000000
EXT_SOURCE_3                            0.178919
EXT_SOURCE_2                            0.160472
EXT_SOURCE_1                            0.155317
DAYS_BIRTH                              0.078239
REGION_RATING_CLIENT_W_CITY             0.060893
REGION_RATING_CLIENT                    0.058899
NAME_INCOME_TYPE_Working                0.057481
NAME_EDUCATION_TYPE_Higher education    0.056593
DAYS_LAST_PHONE_CHANGE                  0.055218
Name: TARGET, dtype: float64

# Simple Model Building

## Logistic Regression

In [5]:
train = app_train.drop(columns=['TARGET'])
test = app_test.copy()

# missing imputation
imputer = Imputer(strategy = 'median')
imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

NotFittedError: This MinMaxScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [6]:
# scale variables to [0,1]
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# training
lr = LogisticRegression()
lr.fit(train, response)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
test_pred = lr.predict_proba(test)[:,1]

In [21]:
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = test_pred
submit.head()
submit.to_csv(data_loc+'toy_sub.csv', index = False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [1]:
list('abcd')

['a', 'b', 'c', 'd']