## Importing Necessary libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier
from collections import Counter

## Loading Dataset

In [2]:
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")

In [3]:
train.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,450407,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0,0
1,271298,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,...,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,0
2,122238,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,...,0,0,0,0.0,0.0,0.0,2.0,0.0,1.0,0
3,305311,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0
4,414121,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0


In [4]:
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294,Cash loans,F,N,Y,0,180000.0,265306.5,25317.0,252000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,2.0,5.0
1,439847,Cash loans,F,N,Y,0,202500.0,346500.0,21069.0,346500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562,Cash loans,M,Y,N,0,360000.0,545040.0,36553.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,5.0
3,407238,Cash loans,F,N,Y,0,135000.0,307557.0,20682.0,265500.0,...,0,0,0,0,0.0,0.0,1.0,0.0,1.0,2.0
4,239910,Cash loans,F,N,Y,0,157500.0,1056447.0,31018.5,922500.0,...,0,0,0,0,,,,,,


## Removing Columns having null values greater than 60%

In [5]:
train_data = train.dropna(axis = 1, thresh = 0.4 * (len(train)))
test_data = test.dropna(axis = 1, thresh = 0.4 * (len(test)))

In [6]:
train_data.shape

(184506, 105)

In [7]:
test_data.shape

(123005, 104)

In [8]:
y_train = train_data['TARGET']
train_data = train_data.drop('TARGET', axis = 1)

## Detecting and Replacing outliers (Flooring and Caping)

In [9]:
Q1 = train_data.quantile(0.25)
Q3 = train_data.quantile(0.75)
IQR = Q3 - Q1

### Detecting train_data outliers

In [10]:
train_cols = train_data.dtypes != object
train_cols = train_data.columns[train_cols].tolist()
for i in range(len(train_cols)):
    q1 = Q1[train_cols[i]]
    q3 = Q3[train_cols[i]]
    iqr = IQR[train_cols[i]]
    outliers = train_data[((train_data[train_cols[i]] < (q1 - 1.5 * iqr)) | (train_data[train_cols[i]] > (q3 + 1.5 * iqr)))]
    print("{} : {}\n".format(train_cols[i], outliers.shape[0]))

SK_ID_CURR : 0

CNT_CHILDREN : 2502

AMT_INCOME_TOTAL : 8393

AMT_CREDIT : 3936

AMT_ANNUITY : 4464

AMT_GOODS_PRICE : 8914

REGION_POPULATION_RELATIVE : 5009

DAYS_BIRTH : 0

DAYS_EMPLOYED : 43402

DAYS_REGISTRATION : 373

DAYS_ID_PUBLISH : 0

FLAG_MOBIL : 1

FLAG_EMP_PHONE : 33239

FLAG_WORK_PHONE : 36738

FLAG_CONT_MOBILE : 361

FLAG_PHONE : 0

FLAG_EMAIL : 10450

CNT_FAM_MEMBERS : 2360

REGION_RATING_CLIENT : 48161

REGION_RATING_CLIENT_W_CITY : 46695

HOUR_APPR_PROCESS_START : 1346

REG_REGION_NOT_LIVE_REGION : 2800

REG_REGION_NOT_WORK_REGION : 9465

LIVE_REGION_NOT_WORK_REGION : 7597

REG_CITY_NOT_LIVE_CITY : 14535

REG_CITY_NOT_WORK_CITY : 42496

LIVE_CITY_NOT_WORK_CITY : 33074

EXT_SOURCE_1 : 0

EXT_SOURCE_2 : 0

EXT_SOURCE_3 : 0

APARTMENTS_AVG : 6312

BASEMENTAREA_AVG : 4336

YEARS_BEGINEXPLUATATION_AVG : 2897

ELEVATORS_AVG : 6168

ENTRANCES_AVG : 2326

FLOORSMAX_AVG : 3149

LANDAREA_AVG : 4140

LIVINGAREA_AVG : 7440

NONLIVINGAREA_AVG : 10015

APARTMENTS_MODE : 6163

BASEM

### Replacing train_data outliers

In [11]:
for i in range(len(train_cols)):
    q1 = Q1[train_cols[i]]
    q3 = Q3[train_cols[i]]
    iqr = IQR[train_cols[i]]
    uw = q3 + 1.5 * iqr
    lw = q1 - 1.5 * iqr
    train_data[train_cols[i]] = np.where(train_data[train_cols[i]] > uw, uw, train_data[train_cols[i]])
    train_data[train_cols[i]] = np.where(train_data[train_cols[i]] < lw, lw, train_data[train_cols[i]])
    outliers = train_data[((train_data[train_cols[i]] < (q1 - 1.5 * iqr)) | (train_data[train_cols[i]] > (q3 + 1.5 * iqr)))]
    print("{} : {}\n".format(train_cols[i], outliers.shape[0]))

SK_ID_CURR : 0

CNT_CHILDREN : 0

AMT_INCOME_TOTAL : 0

AMT_CREDIT : 0

AMT_ANNUITY : 0

AMT_GOODS_PRICE : 0

REGION_POPULATION_RELATIVE : 0

DAYS_BIRTH : 0

DAYS_EMPLOYED : 0

DAYS_REGISTRATION : 0

DAYS_ID_PUBLISH : 0

FLAG_MOBIL : 0

FLAG_EMP_PHONE : 0

FLAG_WORK_PHONE : 0

FLAG_CONT_MOBILE : 0

FLAG_PHONE : 0

FLAG_EMAIL : 0

CNT_FAM_MEMBERS : 0

REGION_RATING_CLIENT : 0

REGION_RATING_CLIENT_W_CITY : 0

HOUR_APPR_PROCESS_START : 0

REG_REGION_NOT_LIVE_REGION : 0

REG_REGION_NOT_WORK_REGION : 0

LIVE_REGION_NOT_WORK_REGION : 0

REG_CITY_NOT_LIVE_CITY : 0

REG_CITY_NOT_WORK_CITY : 0

LIVE_CITY_NOT_WORK_CITY : 0

EXT_SOURCE_1 : 0

EXT_SOURCE_2 : 0

EXT_SOURCE_3 : 0

APARTMENTS_AVG : 0

BASEMENTAREA_AVG : 0

YEARS_BEGINEXPLUATATION_AVG : 0

ELEVATORS_AVG : 0

ENTRANCES_AVG : 0

FLOORSMAX_AVG : 0

LANDAREA_AVG : 0

LIVINGAREA_AVG : 0

NONLIVINGAREA_AVG : 0

APARTMENTS_MODE : 0

BASEMENTAREA_MODE : 0

YEARS_BEGINEXPLUATATION_MODE : 0

ELEVATORS_MODE : 0

ENTRANCES_MODE : 0

FLOORSMAX_MO

In [12]:
Q1_t = test_data.quantile(0.25)
Q3_t = test_data.quantile(0.75)
IQR_t = Q3_t - Q1_t

### Detecting test_data outliers

In [13]:
test_cols = test_data.dtypes != object
test_cols = test_data.columns[test_cols].tolist()
for i in range(len(train_cols)):
    q1 = Q1_t[test_cols[i]]
    q3 = Q3_t[test_cols[i]]
    iqr = IQR_t[test_cols[i]]
    outliers = test_data[((test_data[train_cols[i]] < (q1 - 1.5 * iqr)) | (test_data[train_cols[i]] > (q3 + 1.5 * iqr)))]
    print("{} : {}\n".format(test_cols[i], outliers.shape[0]))

SK_ID_CURR : 0

CNT_CHILDREN : 1770

AMT_INCOME_TOTAL : 5642

AMT_CREDIT : 2626

AMT_ANNUITY : 3043

AMT_GOODS_PRICE : 5814

REGION_POPULATION_RELATIVE : 3403

DAYS_BIRTH : 0

DAYS_EMPLOYED : 28820

DAYS_REGISTRATION : 287

DAYS_ID_PUBLISH : 0

FLAG_MOBIL : 0

FLAG_EMP_PHONE : 22147

FLAG_WORK_PHONE : 24570

FLAG_CONT_MOBILE : 213

FLAG_PHONE : 0

FLAG_EMAIL : 6992

CNT_FAM_MEMBERS : 1647

REGION_RATING_CLIENT : 32366

REGION_RATING_CLIENT_W_CITY : 31332

HOUR_APPR_PROCESS_START : 911

REG_REGION_NOT_LIVE_REGION : 1857

REG_REGION_NOT_WORK_REGION : 6147

LIVE_REGION_NOT_WORK_REGION : 4906

REG_CITY_NOT_LIVE_CITY : 9504

REG_CITY_NOT_WORK_CITY : 28371

LIVE_CITY_NOT_WORK_CITY : 22141

EXT_SOURCE_1 : 0

EXT_SOURCE_2 : 0

EXT_SOURCE_3 : 0

APARTMENTS_AVG : 4343

BASEMENTAREA_AVG : 2867

YEARS_BEGINEXPLUATATION_AVG : 1887

ELEVATORS_AVG : 4252

ENTRANCES_AVG : 1556

FLOORSMAX_AVG : 2066

LANDAREA_AVG : 2748

LIVINGAREA_AVG : 5074

NONLIVINGAREA_AVG : 6547

APARTMENTS_MODE : 4071

BASEMENTA

### Replacing Test Data outliers

In [14]:
for i in range(len(train_cols)):
    q1 = Q1_t[test_cols[i]]
    q3 = Q3_t[test_cols[i]]
    iqr = IQR_t[test_cols[i]]
    uw = q3 + 1.5 * iqr
    lw = q1 - 1.5 * iqr
    test_data[test_cols[i]] = np.where(test_data[test_cols[i]] > uw, uw, test_data[test_cols[i]])
    test_data[test_cols[i]] = np.where(test_data[test_cols[i]] < lw, lw, test_data[test_cols[i]])
    outliers = test_data[((test_data[test_cols[i]] < (q1 - 1.5 * iqr)) | (test_data[test_cols[i]] > (q3 + 1.5 * iqr)))]
    print("{} : {}\n".format(train_cols[i], outliers.shape[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[test_cols[i]] = np.where(test_data[test_cols[i]] > uw, uw, test_data[test_cols[i]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[test_cols[i]] = np.where(test_data[test_cols[i]] < lw, lw, test_data[test_cols[i]])


SK_ID_CURR : 0

CNT_CHILDREN : 0

AMT_INCOME_TOTAL : 0

AMT_CREDIT : 0

AMT_ANNUITY : 0

AMT_GOODS_PRICE : 0

REGION_POPULATION_RELATIVE : 0

DAYS_BIRTH : 0

DAYS_EMPLOYED : 0

DAYS_REGISTRATION : 0

DAYS_ID_PUBLISH : 0

FLAG_MOBIL : 0

FLAG_EMP_PHONE : 0

FLAG_WORK_PHONE : 0

FLAG_CONT_MOBILE : 0

FLAG_PHONE : 0

FLAG_EMAIL : 0

CNT_FAM_MEMBERS : 0

REGION_RATING_CLIENT : 0

REGION_RATING_CLIENT_W_CITY : 0

HOUR_APPR_PROCESS_START : 0

REG_REGION_NOT_LIVE_REGION : 0

REG_REGION_NOT_WORK_REGION : 0

LIVE_REGION_NOT_WORK_REGION : 0

REG_CITY_NOT_LIVE_CITY : 0

REG_CITY_NOT_WORK_CITY : 0

LIVE_CITY_NOT_WORK_CITY : 0

EXT_SOURCE_1 : 0

EXT_SOURCE_2 : 0

EXT_SOURCE_3 : 0

APARTMENTS_AVG : 0

BASEMENTAREA_AVG : 0

YEARS_BEGINEXPLUATATION_AVG : 0

ELEVATORS_AVG : 0

ENTRANCES_AVG : 0

FLOORSMAX_AVG : 0

LANDAREA_AVG : 0

LIVINGAREA_AVG : 0

NONLIVINGAREA_AVG : 0

APARTMENTS_MODE : 0

BASEMENTAREA_MODE : 0

YEARS_BEGINEXPLUATATION_MODE : 0

ELEVATORS_MODE : 0

ENTRANCES_MODE : 0

FLOORSMAX_MO

## Filling missing values

In [15]:
pd.DataFrame(train_data.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
LANDAREA_AVG,109543
LANDAREA_MODE,109543
LANDAREA_MEDI,109543
BASEMENTAREA_MODE,107975
BASEMENTAREA_AVG,107975
...,...
REGION_POPULATION_RELATIVE,0
NAME_HOUSING_TYPE,0
NAME_FAMILY_STATUS,0
NAME_EDUCATION_TYPE,0


### missing values of numeric columns with mean

In [16]:
for i in range(len(train_cols)):
    mean = train_data[train_cols[i]].mean()
    train_data[train_cols[i]].fillna(value = mean, inplace = True)

In [17]:
pd.DataFrame(test_data.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
LANDAREA_AVG,73047
LANDAREA_MODE,73047
LANDAREA_MEDI,73047
BASEMENTAREA_MODE,71968
BASEMENTAREA_AVG,71968
...,...
REGION_POPULATION_RELATIVE,0
NAME_HOUSING_TYPE,0
NAME_FAMILY_STATUS,0
NAME_EDUCATION_TYPE,0


In [18]:
for i in range(len(test_cols)):
    mean = test_data[test_cols[i]].mean()
    test_data[train_cols[i]].fillna(value = mean, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[train_cols[i]].fillna(value = mean, inplace = True)


### missing values of categorical columns with mode

In [19]:
train_data = train_data.apply(lambda x:x.fillna(x.value_counts().index[0]))
test_data = test_data.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [20]:
pd.DataFrame(train_data.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
SK_ID_CURR,0
NAME_CONTRACT_TYPE,0
DEF_60_CNT_SOCIAL_CIRCLE,0
OBS_60_CNT_SOCIAL_CIRCLE,0
DEF_30_CNT_SOCIAL_CIRCLE,0
...,...
HOUR_APPR_PROCESS_START,0
WEEKDAY_APPR_PROCESS_START,0
REGION_RATING_CLIENT_W_CITY,0
REGION_RATING_CLIENT,0


In [21]:
pd.DataFrame(test_data.isnull().sum().sort_values(ascending = False))

Unnamed: 0,0
SK_ID_CURR,0
NAME_CONTRACT_TYPE,0
DEF_60_CNT_SOCIAL_CIRCLE,0
OBS_60_CNT_SOCIAL_CIRCLE,0
DEF_30_CNT_SOCIAL_CIRCLE,0
...,...
HOUR_APPR_PROCESS_START,0
WEEKDAY_APPR_PROCESS_START,0
REGION_RATING_CLIENT_W_CITY,0
REGION_RATING_CLIENT,0


## Label Encoding of categorical columns

In [22]:
cat_train_cols = train_data.dtypes == object
cat_train_cols = train_data.columns[cat_train_cols].tolist()

encoder = LabelEncoder()
for i in range(len(cat_train_cols)):
    train_data[cat_train_cols[i]] = encoder.fit_transform(train_data[cat_train_cols[i]])

In [23]:
cat_test_cols = test_data.dtypes == object
cat_test_cols = test_data.columns[cat_test_cols].tolist()

encoder = LabelEncoder()
for i in range(len(cat_test_cols)):
    test_data[cat_test_cols[i]] = encoder.fit_transform(test_data[cat_test_cols[i]])

In [24]:
train_data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,450407.0,0,0,0,1,1.0,67500.0,227520.0,11065.5,180000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,271298.0,0,1,1,1,1.0,247500.0,1616625.0,61656.75,1341000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,122238.0,0,1,1,1,1.0,180000.0,101880.0,10827.0,90000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,305311.0,0,1,0,0,0.0,81000.0,405000.0,20677.5,405000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,414121.0,0,0,0,1,0.0,157500.0,888840.0,29506.5,675000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [25]:
test_data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294.0,0,0,0,1,0.0,180000.0,265306.5,25317.0,252000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,439847.0,0,0,0,1,0.0,202500.0,346500.0,21069.0,346500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562.0,0,1,1,0,0.0,337500.0,545040.0,36553.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,407238.0,0,0,0,1,0.0,135000.0,307557.0,20682.0,265500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,239910.0,0,0,0,1,0.0,157500.0,1056447.0,31018.5,922500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.889528


Feature Scaling

In [26]:
ID = test_data['SK_ID_CURR']
col = train_data.columns

In [27]:
train_data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,450407.0,0,0,0,1,1.0,67500.0,227520.0,11065.5,180000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,271298.0,0,1,1,1,1.0,247500.0,1616625.0,61656.75,1341000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,122238.0,0,1,1,1,1.0,180000.0,101880.0,10827.0,90000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,305311.0,0,1,0,0,0.0,81000.0,405000.0,20677.5,405000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,414121.0,0,0,0,1,0.0,157500.0,888840.0,29506.5,675000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [28]:
test_data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,367294.0,0,0,0,1,0.0,180000.0,265306.5,25317.0,252000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,439847.0,0,0,0,1,0.0,202500.0,346500.0,21069.0,346500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,380562.0,0,1,1,0,0.0,337500.0,545040.0,36553.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,407238.0,0,0,0,1,0.0,135000.0,307557.0,20682.0,265500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,239910.0,0,0,0,1,0.0,157500.0,1056447.0,31018.5,922500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.889528


## Applying models

### LGBM

In [29]:
lgb = LGBMClassifier(**{'reg_lambda': 0.1, 
                        'reg_alpha': 0.2, 
                        'num_leaves': 125,  
                        'n_estimators': 300, 
                        'min_child_samples': 800, 
                        'learning_rate': 0.1505,
                        'max_bin': 500,
                        'objective': 'binary',
                        'n_jobs': -1,
                        'class_weight':'balanced',
                        'random_state':100})

lgb.fit(train_data, y_train)
y_pred_train = lgb.predict(train_data)
y_pred_test = lgb.predict(test_data)

## Making output file

In [30]:
ID = ID.astype(int)
ID

0         367294
1         439847
2         380562
3         407238
4         239910
           ...  
123000    128638
123001    169821
123002    442166
123003    301605
123004    176833
Name: SK_ID_CURR, Length: 123005, dtype: int64

In [31]:
output = pd.DataFrame({'SK_ID_CURR' : ID, 'TARGET' : y_pred_test})

In [32]:
output.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,367294,0
1,439847,0
2,380562,0
3,407238,0
4,239910,0


In [33]:
output.to_csv('submission1.csv', index = False)