In [56]:
# Load Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
# Read the training and test CSVs from the working directory, then preview the training rows.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [58]:
# Dimensions of the training frame as (rows, columns).
train.shape

(614, 13)

In [59]:
# Report the test frame's (rows, columns), then preview its first rows.
print(test.shape)
test.head()

(367, 12)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [60]:
# Use Loan_ID as the row index of both frames; set_index also removes it from the columns.
train = train.set_index('Loan_ID')
test = test.set_index('Loan_ID')

In [61]:
# Confirm the test frame is now indexed by Loan_ID.
test.index

Index(['LP001015', 'LP001022', 'LP001031', 'LP001035', 'LP001051', 'LP001054',
       'LP001055', 'LP001056', 'LP001059', 'LP001067',
       ...
       'LP002952', 'LP002954', 'LP002962', 'LP002965', 'LP002969', 'LP002971',
       'LP002975', 'LP002980', 'LP002986', 'LP002989'],
      dtype='object', name='Loan_ID', length=367)

In [62]:
# Confirm the training frame is now indexed by Loan_ID.
train.index

Index(['LP001002', 'LP001003', 'LP001005', 'LP001006', 'LP001008', 'LP001011',
       'LP001013', 'LP001014', 'LP001018', 'LP001020',
       ...
       'LP002959', 'LP002960', 'LP002961', 'LP002964', 'LP002974', 'LP002978',
       'LP002979', 'LP002983', 'LP002984', 'LP002990'],
      dtype='object', name='Loan_ID', length=614)

# Categorical Variables

In [63]:
# Treat Credit_History as a categorical feature in BOTH frames.
# The numeric flags are turned into string labels (e.g. 1.0 -> '1.0') so the column never mixes
# floats with a string fill value later on; missing entries are left as NaN so they are still
# detected and imputed by the missing-value step.
for frame in (train, test):
    credit_history = frame['Credit_History']
    # where() keeps the entries that satisfy the condition (the NaNs) and takes the string
    # form for every other entry; the result has dtype object.
    frame['Credit_History'] = credit_history.where(credit_history.isna(), credit_history.astype(str))
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History        object
Property_Area         object
Loan_Status           object
dtype: object

In [64]:
# Categorical variables: the names of all object-dtype columns in the training frame.
categorical_vars = train.select_dtypes(include='object').columns
categorical_vars

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

# Continuous Variables

In [65]:
# Continuous variables: the names of every column whose dtype is not object.
continous_vars = train.select_dtypes(exclude='object').columns
continous_vars

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term'],
      dtype='object')

In [66]:
# Summary statistics for the non-object (continuous) columns of the training frame.
train[continous_vars].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
count,614.0,614.0,592.0,600.0
mean,5403.459283,1621.245798,146.412162,342.0
std,6109.041673,2926.248369,85.587325,65.12041
min,150.0,0.0,9.0,12.0
25%,2877.5,0.0,100.0,360.0
50%,3812.5,1188.5,128.0,360.0
75%,5795.0,2297.25,168.0,360.0
max,81000.0,41667.0,700.0,480.0


# Handling Missing Values

In [67]:
# Count missing values per column of the training frame.
# DataFrame.isnull().sum() reduces column-wise explicitly; np.sum(DataFrame) forwards axis=None
# to pandas, which recent pandas deprecates in favour of reducing to a single scalar.
train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Computing Mode

In [68]:
from scipy.stats import mode
# Most frequent Gender value. Series.mode() skips NaN and works on string data, whereas
# scipy.stats.mode no longer accepts non-numeric arrays and astype(str) would count missing
# entries as the literal string 'nan'.
train['Gender'].mode()[0]



'Male'

### Imputing categorical variables with mode values

In [69]:
# Impute missing categorical values with the most frequent value seen in the training frame.
# The same training-derived value is applied to the test frame so both are imputed consistently.
categorical_with_gaps = [var for var in categorical_vars if train[var].isnull().any()]
for var in categorical_with_gaps:
    # Series.mode() ignores NaN, so a missing value can never become the fill value.
    most_frequent = train[var].mode()[0]
    # Assign the result back: fillna(inplace=True) on a selected column may act on a copy
    # and leave the frame unchanged.
    train[var] = train[var].fillna(most_frequent)
    if var in test.columns:  # the Loan_Status target exists only in train
        test[var] = test[var].fillna(most_frequent)



In [70]:
# Checking missing values: every categorical column of train should now report 0.
# isnull().sum() is used instead of np.sum(DataFrame), whose axis=None call is deprecated in pandas.
train[categorical_vars].isnull().sum()

Gender            0
Married           0
Dependents        0
Education         0
Self_Employed     0
Credit_History    0
Property_Area     0
Loan_Status       0
dtype: int64

In [71]:
# Same check for the test frame; the last categorical name (Loan_Status) is skipped because
# the test frame has no target column.
# isnull().sum() is used instead of np.sum(DataFrame), whose axis=None call is deprecated in pandas.
test[categorical_vars[:-1]].isnull().sum()

Gender            0
Married           0
Dependents        0
Education         0
Self_Employed     0
Credit_History    0
Property_Area     0
dtype: int64

### Imputing Continuous Variables with Median Values

In [72]:
# Missing values per continuous column of the training frame.
# isnull().sum() is used instead of np.sum(DataFrame), whose axis=None call is deprecated in pandas.
train[continous_vars].isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
dtype: int64

In [73]:
# Continuous columns that contain at least one missing value and therefore need imputation.
# isnull().any() yields one boolean per column, which is used directly as the selection mask.
continous_vars[train[continous_vars].isnull().any().values]

Index(['LoanAmount', 'Loan_Amount_Term'], dtype='object')

In [74]:
# Impute missing continuous values with the median of the training column.
# The same training median is applied to the test frame so both are imputed consistently.
continuous_with_gaps = continous_vars[train[continous_vars].isnull().any().values]
for var in continuous_with_gaps:
    # Series.median() skips NaN by default, so no explicit dropna() is needed.
    training_median = train[var].median()
    # Assign the result back: fillna(inplace=True) on a selected column may act on a copy
    # and leave the frame unchanged.
    train[var] = train[var].fillna(training_median)
    test[var] = test[var].fillna(training_median)

In [75]:
# Checking values: every continuous column of train should now report 0 missing entries.
# isnull().sum() is used instead of np.sum(DataFrame), whose axis=None call is deprecated in pandas.
train[continous_vars].isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
dtype: int64

In [76]:
# Same check for the continuous columns of the test frame.
# isnull().sum() is used instead of np.sum(DataFrame), whose axis=None call is deprecated in pandas.
test[continous_vars].isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
dtype: int64

# CATBOOST

In [77]:
# Column dtypes of the training frame, used below to pick the categorical feature positions.
train.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History        object
Property_Area         object
Loan_Status           object
dtype: object

In [78]:
# Positions of every column that is not float64; these are handed to CatBoost as categorical.
# np.float was only an alias of the builtin float and has been removed from NumPy, so the
# concrete np.float64 dtype is compared instead (the selection is unchanged).
# NOTE(review): this also selects the int64 ApplicantIncome column (position 5) — confirm that
# treating it as categorical is intended.
categorical_features_indices = np.where(train.dtypes != np.float64)[0]
# The last position belongs to the Loan_Status target, so it is excluded from the feature list.
categorical_features_indices[:-1]

array([ 0,  1,  2,  3,  4,  5,  9, 10], dtype=int64)

## Model Building

In [79]:
from sklearn.preprocessing import LabelEncoder
# Encode the Loan_Status labels as integer class codes; `le` is kept so that predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
loan_status_labels = train['Loan_Status'].astype(str)
le.fit(loan_status_labels)
train['Loan_Status'] = le.transform(loan_status_labels)
# Show which original label each integer code stands for (code i -> classes_[i]).
le.classes_

array(['N', 'Y'], dtype=object)

In [2]:
from sklearn.model_selection import train_test_split
# Target: the label-encoded Loan_Status column of the training frame.
y = train['Loan_Status']
# Features: every column except the target.
X = train.drop(columns='Loan_Status')
# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)

NameError: name 'train' is not defined

In [1]:
# Import the classifier and train it on the training split, evaluating on the validation split.
from catboost import CatBoostClassifier

catboost_params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'depth': 6,
    'l2_leaf_reg': 3,
    'rsm': 1,
    'loss_function': 'Logloss',
}
model = CatBoostClassifier(**catboost_params)

# The trailing entry of categorical_features_indices is sliced off before it is passed on.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices[:-1],
    eval_set=(X_validation, y_validation),
    plot=True,
)

NameError: name 'X_train' is not defined

In [82]:
from sklearn.metrics import accuracy_score
# Predict the validation split and report the fraction of correctly classified rows.
y_pred = model.predict(X_validation)
accuracy_score(y_true=y_validation, y_pred=y_pred)

0.83243243243243248

In [83]:
# Show the class codes predicted for the validation split.
y_pred

array([ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,
        1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1

In [86]:
# Map the predicted class codes back to the original Loan_Status labels.
validation_codes = y_pred.astype(int)
y_pred_inverse = le.inverse_transform(validation_codes)
y_pred_inverse

array(['Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

## Entire Training Set

In [87]:
# Refit a classifier with the same settings on all labelled rows (no validation split).
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=500, learning_rate=0.01, depth=6,
    l2_leaf_reg=3, rsm=1, loss_function='Logloss',
)
# The trailing entry of categorical_features_indices is sliced off before it is passed on.
full_fit_cat_features = categorical_features_indices[:-1]
model.fit(X, y, cat_features=full_fit_cat_features)

<catboost.core.CatBoostClassifier at 0x2d2f7b46d68>

In [88]:
# Predict the test frame and map the class codes back to the original Loan_Status labels.
y_pred = model.predict(test)
test_codes = y_pred.astype(int)
y_pred_inverse = le.inverse_transform(test_codes)
y_pred_inverse

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [89]:
# Start from the sample submission file, overwrite its Loan_ID and Loan_Status columns with the
# test identifiers and the decoded predictions, and write the result without the row index.
ss = pd.read_csv('Sample_Submission.csv').assign(
    Loan_ID=test.index,
    Loan_Status=y_pred_inverse,
)
ss.to_csv('mysubmission_catboost.csv', index=False)