In [1]:
# display inline plots
# (IPython line magic: figures are rendered inside the notebook output area)
%matplotlib inline

# import libraries for numerical and scientific computing
# NOTE(review): np, sp, mpl, cm and plt are not referenced in any cell visible in
# this notebook — confirm before pruning. `from sklearn.svm import SVC` is imported
# in a separate cell further down; notebook convention is to keep all imports here.
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# set some options for better view:
#   display.width              -> allow up to 500 characters per output line
#   display.max_columns        -> show up to 100 columns before truncating
#   display.notebook_repr_html -> render DataFrames as HTML tables in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# import plotting library built on top of matplotlib
import seaborn as sns

# set some settings related to style of plots that will render
# ("whitegrid" axes style, "poster" context for larger fonts and lines)
sns.set_style("whitegrid")
sns.set_context("poster")

In [580]:
from sklearn.svm import SVC

In [277]:
# read the training and test CSVs (in that order) and index both frames by Loan_ID
loan_train, loan_test = (
    pd.read_csv(csv_path, index_col='Loan_ID')
    for csv_path in ('./data/train_u6lujuX.csv', './data/test_Y3wMUE5.csv')
)

In [278]:
# take a sneak peek at the first few training examples
loan_train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


In [279]:
# number of training examples (rows of the training frame)
# single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print('Number of training examples {0} '.format(loan_train.shape[0]))

Number of training examples 614 


In [280]:
# number of test examples (rows of the test frame)
# single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print('Number of test examples {0} '.format(loan_test.shape[0]))

Number of test examples 367 


In [281]:
# how many training applications fall into each Loan_Status class
loan_train['Loan_Status'].value_counts()

Y    422
N    192
dtype: int64

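**Most of the loan applications were approved: 422 of 614 (about 69%). The classes are imbalanced, so a model that always predicts `Y` already reaches about 69% accuracy.**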

In [573]:
# execute the project's helper scripts so the names they define become available
# in this notebook's namespace
# NOTE(review): presumably these provide LogRegression, RandomForestModel,
# GradientBoostingModel and create_submissions used below — confirm in the scripts
%run scripts/helper.py
%run scripts/model.py

In [380]:
# logistic regression model
# built from the training frame, the test frame and the name of the label column
log_reg = LogRegression(loan_train, loan_test, 'Loan_Status')

In [381]:
# do the required preprocessing
# fill nan values
# NOTE(review): the implementation lives in the helper scripts; exactly what is
# filled or encoded is not visible from this notebook

log_reg.pre_processing()

In [382]:
# train a logistic regression model
# features: every training column except the label; label: 'Loan_Status'
est = log_reg.train_model(loan_train.columns.drop('Loan_Status'), 'Loan_Status')

In [383]:
# display the trained estimator together with its hyper-parameters
est

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [384]:
# cross validation scores for the logistic regression estimator
# (same feature columns and label as used for training)
scores = log_reg.get_cross_validation_scores(est, loan_train.columns.drop('Loan_Status'), 'Loan_Status')

In [385]:
# print min, max and mean of the cross validation scores
# single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.670588235294, Maximum 0.717647058824 and Mean 0.694621129928 


In [386]:
# let's test it out on the hold out examples
# The Python 2 print statement joined its two items with a space; the two spaces
# after "is" in the format string reproduce that output while keeping the call
# valid on both Python 2 and Python 3.
print('Accuracy on the hold out set is  {0}'.format(log_reg.test(est, loan_train.columns.drop('Loan_Status'))))

Accuracy on the hold out set is  0.648648648649


In [273]:
# predict on the test set using every column of the test frame as a feature
predictions = log_reg.predict(est, loan_test.columns)

In [276]:
# create submission for logistic regression model
# arguments: the test Loan_IDs, the predictions for them, and the output file name
create_submissions(loan_test.index.values, predictions, 'logistic_regression.csv')

## Random Forest Classifier

In [427]:
# random forest model built from the training frame, the test frame and the label column name
rf_model = RandomForestModel(loan_train, loan_test, 'Loan_Status')

In [428]:
# run the model wrapper's preprocessing step
# NOTE(review): presumably the same nan filling as for the logistic regression wrapper — confirm
rf_model.pre_processing()

In [429]:
# train the random forest on every column except the label
# note: this rebinds `est`, which previously held the logistic regression estimator
est = rf_model.train_model(loan_train.columns.drop('Loan_Status'), 'Loan_Status')

In [430]:
# display the trained random forest and its hyper-parameters
est

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [431]:
# cross validation scores for the random forest estimator
# (same feature columns and label as used for training)
scores = rf_model.get_cross_validation_scores(est, loan_train.columns.drop('Loan_Status'), 'Loan_Status')

In [432]:
# print min, max and mean of the random forest cross validation scores
# single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.720930232558, Maximum 0.813953488372 and Mean 0.78094564209 


In [433]:
# let's test it out on the hold out examples
# NOTE(review): the recorded result is 1.0 while cross validation peaks near 0.81;
# a perfect score suggests the hold out rows were also seen during training —
# verify how the split is made in the helper scripts before trusting this number.
# The Python 2 print statement joined its two items with a space; the two spaces
# after "is" in the format string reproduce that output while keeping the call
# valid on both Python 2 and Python 3.
print('Accuracy on the hold out set is  {0}'.format(rf_model.test(est, loan_train.columns.drop('Loan_Status'))))

Accuracy on the hold out set is  1.0


In [434]:
# predict on the test set using every column of the test frame as a feature
predictions = rf_model.predict(est, loan_test.columns)

In [435]:
# create submission for random forest model
# arguments: the test Loan_IDs, the predictions for them, and the output file name
create_submissions(loan_test.index.values, predictions, 'random_forest_200_trees.csv')

## Gradient Boosting Classifier

In [576]:
# gradient boosting model built from the training frame, the test frame and the label column name
gbm_model = GradientBoostingModel(loan_train, loan_test, 'Loan_Status')

In [577]:
# run the model wrapper's preprocessing step
# NOTE(review): presumably the same nan filling as for the other wrappers — confirm
gbm_model.pre_processing()

In [578]:
# split the data held by the wrapper
# NOTE(review): presumably this populates gbm_model.X_train / gbm_model.X_val that
# later cells read — confirm; the other two model sections never call it explicitly
gbm_model.split_dataset()

In [None]:
# feature selection driven by a linear-kernel support vector classifier
# NOTE(review): this cell carries no execution count (In [None]), i.e. it was not
# run in the saved session, and its return value is discarded — either use the
# result in the cells below or remove the cell
gbm_model.feature_selection(SVC(kernel='linear'))

In [540]:
# fit the gradient boosting model before inspecting it: nothing in this section
# assigned `est`, so on a fresh Restart & Run All it would still be the random
# forest estimator from the previous section
# NOTE(review): the call mirrors the LogRegression / RandomForestModel cells above —
# confirm GradientBoostingModel.train_model takes the same arguments
est = gbm_model.train_model(loan_train.columns.drop('Loan_Status'), 'Loan_Status')

# display the trained estimator together with its hyper-parameters
est

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False)

In [541]:
# cross validation scores for the gradient boosting estimator
# (feature columns: every training column except the label)
scores = gbm_model.get_cross_validation_scores(est, loan_train.columns.drop('Loan_Status'), 'Loan_Status')

In [542]:
# print min, max and mean of the gradient boosting cross validation scores
# single-argument print(...) prints the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.755813953488, Maximum 0.835294117647 and Mean 0.80194254446 


In [543]:
# let's test it out on the hold out examples
# The Python 2 print statement joined its two items with a space; the two spaces
# after "is" in the format string reproduce that output while keeping the call
# valid on both Python 2 and Python 3.
print('Accuracy on the hold out set is  {0}'.format(gbm_model.test(est, loan_train.columns.drop('Loan_Status'))))

Accuracy on the hold out set is  0.837837837838


**Analyze the mispredicted examples**

In [544]:
# ask the wrapper which hold out examples the estimator got wrong
# NOTE(review): presumably returns a row selector (e.g. boolean mask) over
# gbm_model.X_val — confirm the return type in the helper scripts
mispredicted_idx = gbm_model.analyze_mistakes(est, loan_train.columns.drop('Loan_Status'))

In [546]:
# keep only the validation rows flagged as mispredicted
# NOTE(review): plain [] indexing selects rows only when mispredicted_idx is a
# boolean mask; if it can be anything else, .loc / .iloc would make the intent explicit
mispredicted_examples = gbm_model.X_val[mispredicted_idx]

In [548]:
# first few mispredicted rows; categorical columns appear as integer codes and
# -999 shows up in Credit_History (presumably the fill value for missing entries)
mispredicted_examples.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001097,2,1,2,0,2,4692,0,106.0,360,1,0,0
LP001179,2,2,3,0,1,4616,0,134.0,360,1,2,0
LP001197,2,2,1,0,1,3366,2200,135.0,360,1,0,0
LP001326,2,1,1,0,0,6782,0,146.412162,360,-999,2,0
LP001421,2,2,1,0,1,5568,2142,175.0,360,1,0,0


In [549]:
# Loan_Status counts among the mispredicted validation rows
mispredicted_examples['Loan_Status'].value_counts()

0    28
1     2
dtype: int64

**Most of the time (28 of the 30 mistakes) the model approves a loan application that should in fact have been rejected.**

In [551]:
# summary statistics of the loan amount for the misclassified examples
mispredicted_examples['LoanAmount'].describe()

count     30.000000
mean     134.407883
std       59.754727
min       46.000000
25%       95.250000
50%      132.000000
75%      169.750000
max      275.000000
Name: LoanAmount, dtype: float64

In [553]:
# same loan amount summary over the whole validation set, for comparison
gbm_model.X_val['LoanAmount'].describe()

count    185.000000
mean     139.413367
std       68.640289
min       17.000000
25%      100.000000
50%      128.000000
75%      160.000000
max      490.000000
Name: LoanAmount, dtype: float64

In [554]:
# summary statistics of the applicant income for the misclassified examples
mispredicted_examples['ApplicantIncome'].describe()

count       30.000000
mean      5335.566667
std       5761.700190
min       1916.000000
25%       2714.500000
50%       3958.500000
75%       5405.250000
max      33846.000000
Name: ApplicantIncome, dtype: float64

In [555]:
# same applicant income summary over the whole validation set, for comparison
gbm_model.X_val['ApplicantIncome'].describe()

count      185.000000
mean      5737.978378
std       8194.369110
min        674.000000
25%       2895.000000
50%       3887.000000
75%       5829.000000
max      81000.000000
Name: ApplicantIncome, dtype: float64

In [562]:
# number of mispredicted rows for each (Education, Loan_Status) combination;
# both columns hold integer codes at this point
mispredicted_examples.groupby(['Education', 'Loan_Status']).size()

Education  Loan_Status
0          0              21
           1               2
1          0               7
dtype: int64

In [560]:
# how often each (encoded) Dependents value occurs in the training split
gbm_model.X_train['Dependents'].value_counts()

1    244
3     72
2     70
4     35
0      8
dtype: int64

In [487]:
# predict on the test set using every column of the test frame as a feature
predictions = gbm_model.predict(est, loan_test.columns)

In [488]:
# create submission for the gradient boosting model
# arguments: the test Loan_IDs, the predictions for them, and the output file name
create_submissions(loan_test.index.values, predictions, 'gbm.csv')

## Exploratory Data Analysis

In [495]:
# Relationship between Gender and Loan Status:
# number of training applications for each (Gender, Loan_Status) combination
loan_train.groupby(['Gender', 'Loan_Status']).size()

Gender  Loan_Status
Female  N               37
        Y               75
Male    N              150
        Y              339
dtype: int64

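**About a third of the loan applications were rejected for both genders (Female: 37 of 112, about 33%; Male: 150 of 489, about 31%), so the rejection rate is nearly the same for men and women.**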

In [498]:
# mean applicant income within each Loan_Status group
loan_train['ApplicantIncome'].groupby(loan_train['Loan_Status']).mean()

Loan_Status
N    5446.078125
Y    5384.068720
Name: ApplicantIncome, dtype: float64

In [499]:
# mean co-applicant income within each Loan_Status group
loan_train['CoapplicantIncome'].groupby(loan_train['Loan_Status']).mean()

Loan_Status
N    1877.807292
Y    1504.516398
Name: CoapplicantIncome, dtype: float64