In [27]:
import pandas as pd
import scipy as sc
import numpy as np
from scipy.stats import mode
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(style='darkgrid')

In [71]:
trainDF = pd.read_csv('train.csv')
testDF = pd.read_csv('test.csv')

In [5]:
trainDF.shape , testDF.shape

((614, 13), (367, 12))

In [8]:
trainDF.count()

Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64

In [9]:
testDF.count()

Loan_ID              367
Gender               356
Married              367
Dependents           357
Education            367
Self_Employed        344
ApplicantIncome      367
CoapplicantIncome    367
LoanAmount           362
Loan_Amount_Term     361
Credit_History       338
Property_Area        367
dtype: int64

In [72]:
trainDF['source']='train'
testDF['source']='test'
allDF = pd.concat([trainDF, testDF],ignore_index=True)
trainDF.shape, testDF.shape, allDF.shape

((614, 14), (367, 13), (981, 14))

In [88]:
allDF.apply(lambda x: sum(x.isnull()))

ApplicantIncome        0
CoapplicantIncome      0
Credit_History         0
Dependents             0
Education              0
Gender                 0
LoanAmount             0
Loan_Amount_Term       0
Loan_ID                0
Loan_Status          367
Married                0
Property_Area          0
Self_Employed          0
source                 0
dtype: int64

In [36]:
allDF.dtypes

ApplicantIncome        int64
CoapplicantIncome    float64
Credit_History       float64
Dependents            object
Education             object
Gender                object
LoanAmount           float64
Loan_Amount_Term     float64
Loan_ID               object
Loan_Status           object
Married               object
Property_Area         object
Self_Employed         object
source                object
dtype: object

In [92]:
allDF.apply(lambda x: len(x.unique()))

ApplicantIncome      752
CoapplicantIncome    437
Credit_History         2
Dependents             4
Education              2
Gender                 2
LoanAmount           233
Loan_Amount_Term      12
Loan_ID              981
Loan_Status            3
Married                2
Property_Area          3
Self_Employed          2
source                 2
dtype: int64

In [14]:
allDF.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,LoanAmount,Loan_Amount_Term
count,981.0,981.0,902.0,954.0,961.0
mean,5179.795107,1601.91633,0.83592,142.51153,342.201873
std,5695.104533,2718.772806,0.370553,77.421743,65.100602
min,0.0,0.0,0.0,9.0,6.0
25%,2875.0,0.0,1.0,100.0,360.0
50%,3800.0,1110.0,1.0,126.0,360.0
75%,5516.0,2365.0,1.0,162.0,360.0
max,81000.0,41667.0,1.0,700.0,480.0


In [18]:
#Get a boolean variable specifying missing Item_Weight values
miss_data = allDF['Credit_History'].isnull() 

#Impute data and check #missing values before and after imputation to confirm
#print 'Orignal #missing: %d'% sum(miss_bool)
allDF.loc[miss_data,'Credit_History'] = 1.000000
#print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())

In [74]:
allDF['Gender'] = allDF['Gender'].fillna( allDF['Gender'].dropna().mode().values[0] )
allDF['Married'] = allDF['Married'].fillna( allDF['Married'].dropna().mode().values[0] )
allDF['Dependents'] = allDF['Dependents'].fillna( allDF['Dependents'].dropna().mode().values[0] )
allDF['Self_Employed'] = allDF['Self_Employed'].fillna( allDF['Self_Employed'].dropna().mode().values[0] )
allDF['LoanAmount'] = allDF['LoanAmount'].fillna( allDF['LoanAmount'].dropna().mean() )
allDF['Loan_Amount_Term'] = allDF['Loan_Amount_Term'].fillna( allDF['Loan_Amount_Term'].dropna().mode().values[0] )
allDF['Credit_History'] = allDF['Credit_History'].fillna( allDF['Credit_History'].dropna().mode().values[0] )
allDF['Dependents'] = allDF['Dependents'].str.rstrip('+')

In [86]:
allDF['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [91]:
allDF['Gender'] = allDF['Gender'].map({'Female':0,'Male':1}).astype(np.int)
allDF['Married'] = allDF['Married'].map({'No':0, 'Yes':1}).astype(np.int)
allDF['Education'] = allDF['Education'].map({'Not Graduate':0, 'Graduate':1}).astype(np.int)
allDF['Self_Employed'] = allDF['Self_Employed'].map({'No':0, 'Yes':1}).astype(np.int)
allDF['Dependents'] = allDF['Dependents'].astype(np.int)

ValueError: Cannot convert NA to integer

In [93]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#New variable for outlet

var_mod = ['Dependents','Education','Gender', 'Married' , 'Property_Area' , 'Self_Employed']
le = LabelEncoder()
for i in var_mod:
    allDF[i] = le.fit_transform(allDF[i])

In [94]:
allDF_Dummy = pd.get_dummies(allDF, columns=['Dependents','Education','Gender','Married' ,'Property_Area' , 'Self_Employed'])

In [95]:
allDF['Self_Employed'].value_counts()

0    862
1    119
Name: Self_Employed, dtype: int64

In [123]:
train_data = allDF.loc[allDF['source']=="train"]
test_data = allDF.loc[allDF['source']=="test"]

In [124]:
train_data.shape , test_data.shape

((614, 14), (367, 14))

In [125]:
train_data['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [119]:
train_data = pd.get_dummies(train_data, columns=['Loan_Status'])

In [130]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt 

def modelfitClassification(alg, dtrain, dvalidation , dtest, predictors, target, IDcol, filename):
    #Fit the algorithm on the data
    
    print('predictors '+str(predictors))
        
        
    alg.fit(dtrain[predictors], dtrain[target])
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])

    #Perform cross-validation:
    cv_score = cross_val_score(alg, dtrain[predictors], dtrain[target], cv=5, scoring='accuracy')
    cv_score = np.sqrt(np.abs( cv_score))
    
    #Print model report:
    print ("\nModel Report")
    print ("RMSE : %.4g" % np.sqrt(metrics.accuracy_score(dtrain[target].values, dtrain_predictions)))
    print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Model Accuracy on validation data:
    
    dvalidation_predictions = alg.predict(dvalidation[predictors])
    print ("\nValidation Model Report")
    print ("RMSE : %.4g" % np.sqrt(metrics.accuracy_score(dvalidation[target].values, dvalidation_predictions)))
    
    
    #Predict on testing data:
    
    dtest_predictions = alg.predict(dtest[predictors])
    dtest['Loan_Status'] = dtest_predictions
    #Export submission file:
    IDcol.append("Loan_Status")
    submission = pd.DataFrame({ x: dtest[x] for x in IDcol})
    submission.to_csv(filename, index=False)

In [126]:
from sklearn.model_selection import train_test_split
train_data, validation_data = train_test_split(train_data, test_size = 0.2)

In [127]:
train_data.shape , validation_data.shape

((491, 14), (123, 14))

In [131]:
from sklearn.linear_model import LogisticRegression

target = 'Loan_Status'
IDcol = ['Loan_ID']
removeColumn = ['source']
predictors = [x for x in train_data.columns if x not in [target]+IDcol+removeColumn]
# print predictors
alg1 = LogisticRegression()
modelfitClassification(alg1, train_data, validation_data , test_data, predictors, target, IDcol, 'linearLoan.csv')

predictors ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']

Model Report
RMSE : 0.9037
CV Score : Mean - 0.9025 | Std - 0.01188 | Min - 0.8864 | Max - 0.9203

Validation Model Report
RMSE : 0.8926


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
from sklearn.linear_model import LogisticRegression

target = 'Loan_Status'
IDcol = ['Loan_ID']
removeColumn = ['source']
predictors = [x for x in train_data.columns if x not in [target]+IDcol+removeColumn]
# print predictors
alg1 = LogisticRegression()
modelfitClassification(alg1, train_data, validation_data , test_data, predictors, target, IDcol, 'linearLoan.csv')

In [141]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt 
from sklearn.grid_search import GridSearchCV

def modelfitGridSearchClassification(alg, dtrain, dvalidation , dtest, predictors, target, IDcol, filename, tuned_parameters):
    #Fit the algorithm on the data
    
    print('predictors '+str(predictors))
        
        
    clf = GridSearchCV(alg, tuned_parameters, cv=5, scoring='accuracy')
    
    clf.fit(dtrain[predictors], dtrain[target])
    
    print("Best parameters set found on development set:")
    print(clf.best_estimator_)
    
    print("Best Score"+str(clf.best_score_))
    
        
    #Predict training set:
    dtrain_predictions = clf.predict(dtrain[predictors])

    #Print model report:
    print ("\nModel Report")
    print ("RMSE : %.4g" % np.sqrt(metrics.accuracy_score(dtrain[target].values, dtrain_predictions)))
    

    #Model Accuracy on validation data:
    
    dvalidation_predictions = clf.predict(dvalidation[predictors])
    print ("\nValidation Model Report")
    print ("RMSE : %.4g" % np.sqrt(metrics.accuracy_score(dvalidation[target].values, dvalidation_predictions)))
    
    
    #Predict on testing data:
    
    dtest_predictions = clf.predict(dtest[predictors])
    dtest['Loan_Status'] = dtest_predictions
    #Export submission file:
    IDcol.append("Loan_Status")
    submission = pd.DataFrame({ x: dtest[x] for x in IDcol})
    submission.to_csv(filename, index=False)

In [153]:
from sklearn.tree import DecisionTreeClassifier

target = 'Loan_Status'
IDcol = ['Loan_ID']
removeColumn = ['source']
predictors = [x for x in train_data.columns if x not in [target]+IDcol+removeColumn]
# print predictors

param_grid = {"criterion": ["entropy"],
              "min_samples_split": [10 , 20 ,50 , 100 , 150],
              "max_depth": [ 5 ,10 , 15],
              "min_samples_leaf": [ 2 ,4 , 5],
              "max_leaf_nodes": [ 5, 10 , 20 ,25],
             # "max_features" :['auto']
              }

alg3 = DecisionTreeClassifier()
modelfitGridSearchClassification(alg3, train_data, validation_data , test_data, predictors, target, IDcol, 'Decision.csv' , param_grid)


predictors ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']
Best parameters set found on development set:
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=5, min_impurity_split=1e-07,
            min_samples_leaf=4, min_samples_split=150,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best Score0.7983706720977597

Model Report
RMSE : 0.9048

Validation Model Report
RMSE : 0.8926


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [189]:
from sklearn.ensemble import RandomForestClassifier
target = 'Loan_Status'
IDcol = ['Loan_ID']
removeColumn = ['source']
predictors = [x for x in train_data.columns if x not in [target]+IDcol+removeColumn]
# print predictors

param_grid = {"criterion": ["entropy"],
              "min_samples_split": [2 , 5 , 10,50]
              #"max_depth": [2, 5 ,10 , 15],
              #"min_samples_leaf": [ 2 ,4 , 5],
              #"max_leaf_nodes": [ 5, 10 , 20 ,25],
              #"max_features" :['auto' , 3],
              #"n_estimators":[10,50,100]
              }

alg3 = RandomForestClassifier()
modelfitGridSearchClassification(alg3, train_data, validation_data , test_data, predictors, target, IDcol, 'RandomForest.csv' , param_grid)


predictors ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']
Best parameters set found on development set:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=50, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Best Score0.8105906313645621

Model Report
RMSE : 0.9048

Validation Model Report
RMSE : 0.888


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [183]:
from sklearn.ensemble import AdaBoostClassifier
target = 'Loan_Status'
IDcol = ['Loan_ID']
removeColumn = ['source']
predictors = [x for x in train_data.columns if x not in [target]+IDcol+removeColumn]
# print predictors

param_grid = {"n_estimators": [1],
              #"base_estimator": ['tree' ],
              #"learning_rate": [0.2],
              #"min_samples_leaf": [ 2 ,4 , 5],
              #"max_leaf_nodes": [ 5, 10 , 20 ,25],
              #"max_features" :['auto' , 3]
              "random_state":[6]
              }

alg3 = AdaBoostClassifier()
modelfitGridSearchClassification(alg3, train_data, validation_data , test_data, predictors, target, IDcol, 'AdaBoost.csv' , param_grid)

predictors ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']
Best parameters set found on development set:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=1, random_state=6)
Best Score0.8126272912423625

Model Report
RMSE : 0.9015

Validation Model Report
RMSE : 0.8926


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
