# Author: Tanay Yadav 
# Roll No: AI20BTECH11026
# Assignment 3 Q5

In [57]:
# importing required libraries

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier 

In [31]:
# assigning respective datasets

train_set = pd.read_csv('loan_train.csv')
test_set = pd.read_csv('loan_test.csv')

## Processing the Training Dataset

In [32]:
train_set.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Data columns (total 111 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              24999 non-null  int64  
 1   member_id                       24999 non-null  int64  
 2   loan_amnt                       24999 non-null  int64  
 3   funded_amnt                     24999 non-null  int64  
 4   funded_amnt_inv                 24999 non-null  float64
 5   term                            24999 non-null  object 
 6   int_rate                        24999 non-null  object 
 7   installment                     24999 non-null  float64
 8   grade                           24999 non-null  object 
 9   sub_grade                       24999 non-null  object 
 10  emp_title                       24993 non-null  object 
 11  emp_length                      24958 non-null  object 
 12  home_ownership                 

In [33]:
# moving the target variable to the last column of the dataframe
train_set = train_set[[c for c in train_set if c not in ['loan_status']] + ['loan_status']]

# dropping the columns having all the values as NAN
train_set = train_set.dropna(axis=1, how='all')

In [34]:
# dropping the columns having all values as the same
nunique = train_set.nunique()
cols_to_drop = nunique[nunique == 1].index
train_set = train_set.drop(cols_to_drop, axis=1)

# dropping the useless columns
train_set = train_set.drop(columns=['next_pymnt_d','id','member_id','total_rec_late_fee', 'collection_recovery_fee','earliest_cr_line','last_pymnt_d','sub_grade'])
train_set = train_set.drop(columns=['last_pymnt_amnt','url','emp_title','out_prncp','out_prncp_inv','funded_amnt','zip_code','addr_state','pub_rec_bankruptcies'])
train_set = train_set.drop(columns=['funded_amnt_inv','mths_since_last_delinq','mths_since_last_record','desc','last_credit_pull_d','verification_status','term'])
train_set = train_set.drop(columns=['purpose','issue_d','title','delinq_2yrs','total_pymnt_inv','open_acc', 'pub_rec', 'revol_bal','revol_util','total_acc','grade'])

In [35]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loan_amnt        24999 non-null  int64  
 1   int_rate         24999 non-null  object 
 2   installment      24999 non-null  float64
 3   emp_length       24958 non-null  object 
 4   home_ownership   24999 non-null  object 
 5   annual_inc       24999 non-null  float64
 6   dti              24999 non-null  float64
 7   inq_last_6mths   24999 non-null  int64  
 8   total_pymnt      24999 non-null  float64
 9   total_rec_prncp  24999 non-null  float64
 10  total_rec_int    24999 non-null  float64
 11  recoveries       24999 non-null  float64
 12  loan_status      24999 non-null  object 
dtypes: float64(7), int64(2), object(4)
memory usage: 2.5+ MB


In [36]:
# ignoring the datapoints corresponding to 'Current' target variable as it is of no use for this problem
train_set = train_set[train_set.loan_status != 'Current']
# train_set

In [37]:
# filling the NAN values with the modal value for emp_length
freq = train_set.emp_length.dropna().mode()
print(freq,'\n')

0    10+ years
dtype: object 



In [38]:
# filling the NAN values with the modal value 
train_set['emp_length']=train_set['emp_length'].fillna('10+ years')

# converting the strings to float 
train_set['emp_length'].replace('10+ years', '10 years',inplace=True)
train_set['emp_length'].replace('< 1 year', '1 years',inplace=True)
train_set['emp_length'].replace('1 year', '1 years',inplace=True)
train_set['emp_length'] = train_set['emp_length'].str.rstrip(' years').astype('float')

# converting strings to float
train_set['home_ownership'].replace('RENT', 0,inplace=True)
train_set['home_ownership'].replace('OWN', 1,inplace=True)
train_set['home_ownership'].replace('MORTGAGE', 2,inplace=True)
train_set['home_ownership'].replace('OTHER', 3,inplace=True)

train_set['loan_status'].replace('Charged Off', -1, inplace=True)
train_set['loan_status'].replace('Fully Paid', 1, inplace=True)

# removing the % sign from the percentages and replacing the strings as float
train_set['int_rate'] = train_set['int_rate'].str.rstrip('%').astype('float') / 100.0
# train_set

In [39]:
# assigning the target variable to train_labels
train_labels = train_set['loan_status']
train_set = train_set.drop(['loan_status'],axis=1)
# train_labels

## Processing the Test Dataset
Everything is processed similar to the Training Dataset

In [40]:
test_set = test_set[[c for c in test_set if c not in ['loan_status']] + ['loan_status']]
test_set = test_set.dropna(axis=1, how='all')
test_set.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14718 entries, 0 to 14717
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          14718 non-null  int64  
 1   member_id                   14718 non-null  int64  
 2   loan_amnt                   14718 non-null  int64  
 3   funded_amnt                 14718 non-null  int64  
 4   funded_amnt_inv             14718 non-null  float64
 5   term                        14718 non-null  object 
 6   int_rate                    14718 non-null  object 
 7   installment                 14718 non-null  float64
 8   grade                       14718 non-null  object 
 9   sub_grade                   14718 non-null  object 
 10  emp_title                   12265 non-null  object 
 11  emp_length                  13684 non-null  object 
 12  home_ownership              14718 non-null  object 
 13  annual_inc                  147

In [41]:
nunique = test_set.nunique()
cols_to_drop = nunique[nunique == 1].index
test_set = test_set.drop(cols_to_drop, axis=1)

test_set = test_set.drop(columns=['next_pymnt_d','id','member_id','total_rec_late_fee', 'collection_recovery_fee','earliest_cr_line','last_pymnt_d','sub_grade'])
test_set = test_set.drop(columns=['last_pymnt_amnt','url','emp_title','out_prncp','out_prncp_inv','funded_amnt','zip_code','addr_state','pub_rec_bankruptcies'])
test_set = test_set.drop(columns=['funded_amnt_inv','mths_since_last_delinq','mths_since_last_record','desc','last_credit_pull_d','verification_status','term'])
test_set = test_set.drop(columns=['purpose','issue_d','title','delinq_2yrs','total_pymnt_inv','open_acc', 'pub_rec', 'revol_bal','revol_util','total_acc','grade'])
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14718 entries, 0 to 14717
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loan_amnt        14718 non-null  int64  
 1   int_rate         14718 non-null  object 
 2   installment      14718 non-null  float64
 3   emp_length       13684 non-null  object 
 4   home_ownership   14718 non-null  object 
 5   annual_inc       14718 non-null  float64
 6   dti              14718 non-null  float64
 7   inq_last_6mths   14718 non-null  int64  
 8   total_pymnt      14718 non-null  float64
 9   total_rec_prncp  14718 non-null  float64
 10  total_rec_int    14718 non-null  float64
 11  recoveries       14718 non-null  float64
 12  loan_status      14718 non-null  object 
dtypes: float64(7), int64(2), object(4)
memory usage: 1.5+ MB


In [42]:
test_set = test_set[test_set.loan_status != 'Current']
# test_set = test_set[test_set.home_ownership != 'NONE']
# test_set

In [43]:
freq_t = test_set.emp_length.dropna().mode()
print(freq_t,'\n')

0    10+ years
dtype: object 



In [44]:
test_set['emp_length']=test_set['emp_length'].fillna('10+ years')
test_set['emp_length'].replace('10+ years', '10 years',inplace=True)
test_set['emp_length'].replace('< 1 year', '1 years',inplace=True)
test_set['emp_length'].replace('1 year', '1 years',inplace=True)
test_set['emp_length'] = test_set['emp_length'].str.rstrip(' years').astype('float')

test_set['home_ownership'].replace('RENT', 0,inplace=True)
test_set['home_ownership'].replace('OWN', 1,inplace=True)
test_set['home_ownership'].replace('MORTGAGE', 2,inplace=True)
test_set['home_ownership'].replace('OTHER', 3,inplace=True)
test_set['home_ownership'].replace('NONE', 4,inplace=True)

test_set['loan_status'].replace('Charged Off', -1, inplace=True)
test_set['loan_status'].replace('Fully Paid', 1, inplace=True)


test_set['int_rate'] = test_set['int_rate'].str.rstrip('%').astype('float') / 100.0
# test_set

In [45]:
test_labels = test_set['loan_status']
test_set = test_set.drop(['loan_status'],axis=1)


## Importing the GradientBoostingClassifier from sklearn.ensemble

In [46]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0)

In [47]:
# fitting the classifier with the training data
clf.fit(train_set, train_labels)

GradientBoostingClassifier(learning_rate=1.0, random_state=0)

In [48]:
y_train = clf.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_train, train_labels))

Generalisation Error = 0.0014814205176741924


In [49]:
y_test = clf.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_test, test_labels))

Validation Error = 0.004483048472961659


In [50]:
recall = recall_score(y_test, test_labels, average='binary')
print('Recall =', recall)

Recall = 0.9964594483326472


In [51]:
precision = precision_score(y_test, test_labels, average='binary')
print('Precision =', precision)

Precision = 0.9982677555060628


In [62]:
# number of trees = 30
print('For 30 trees in the classifier')
clf = GradientBoostingClassifier(n_estimators=30, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(train_set, train_labels)
y_train = clf.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_train, train_labels))
y_test = clf.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_test, test_labels))
recall = recall_score(y_test, test_labels, average='binary')
print('Recall =', recall)
precision = precision_score(y_test, test_labels, average='binary')
print('Precision =', precision)

For 30 trees in the classifier
Generalisation Error = 0.022303608904983285
Validation Error = 0.023816195012608565
Recall = 0.9732508635231746
Precision = 0.9994225851686876


In [53]:
# number of trees = 60
print('For 60 trees in the classifier')
clf = GradientBoostingClassifier(n_estimators=60, learning_rate=1.0, max_depth=3, random_state=0)
clf.fit(train_set, train_labels)
y_train = clf.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_train, train_labels))
y_test = clf.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_test, test_labels))
recall = recall_score(y_test, test_labels, average='binary')
print('Recall =', recall)
precision = precision_score(y_test, test_labels, average='binary')
print('Precision =', precision)

For 60 trees in the classifier
Generalisation Error = 0.0014814205176741924
Validation Error = 0.004483048472961659
Recall = 0.9964594483326472
Precision = 0.9982677555060628


In [54]:
# number of trees = 80
print('For 80 trees in the classifier')
clf = GradientBoostingClassifier(n_estimators=80, learning_rate=1.0, max_depth=3, random_state=0)
clf.fit(train_set, train_labels)
y_train = clf.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_train, train_labels))
y_test = clf.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_test, test_labels))
recall = recall_score(y_test, test_labels, average='binary')
print('Recall =', recall)
precision = precision_score(y_test, test_labels, average='binary')
print('Precision =', precision)

For 80 trees in the classifier
Generalisation Error = 0.0014814205176741924
Validation Error = 0.004483048472961659
Recall = 0.9964594483326472
Precision = 0.9982677555060628


In [55]:
# number of trees = 100
print('For 100 trees in the classifier')
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0)
clf.fit(train_set, train_labels)
y_train = clf.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_train, train_labels))
y_test = clf.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_test, test_labels))
recall = recall_score(y_test, test_labels, average='binary')
print('Recall =', recall)
precision = precision_score(y_test, test_labels, average='binary')
print('Precision =', precision)

For 100 trees in the classifier
Generalisation Error = 0.0014814205176741924
Validation Error = 0.004483048472961659
Recall = 0.9964594483326472
Precision = 0.9982677555060628


The Classifier with 100 trees provides the lowest training and testing errors along with the best recall and precision over the test dataset.

In [61]:
clf_tree = DecisionTreeClassifier(max_depth=7)
print('For a single Decision Tree:')
clf_tree.fit(train_set, train_labels)
y_tree_train = clf_tree.predict(train_set)
print('Generalisation Error =', 1-accuracy_score(y_tree_train, train_labels))

y_tree_test = clf_tree.predict(test_set)
print('Validation Error =', 1-accuracy_score(y_tree_test, test_labels))

For a single Decision Tree:
Generalisation Error = 0.003827003003991636
Validation Error = 0.004763239002521735


Hence,   
For a single Decision Tree:  
Generalisation Error = 0.003827003003991636  
Validation Error = 0.004763239002521735  

and,  
For 100 trees in the classifier   
Generalisation Error = 0.0014814205176741924   
Validation Error = 0.004483048472961659    
Recall = 0.9964594483326472   
Precision = 0.9982677555060628   
  
Hence, the GradientBoostingClassifier provides better accuracy over the training and the testing datasets