In [34]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column cap).
# Calls the public `pd.set_option` directly instead of reaching it through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from boruta import BorutaPy

In [55]:
# Read the training and test CSVs; both paths are relative, so the files are
# looked up in the notebook's working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("441D1train.csv", "441D1test.csv")
)

In [56]:
# Remove the 'Id' column from both frames.
# Re-running this cell without reloading the CSVs raises KeyError, because the
# column is already gone.
train = train.drop(columns='Id')
test = test.drop(columns='Id')

In [4]:
# Feature matrix: the first eight columns (selected by position) of `train`.
X_train = train.iloc[:, :8]

In [5]:
# Target vector: the 'Default' column of `train`.
Y_train = train.loc[:, 'Default']

In [None]:
# selecting features using Boruta
# This cell used `rf1`, `X_train_val` and `Y_train_val` without the notebook ever
# defining them, so it raised NameError on a fresh kernel. They are now built
# here from names the notebook does define.
# NOTE(review): the original estimator settings are not recorded in the notebook;
# the ones below follow the BorutaPy README example — confirm they reproduce the
# selection stated at the bottom of this cell.
rf1 = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
# BorutaPy is fed NumPy arrays rather than pandas objects.
# NOTE(review): assumes all eight columns of X_train are numeric — confirm.
X_train_val = X_train.values
Y_train_val = Y_train.values
feat_selector = BorutaPy(rf1, n_estimators='auto', verbose=2, random_state=1)
feat_selector.fit(X_train_val, Y_train_val)
# Boolean mask aligned with the columns of X_train; True marks a feature Boruta kept.
feat_selector.support_
# Hence we choose 5 columns (Age, Experience, Car_Ownership, CURRENT_JOB_YRS, CURRENT_HOUSE_YRS)

In [6]:
# Hold out 25% of the labelled rows for evaluation; the fixed seed keeps the
# split repeatable between runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    X_train,
    Y_train,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Keep only the five chosen predictors (Age, Experience, Car_Ownership,
# CURRENT_JOB_YRS, CURRENT_HOUSE_YRS) from the training split.
train_1 = train_features.loc[
    :, ['Age', 'Experience', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
]

In [8]:
# Same five predictors, taken from the held-out split.
test_1 = test_features.loc[
    :, ['Age', 'Experience', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
]

In [9]:
# Standardise the features: the scaler learns its statistics from the training
# split only, and that same fitted transformation is applied to both splits.
sc = StandardScaler().fit(train_1)
train_1 = sc.transform(train_1)
test_1 = sc.transform(test_1)

In [10]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Default hyper-parameters with the seed pinned to 0, trained on the scaled
# training split.
lr_clf = LogisticRegression(random_state=0).fit(train_1, train_labels)
# Last expression of the cell, so the fitted estimator is displayed.
lr_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Predict the held-out split with the logistic-regression model and report how
# often the prediction differs from the true label.
predictions_lr = lr_clf.predict(test_1)
# Element-wise |prediction - truth|.
# NOTE(review): reading the mean of this as an error rate assumes 0/1 labels — confirm.
error_lr = abs(predictions_lr - test_labels)
# The label used to read 'Mean Absolute Error: ... degrees.'; a classification
# error has no degree unit, so the label now names what is actually reported.
print('Misclassification rate:', round(np.mean(error_lr), 2))

Mean Absolute Error: 0.01 degrees.


In [12]:
# Logistic-regression metrics: accuracy on the data it was trained on, then
# accuracy, recall and precision on the held-out split.
print('Accuracy on training set:', lr_clf.score(X=train_1, y=train_labels))
print('Accuracy score:', accuracy_score(y_true=test_labels, y_pred=predictions_lr))
print('Recall score:', recall_score(y_true=test_labels, y_pred=predictions_lr))
print('Precision score:', precision_score(y_true=test_labels, y_pred=predictions_lr))

Accuracy on training set: 0.9919
Accuracy score: 0.9911
Recall score: 0.9910633597750779
Precision score: 0.9910633597750779


In [13]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the scaled training
# split and then used to predict the held-out split.
nb_clf = GaussianNB().fit(train_1, train_labels)
predictions_nb = nb_clf.predict(test_1)

In [14]:
# Naive-Bayes evaluation on the held-out split.
# Element-wise |prediction - truth|.
# NOTE(review): reading the mean of this as an error rate assumes 0/1 labels — confirm.
error_nb = abs(predictions_nb - test_labels)
# The label used to read 'Mean Absolute Error: ... degrees.'; a classification
# error has no degree unit, so the label now names what is actually reported.
print('Misclassification rate:', round(np.mean(error_nb), 2))
print('Accuracy on training set:', nb_clf.score(train_1, train_labels))
print('Accuracy score:', accuracy_score(test_labels, predictions_nb))
print('Recall score:', recall_score(test_labels, predictions_nb))
print('Precision score:', precision_score(test_labels, predictions_nb))

Mean Absolute Error: 0.06 degrees.
Accuracy on training set: 0.9388166666666666
Accuracy score: 0.93925
Recall score: 0.9450748067075008
Precision score: 0.9337301587301587


In [15]:
# Random Forest
# 1000 trees with the seed pinned to 0, trained on the scaled training split
# and then used to predict the held-out split.
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=0).fit(
    train_1, train_labels
)
predictions_rf = rf_clf.predict(test_1)

In [16]:
# Random-forest evaluation on the held-out split.
# Element-wise |prediction - truth|.
# NOTE(review): reading the mean of this as an error rate assumes 0/1 labels — confirm.
error_rf = abs(predictions_rf - test_labels)
# The label used to read 'Mean Absolute Error: ... degrees.'; a classification
# error has no degree unit, so the label now names what is actually reported.
print('Misclassification rate:', round(np.mean(error_rf), 2))
print('Accuracy on training set:', rf_clf.score(train_1, train_labels))
print('Accuracy score:', accuracy_score(test_labels, predictions_rf))
print('Recall score:', recall_score(test_labels, predictions_rf))
print('Precision score:', precision_score(test_labels, predictions_rf))

Mean Absolute Error: 0.02 degrees.
Accuracy on training set: 0.9991333333333333
Accuracy score: 0.98425
Recall score: 0.9844361883723265
Precision score: 0.9839421918908069


In [17]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# The tree was created without a seed, so its scores could change from run to
# run. The seed is now pinned to 0, the value the other seeded models in this
# notebook use, so a Restart-and-Run-All reproduces the same tree.
dec_clf = DecisionTreeClassifier(random_state=0)
dec_clf.fit(train_1, train_labels)
# Predictions for the held-out split.
predictions_dec = dec_clf.predict(test_1)

In [18]:
# Decision-tree evaluation on the held-out split.
# Element-wise |prediction - truth|.
# NOTE(review): reading the mean of this as an error rate assumes 0/1 labels — confirm.
error_dec = abs(predictions_dec - test_labels)
# The label used to read 'Mean Absolute Error: ... degrees.'; a classification
# error has no degree unit, so the label now names what is actually reported.
print('Misclassification rate:', round(np.mean(error_dec), 2))
print('Accuracy on training set:', dec_clf.score(train_1, train_labels))
print('Accuracy score:', accuracy_score(test_labels, predictions_dec))
print('Recall score:', recall_score(test_labels, predictions_dec))
print('Precision score:', precision_score(test_labels, predictions_dec))

Mean Absolute Error: 0.02 degrees.
Accuracy on training set: 0.9991333333333333
Accuracy score: 0.9791
Recall score: 0.9791143689125414
Precision score: 0.9789177793394237


In [19]:
# Hard-voting ensemble of logistic regression, naive Bayes and a decision tree
# (SVC is imported here but is not one of the ensemble members).
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', lr_clf),
        ('nb', nb_clf),
        ('decision', dec_clf),
    ],
)
# Last expression of the cell, so the fitted ensemble is displayed.
voting_clf.fit(train_1, train_labels)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('nb',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('decision',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None

In [20]:
# Accuracy of the three-member ensemble on the training split and on the
# held-out split.
print("Accuracy on training set", voting_clf.score(X=train_1, y=train_labels))
print("Accuracy score", voting_clf.score(X=test_1, y=test_labels))

0.996
0.98735


In [21]:
# Hard-voting ensemble of logistic regression and the decision tree only.
voting_clf2 = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', lr_clf),
        ('decision', dec_clf),
    ],
)
# Last expression of the cell, so the fitted ensemble is displayed.
voting_clf2.fit(train_1, train_labels)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('decision',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                        

In [22]:
# Evaluate the two-member ensemble on the held-out split.
pred_clf2 = voting_clf2.predict(test_1)
# Element-wise |prediction - truth|.
# NOTE(review): reading the mean of this as an error rate assumes 0/1 labels — confirm.
error_clf2 = abs(pred_clf2 - test_labels)
# The label used to read 'Mean Absolute Error: ... degrees.'; a classification
# error has no degree unit, so the label now names what is actually reported.
print('Misclassification rate:', round(np.mean(error_clf2), 2))
print('Accuracy on training set:', voting_clf2.score(train_1, train_labels))
print('Accuracy score:', accuracy_score(test_labels, pred_clf2))
print('Recall score:', recall_score(test_labels, pred_clf2))
print('Precision score:', precision_score(test_labels, pred_clf2))

Mean Absolute Error: 0.01 degrees.
Accuracy on training set: 0.99575
Accuracy score: 0.9851
Recall score: 0.9719851390701878
Precision score: 0.9980410351582637


In [30]:
# XGBoost
from xgboost import XGBClassifier

# Gradient-boosted trees with the library defaults, trained on the scaled
# training split.
xgb_clf = XGBClassifier().fit(train_1, train_labels)
# Last expression of the cell, so the fitted estimator is displayed.
xgb_clf





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=True, validate_parameters=1, verbosity=None)

In [32]:
# XGBoost accuracy on the training split and on the held-out split; the
# predictions are rounded before they are compared with the labels.
print(
    'Accuracy score on train_data: ',
    accuracy_score(train_labels, xgb_clf.predict(train_1).round()),
)
print(
    'Accuracy score on test_data: ',
    accuracy_score(test_labels, xgb_clf.predict(test_1).round()),
)

Accuracy score on train_data:  0.9985166666666667
Accuracy score on test_data:  0.98955


In [57]:
# Reduce the competition test frame to the same five predictors the models
# were trained on.
test = test.loc[
    :, ['Age', 'Experience', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
]

In [58]:
# Scale the competition test set with the scaler that was fitted on the
# training split. The cell used `sc.fit_transform(test)`, which refitted the
# scaler on the test data: the test features were then standardised with
# different statistics from the ones the models were trained on, and `sc`
# itself was overwritten for any later use.
test = sc.transform(test)
# Prediction on test data using XGBoost
test_predictions_xgb = xgb_clf.predict(test)

In [59]:
# Wrap the XGBoost predictions in a DataFrame whose only column is "Default".
test_predictions_xgb = pd.DataFrame(test_predictions_xgb, columns=["Default"])

In [60]:
# Saving prediction as csv file
# The destination was an absolute path inside one user's Windows profile, which
# fails on any other machine. The file is now written relative to the working
# directory, the same place the input CSVs are read from.
# NOTE(review): the "Id" column written here is the frame's own index, not the
# 'Id' column that was dropped from the test data earlier — confirm the two
# line up with what the submission format expects.
test_predictions_xgb.to_csv("final_xgb.csv", header=True, index_label="Id")

In [61]:
# Display the XGBoost prediction frame (a single 'Default' column).
test_predictions_xgb

Unnamed: 0,Default
0,1.0
1,1.0
2,1.0
3,0.0
4,1.0
...,...
19995,0.0
19996,0.0
19997,0.0
19998,0.0


In [None]:
# Prediction on test data using logistic regression.
# `test` is used as-is here; it must already hold the selected, scaled features
# produced by the earlier cells.
test_predictions_lr = pd.DataFrame(lr_clf.predict(test), columns=["Default"])

In [None]:
# Saving prediction as csv file
# The destination was an absolute path inside one user's Windows profile, which
# fails on any other machine. The file is now written relative to the working
# directory, the same place the input CSVs are read from.
# NOTE(review): the "Id" column written here is the frame's own index, not the
# 'Id' column that was dropped from the test data earlier — confirm the two
# line up with what the submission format expects.
test_predictions_lr.to_csv("final_lr.csv", header=True, index_label="Id")

In [None]:
# Display the logistic-regression prediction frame (a single 'Default' column).
test_predictions_lr 