#!/usr/bin/env python
# coding: utf-8
# <a id="lib"></a>
# # 1. Import Libraries
# In[1]:
# import 'Pandas'
import pandas as pd
# import 'Numpy'
import numpy as np
# import subpackage of Matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# import 'Seaborn'
import seaborn as sns
# suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')
# display all columns of the dataframe
pd.options.display.max_columns = None
# display all rows of the dataframe
pd.options.display.max_rows = None
# display the float values upto 6 decimal places
pd.options.display.float_format = '{:.6f}'.format
# import train-test split
from sklearn.model_selection import train_test_split
# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler
# import various functions from sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# import the XGBoost function for classification
from xgboost import XGBClassifier
# In[2]:
# plot size using 'rcParams'
plt.rcParams['figure.figsize'] = [15,8]
# <a id="prep"></a>
# # 2. Data Preparation
# <a id="read"></a>
# ## 2.1 Read the Data
# In[3]:
# load the csv file
df_admissions = pd.read_csv('Admission_predict.csv')
# display first five observations using head()
df_admissions.head()
# In[4]:
# 'shape' to check the dimensions
df_admissions.shape
# **Interpretation:** The data has 400 observations and 9 variables.
# <a id="dtype"></a>
# ## 2.2 Check the Data Type
# In[5]:
# using 'dtypes' to check the data type of a variable
df_admissions.dtypes
# **Interpretation:** The variables `GRE Score`, `TOEFL Score`, `University Rating`, `SOP`, `LOR` and `CGPA` are numerical.
#
# From the above output, we see that the data type of `Research` is 'int64'.
#
# But according to the data definition, `Research` is a categorical variable that is wrongly interpreted as 'int64', so we will convert this variable's data type to 'object'.
# In[6]:
# convert numerical variable to categorical (object)
df_admissions['Research'] = df_admissions['Research'].astype(object)
# In[7]:
# recheck the data types using 'dtypes'
df_admissions.dtypes
# **Interpretation:** Now, all the variables have the correct data type.
# <a id="drop"></a>
# ## 2.3 Remove Insignificant Variables
# In[8]:
# drop the column 'Serial No.' using drop()
df_admissions = df_admissions.drop('Serial No.', axis = 1)
# <a id="dist"></a>
# ## 2.4 Distribution of Variables
# **Distribution of numeric independent variables.**
# In[9]:
# for the independent numeric variables, plot the histogram to check the distribution of the variables
df_admissions.drop('Chance of Admit', axis = 1).hist()
# adjust the subplots
plt.tight_layout()
# display the plot
plt.show()
# print the skewness for each numeric independent variable
print('Skewness:')
# drop the target variable using drop()
df_admissions.drop('Chance of Admit', axis = 1).skew()
# **Interpretation:** The above plot indicates that all the variables are approximately normally distributed.
# **Distribution of categoric independent variable.**
# In[10]:
# for the independent categoric variable, plot the count plot to check the distribution of the variable 'Research'
sns.countplot(x = df_admissions['Research'])
# plot and axes labels
plt.title('Count Plot for Categorical Variable (Research)', fontsize = 15)
plt.xlabel('Research', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
# display the plot
plt.show()
# #### Distribution of dependent variable.
# In[11]:
# only the target variable
df_target = df_admissions['Chance of Admit'].copy()
# counts of 0's and 1's in the 'Chance of Admit' variable
df_target.value_counts()
# plot the countplot of the variable 'Chance of Admit'
sns.countplot(x = df_target)
# code to print the values in the graph
plt.text(x = -0.05, y = df_target.value_counts()[0] + 1, s = str(round((df_target.value_counts()[0])*100/len(df_target),2)) + '%')
plt.text(x = 0.95, y = df_target.value_counts()[1] +1, s = str(round((df_target.value_counts()[1])*100/len(df_target),2)) + '%')
# plot and axes labels
plt.title('Count Plot for Target Variable (Chance of Admit)', fontsize = 15)
plt.xlabel('Target Variable', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
# to show the plot
plt.show()
# **Interpretation:** The above plot shows that there is no imbalance in the target variable.
# <a id="null"></a>
# ## 2.5 Missing Value Treatment
# In[12]:
# sorting the variables on the basis of total null values in the variable
Total = df_admissions.isnull().sum().sort_values(ascending=False)
# calculating percentage of missing values
Percent = (df_admissions.isnull().sum()*100/df_admissions.isnull().count()).sort_values(ascending=False)
# concatenating the 'Total' and 'Percent' columns using 'concat' function
missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])
missing_data
# **Interpretation:** The above output shows that there are no missing values in the data.
# <a id="dummy"></a>
# ## 2.6 Dummy Encode the Categorical Variables
# In[13]:
# storing the target variable 'Chance of Admit' in a dataframe 'df_target'
df_target = df_admissions['Chance of Admit']
# storing all the independent variables in a dataframe 'df_feature'
df_feature = df_admissions.drop('Chance of Admit', axis = 1)
# In[14]:
# filtering the numerical features in the dataset
df_num = df_feature.select_dtypes(include = [np.number])
# displaying numerical features
df_num.columns
# In[15]:
# filtering the categorical features in the dataset
df_cat = df_feature.select_dtypes(include = [object])
# displaying categorical features
df_cat.columns
# In[16]:
# using 'get_dummies' from pandas to create dummy variables
dummy_var = pd.get_dummies(data = df_cat, drop_first = True)
# In[17]:
# concat the dummy variables with numeric features to create a dataframe of all independent variables
X = pd.concat([df_num, dummy_var], axis = 1)
# display first five observations
X.head()
# <a id="split"></a>
# ## 2.7 Train-Test Split
# In[18]:
# split data into train subset and test subset
X_train, X_test, y_train, y_test = train_test_split(X, df_target, random_state = 10, test_size = 0.2)
# check the dimensions of the train & test subset using 'shape'
print('X_train', X_train.shape)
print('y_train', y_train.shape)
# print dimension of test set
print('X_test', X_test.shape)
print('y_test', y_test.shape)
# #### A generalized function to calculate the metrics for the test set.
# In[19]:
# create a generalized function to calculate the metrics values for test set
def get_test_report(model):
    # predict on the test set
    test_pred = model.predict(X_test)
    # return the classification report for test data
    return(classification_report(y_test, test_pred))
# #### Plotting the confusion matrix.
# In[20]:
def plot_confusion_matrix(model):
    # predict on the test set
    y_pred = model.predict(X_test)
    # create a confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    # label the confusion matrix
    conf_matrix = pd.DataFrame(data = cm, columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])
    # plot a heatmap to visualize the confusion matrix
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = ListedColormap(['lightskyblue']), cbar = False,
                linewidths = 0.1, annot_kws = {'size':25})
    # set the font size of x-axis ticks using 'fontsize'
    plt.xticks(fontsize = 20)
    # set the font size of y-axis ticks using 'fontsize'
    plt.yticks(fontsize = 20)
    # display the plot
    plt.show()
# #### Plotting the ROC curve.
# In[21]:
def plot_roc(model):
    # predicted probabilities for the positive class
    y_pred_prob = model.predict_proba(X_test)[:,1]
    # roc_curve() returns the false positive rate, true positive rate and thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    # plot the ROC curve
    plt.plot(fpr, tpr)
    # set limits for x and y axes
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    # plot the diagonal representing a purely random classifier
    plt.plot([0, 1], [0, 1], 'r--')
    # add plot and axes labels
    plt.title('ROC curve for Admission Prediction Classifier', fontsize = 15)
    plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
    plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
    # add the AUC score to the plot
    plt.text(x = 0.82, y = 0.3, s = 'AUC Score: ' + str(round(roc_auc_score(y_test, y_pred_prob), 4)))
    # plot the grid
    plt.grid(True)
    # display the plot
    plt.show()
# <a id="boosting"></a>
# # 3. Boosting Methods
# <a id="ada"></a>
# ## 3.1 AdaBoost
# In[22]:
# instantiate the 'AdaBoostClassifier'
ada_model = AdaBoostClassifier(n_estimators = 40, random_state = 10)
# fit the model using fit() on train data
ada_model.fit(X_train, y_train)
# Let us understand the parameters in the `AdaBoostClassifier()`:
#
# `algorithm=SAMME.R`: It is the default boosting algorithm. This algorithm uses predicted class probabilities to build the stumps.
#
# `base_estimator=None`: By default, the estimator is a decision tree with a maximum depth equal to 1 (stump).
#
# `learning_rate=1.0`: Scales the contribution of each estimator in the ensemble.
#
# `n_estimators=40`: The number of estimators at which boosting is terminated.
#
# `random_state=10`: Ensures reproducible results across runs.
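# For reference, a minimal sketch (not part of the original notebook; 'ada_explicit' is a
# hypothetical name) that spells these defaults out explicitly. Under the scikit-learn
# version assumed here (newer releases rename 'base_estimator' to 'estimator' and drop
# 'SAMME.R'), it should behave the same as 'ada_model' above:
ada_explicit = AdaBoostClassifier(base_estimator = None,    # defaults to a depth-1 decision tree (stump)
                                  algorithm = 'SAMME.R',    # the default, probability-based boosting
                                  learning_rate = 1.0,
                                  n_estimators = 40,
                                  random_state = 10)
ada_explicit.fit(X_train, y_train)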
# #### Plotting the confusion matrix.
# In[23]:
# call the function to plot the confusion matrix
plot_confusion_matrix(ada_model)
# **Calculating performance measures on the test set.**
# In[24]:
# compute the performance measures on test data
test_report = get_test_report(ada_model)
# print the performance measures
print(test_report)
# **Interpretation:** The output shows that the model is 81% accurate.
# #### Plotting the ROC curve.
# In[25]:
# call the function to plot the ROC curve
plot_roc(ada_model)
# **Interpretation:** The red dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).<br>
# From the above plot, we can see that the AdaBoost model stays well away from the dotted line, with an AUC score of 0.9132.
# <a id="gradient"></a>
# ## 3.2 Gradient Boosting
# In[26]:
# instantiate the 'GradientBoostingClassifier'
gboost_model = GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)
# fit the model using fit() on train data
gboost_model.fit(X_train, y_train)
# Let us understand the parameters in the `GradientBoostingClassifier()`:
#
# `ccp_alpha=0.0`: The complexity parameter used for pruning. By default, there is no pruning.
#
# `criterion=friedman_mse`: The criteria to measure the quality of a split.
#
# `init=None`: The estimator for initial predictions.
#
# `learning_rate=0.1`: Scales the contribution of each estimator in the ensemble.
#
# `loss=deviance`: The loss function to be optimized.
#
# `max_depth=10`: Assigns the maximum depth of the tree.
#
# `max_features=None`: Maximum features to consider for the split.
#
# `max_leaf_nodes=None`: Maximum number of leaf/terminal nodes in the tree.
#
# `min_impurity_decrease=0.0`: A node splits if it decreases the impurity by the value given by this parameter.
#
# `min_impurity_split=None`: Minimum value of impurity for a node to split.
#
# `min_samples_leaf=1`: Minimum number of samples needed at the leaf/terminal node.
#
# `min_samples_split=2`: Minimum number of samples needed at the internal node to split.
#
# `min_weight_fraction_leaf=0.0`: Minimum weighted fraction needed at a leaf node.
#
# `n_estimators=150`: The number of estimators to consider.
#
# `n_iter_no_change=None`: Number of iterations after which the training should terminate if the score is not improving.
#
# `presort='deprecated'`: It considers whether to presort the data. (This parameter may not be available in the latest versions).
#
# `random_state=10`: Ensures reproducible results across runs.
#
# `subsample=1.0`: Fraction of samples to use for fitting each estimator.
#
# `tol=0.0001`: Value of tolerance to terminate the training.
#
# `validation_fraction=0.1`: Fraction of training dataset used for validation.
#
# `verbose=0`: Enables verbose output (by default, no progress will be printed).
#
# `warm_start=False`: Whether to reuse the solution of the previous call to fit (by default, it does not).
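# A minimal illustrative sketch (not part of the original notebook; 'gboost_explicit' is a
# hypothetical name) spelling out a few of the defaults listed above. Note that
# loss='deviance' is valid in the scikit-learn version assumed here (renamed 'log_loss'
# in newer releases); the model should behave the same as 'gboost_model' above:
gboost_explicit = GradientBoostingClassifier(loss = 'deviance',
                                             learning_rate = 0.1,
                                             n_estimators = 150,
                                             subsample = 1.0,
                                             criterion = 'friedman_mse',
                                             max_depth = 10,
                                             validation_fraction = 0.1,
                                             random_state = 10)
gboost_explicit.fit(X_train, y_train)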
# #### Plotting the confusion matrix.
# In[27]:
# call the function to plot the confusion matrix
plot_confusion_matrix(gboost_model)
# **Calculating performance measures on the test set.**
# In[28]:
# compute the performance measures on test data
test_report = get_test_report(gboost_model)
# print the performance measures
print(test_report)
# **Interpretation:** The classification report shows that the model is 79% accurate. Also, the sensitivity and specificity are equal.
# #### Plotting the ROC curve.
# In[29]:
# call the function to plot the ROC curve
plot_roc(gboost_model)
# **Interpretation:** The red dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).<br>
# From the above plot, we can see that the gradient boosting model stays well away from the dotted line, with an AUC score of 0.8954.
# <a id="xgboost"></a>
# ## 3.3 XGBoost
# In[30]:
# instantiate the 'XGBClassifier'
xgb_model = XGBClassifier(max_depth = 10, gamma = 1)
# fit the model using fit() on train data
xgb_model.fit(X_train, y_train)
# Let us understand the parameters in the `XGBClassifier()`:
#
# `base_score=0.5`: Initial prediction for base learners.
#
# `booster=gbtree`: Uses regression trees as the base learners.
#
# `colsample_bylevel=1`: Fraction of variables to consider for each level.
#
# `colsample_bynode=1`: Fraction of variables to consider for each split.
#
# `colsample_bytree=1`: Fraction of variables to consider for each tree.
#
# `gamma=1`: Value of minimum loss reduction required for the partition of the leaf node.
#
# `gpu_id=-1`: No specific GPU is selected (the default device is used).
#
# `importance_type=gain`: Importance type for calculating feature importance.
#
# `interaction_constraints=''`: By default, no interaction between the features is allowed.
#
# `learning_rate=0.300000012`: Scales the contribution of each estimator in the ensemble.
#
# `max_delta_step=0`: The maximum delta step allowed for each tree's weight estimation (0 means no constraint).
#
# `max_depth=10`: Maximum depth of each tree.
#
# `min_child_weight=1`: Minimum sum of hessian (p*(1-p)) required in a leaf node.
#
# `missing=nan`: Value to consider as a missing value.
#
# `monotone_constraints='()'`: Constraints on variable monotonicity (adding an increasing/decreasing constraint on the variables).
#
# `n_estimators=100`: The number of estimators to consider.
#
# `n_jobs=0`: Number of parallel threads to run the classifier.
#
# `num_parallel_tree=1`: It is used for boosting random forest.
#
# `objective='binary:logistic'`: Considers the binary logistic regression as a learning objective.
#
# `random_state=0`: Ensures reproducible results across runs.
#
# `reg_alpha=0`: Lasso regularization term for weights.
#
# `reg_lambda=1`: Ridge regularization term for weights.
#
# `scale_pos_weight=1`: Ratio of the number of negative-class samples to positive-class samples.
#
# `subsample=1`: Fraction of total training data points.
#
# `tree_method='exact'`: Considers the exact greedy algorithm.
#
# `validate_parameters=1`: Performs validation on input parameters.
#
# `verbosity=None`: Enables verbose output (by default, no progress will be printed).
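# A minimal illustrative sketch (not part of the original notebook; 'xgb_explicit' is a
# hypothetical name) making a few of the defaults listed above explicit; it mirrors
# 'xgb_model' above:
xgb_explicit = XGBClassifier(max_depth = 10,
                             gamma = 1,
                             learning_rate = 0.3,           # shown above as 0.300000012
                             n_estimators = 100,
                             objective = 'binary:logistic',
                             subsample = 1)
xgb_explicit.fit(X_train, y_train)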
# #### Plotting the confusion matrix.
# In[31]:
# call the function to plot the confusion matrix
plot_confusion_matrix(xgb_model)
# **Calculating performance measures on the test set.**
# In[32]:
# compute the performance measures on test data
test_report = get_test_report(xgb_model)
# print the performance measures
print(test_report)
# **Interpretation:** The above output shows that the f1-score and accuracy of the model are 0.84.
# #### Plotting the ROC curve.
# In[33]:
# pass the XGBoost model to the function
plot_roc(xgb_model)
# **Interpretation:** The red dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).<br>
# From the above plot, we can see that the XGBoost model stays well away from the dotted line, with an AUC score of 0.8888.
# <a id="tuning"></a>
# ### 3.3.1 Tuning the Hyperparameters (GridSearchCV)
# In[34]:
# create a dictionary with hyperparameters and its values
tuning_parameters = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
'max_depth': range(3,10),
'gamma': [0, 1, 2, 3, 4]}
# instantiate the 'XGBClassifier'
xgb_model = XGBClassifier()
# use GridSearchCV() to find the optimal value of the hyperparameters
xgb_grid = GridSearchCV(estimator = xgb_model, param_grid = tuning_parameters, cv = 3, scoring = 'roc_auc')
# fit the model on X_train and y_train using fit()
xgb_grid.fit(X_train, y_train)
# get the best parameters
print('Best parameters for XGBoost classifier: ', xgb_grid.best_params_, '\n')
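# Note: GridSearchCV refits the best configuration on the whole training set by default
# (refit=True), so the tuned model is also available directly as 'xgb_grid.best_estimator_';
# the cell below re-instantiates it explicitly instead, which should be equivalent.
print('Classification Report (via best_estimator_):\n', get_test_report(xgb_grid.best_estimator_))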
# #### Building the model using the tuned hyperparameters.
# In[35]:
# instantiate the 'XGBClassifier'
xgb_grid_model = XGBClassifier(learning_rate = xgb_grid.best_params_.get('learning_rate'),
max_depth = xgb_grid.best_params_.get('max_depth'),
gamma = xgb_grid.best_params_.get('gamma'))
# use fit() to fit the model on the train set
xgb_model = xgb_grid_model.fit(X_train, y_train)
# print the performance measures for test set for the model with best parameters
print('Classification Report for test set:\n', get_test_report(xgb_model))
# **Interpretation:** The above output shows that the f1-score and accuracy of the model are 0.84.
# #### Plotting the ROC curve.
# In[36]:
# call the function to plot the ROC curve
plot_roc(xgb_model)
# **Interpretation:** The red dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).<br>
# From the above plot, we can see that the XGBoost model (GridSearchCV) stays well away from the dotted line, with an AUC score of 0.9145.
# ### Identifying the Important Features using XGBoost
# In[37]:
# create a dataframe that stores the feature names and their importance
important_features = pd.DataFrame({'Features': X_train.columns,
'Importance': xgb_model.feature_importances_})
# sort the dataframe in the descending order according to the feature importance
important_features = important_features.sort_values('Importance', ascending = False)
# create a barplot to visualize the features based on their importance
sns.barplot(x = 'Importance', y = 'Features', data = important_features)
# add plot and axes labels
plt.title('Feature Importance', fontsize = 15)
plt.xlabel('Importance', fontsize = 15)
plt.ylabel('Features', fontsize = 15)
# display the plot
plt.show()
# **Interpretation:** The above bar plot shows that the variable `CGPA` is of the highest importance.
# <a id="stack"></a>
# # 4. Stack Generalization
# In[38]:
# initialize the standard scaler
X_scaler = StandardScaler()
# scale all the numerical columns
num_scaled = X_scaler.fit_transform(df_num)
# create a dataframe of scaled numerical variables
df_num_scaled = pd.DataFrame(num_scaled, columns = df_num.columns)
# Concatenate scaled numerical and dummy encoded categorical variables.
# In[39]:
# concat the dummy variables with scaled numeric features to create a dataframe of all independent variables
X = pd.concat([df_num_scaled, dummy_var], axis = 1)
# display first five observations
X.head()
# Let us split the dataset in train and test set.
# In[40]:
# split data into train subset and test subset
X_train, X_test, y_train, y_test = train_test_split(X, df_target, random_state = 10, test_size = 0.2)
# check the dimensions of the train & test subset using 'shape'
print('X_train', X_train.shape)
print('y_train', y_train.shape)
# print dimension of test set
print('X_test', X_test.shape)
print('y_test', y_test.shape)
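# Note: the scaler above was fit on all observations before the split. To avoid test-set
# leakage, an alternative (illustrative sketch only; the 'alt_' names are not from the
# original notebook) is to fit the scaler on the training split and apply the same
# transform to the test split; here it would also standardize the dummy column:
alt_scaler = StandardScaler().fit(X_train)
X_train_alt = pd.DataFrame(alt_scaler.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test_alt = pd.DataFrame(alt_scaler.transform(X_test), columns = X_test.columns, index = X_test.index)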
# #### Stacking Classifier using Random Forest, KNN, and Naive Bayes as base learners.
# In[41]:
# consider the various algorithms as base learners
base_learners = [('rf_model', RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 'sqrt',
max_leaf_nodes = 8, min_samples_leaf = 5, min_samples_split = 2,
n_estimators = 50, random_state = 10)),
('KNN_model', KNeighborsClassifier(n_neighbors = 17, metric = 'euclidean')),
('NB_model', GaussianNB())]
# initialize stacking classifier
stack_model = StackingClassifier(estimators = base_learners, final_estimator = GaussianNB())
# fit the model on train dataset
stack_model.fit(X_train, y_train)
# #### Plotting the confusion matrix.
# In[42]:
# call the function to plot the confusion matrix
plot_confusion_matrix(stack_model)
# **Calculating performance measures on the test set.**
# In[43]:
# compute the performance measures on test data
test_report = get_test_report(stack_model)
# print the performance measures
print(test_report)
# **Interpretation:** The above output shows that the f1-score and accuracy of the model are 0.86.
# #### Plotting the ROC curve.
# In[44]:
# call the function to plot the ROC curve
plot_roc(stack_model)
# **Interpretation:** The red dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).<br>
# From the above plot, we can see that the stacking model stays well away from the dotted line, with an AUC score of 0.9492.