## Loading Data

In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('Student Performance.csv')

# Obtain label features

In [2]:
# convert these data into 2D numpy array
# order: math percentage, reading score percentage, writing score percentage
X = np.array([[df['math percentage'][0], 
              df['reading score percentage'][0], 
              df['writing score percentage'][0]]], 
             dtype = 'float')
for i in range(1,1000):
    X = np.append(X, [[df['math percentage'][i], 
                  df['reading score percentage'][i], 
                  df['writing score percentage'][i]]],axis=0)
print('feature matrix')
print(X)
print('size: ' + str(X.shape))
print()

# completed = 0, none = 1
if(df['test preparation course'][0] == 'completed'):
    first_elem = 0
else:
    first_elem = 1
    
y = np.array([first_elem], dtype = 'int')
for i in range(1,1000):
    if(df['test preparation course'][i] == 'completed'):
        y = np.append(y, 0)
    else:
        y = np.append(y, 1)
print('Labels')
print(y)
print('size: ' + str(y.shape))

feature matrix
[[0.72 0.72 0.74]
 [0.69 0.9  0.88]
 [0.9  0.95 0.93]
 ...
 [0.59 0.71 0.65]
 [0.68 0.78 0.77]
 [0.77 0.86 0.86]]
size: (1000, 3)

Labels
[1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 0 1 1 1 1 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0
 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1 1 1 1 1
 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 0
 0 1 0 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 0 0 1 1 0 0 1
 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1
 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 1
 0 0 0 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0
 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0
 1 0 0 0 0 1 0 0 1 0 1

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Use sklearn functions to split the dataset into testing and training sets with the following parameters: test_size=0.3, random_state=6.

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

 # Voting

In [5]:
clf1 = KNeighborsClassifier(n_neighbors=64)
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier(max_depth=3)
clf4 = RandomForestClassifier(n_estimators=40)

In [6]:
eclf1 = VotingClassifier (estimators= [('KNN', clf1), ('lr', clf2), ('dt', clf3), ('rf', clf4)], voting = 'hard')
eclf1 = eclf1.fit(X_train, y_train)

In [7]:
eclf2 = VotingClassifier (estimators= [('KNN', clf1), ('lr', clf2), ('dt', clf3), ('rf', clf4)], voting = 'soft')
eclf2 = eclf2.fit(X_train, y_train)

## Accuracy

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
y_predict = eclf1.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.6833333333333333


In [10]:
y_predict = eclf2.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.6866666666666666


#### Using BootStrap

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

bootstarp_size = int(np.floor( 0.8 * len(X_train) ))
accuracy_list = list()

for i in range(0, 19):
    
    # Step1 (Bootstrapping)
    X_bag = resample(X_train, n_samples = bootstarp_size , random_state=i , replace = True)
    y_bag = resample(y_train, n_samples = bootstarp_size , random_state=i , replace = True)
    
    # Step2 (Training)
    Base_DecisionTree = DecisionTreeClassifier(random_state=3)
    Base_DecisionTree.fit(X_bag, y_bag)

    Base_logreg = LogisticRegression(max_iter=150)
    Base_logreg.fit(X_bag, y_bag)

    Base_knn = KNeighborsClassifier(n_neighbors=60)
    Base_knn.fit(X_bag, y_bag)

    Base_rf = RandomForestClassifier(n_estimators=19)
    Base_rf.fit(X_bag, y_bag)
        
    Base_eclf1 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'hard')
    Base_eclf1 = Base_eclf1.fit(X_bag, y_bag)
    
    Base_eclf2 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'soft')
    Base_eclf2 = Base_eclf1.fit(X_bag, y_bag)
    
    # Step3 (Base Learner Prediction)
    y_predict1 = Base_DecisionTree.predict(X_test)
    y_predict2 = Base_logreg.predict(X_test)
    y_predict3 = Base_knn.predict(X_test)
    y_predict4 = Base_rf.predict(X_test)
    y_predict5 = Base_eclf1.predict(X_test)
    y_predict6 = Base_eclf2.predict(X_test)
    
    accuracy1 = accuracy_score(y_test, y_predict1)
    accuracy2 = accuracy_score(y_test, y_predict2)
    accuracy3 = accuracy_score(y_test, y_predict3)
    accuracy4 = accuracy_score(y_test, y_predict4)
    accuracy5 = accuracy_score(y_test, y_predict5)
    accuracy6 = accuracy_score(y_test, y_predict6)

    # Step4 (Voting)
    curr_acc_list = [accuracy1, accuracy2, accuracy3, accuracy4, accuracy5, accuracy6]
    accuracy_list.append(max(curr_acc_list))
print(max(accuracy_list))

0.6733333333333333


#### Using BootStrap with Cross Validation

In [12]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

bootstarp_size = int(np.floor( 0.8 * len(X_train) ))
final_accuracy_list = list()

for i in range(0, 19):
    
    # Step1 (Bootstrapping)
    X_bag = resample(X_train, n_samples = bootstarp_size , random_state=i , replace = True)
    y_bag = resample(y_train, n_samples = bootstarp_size , random_state=i , replace = True)
    
    # Step2 (Training)
    Base_DecisionTree = DecisionTreeClassifier(random_state=3)

    Base_logreg = LogisticRegression(max_iter=150)

    Base_knn = KNeighborsClassifier(n_neighbors=60)

    Base_rf = RandomForestClassifier(n_estimators=19)
        
    Base_eclf1 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'hard')
    
    Base_eclf2 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'soft')
        
    # Step3 (Base Learner Prediction)
    accuracy_list1 = cross_val_score(Base_DecisionTree, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list2 = cross_val_score(Base_logreg, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list3 = cross_val_score(Base_knn, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list4 = cross_val_score(Base_rf, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list5 = cross_val_score(Base_eclf1, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list6 = cross_val_score(Base_eclf2, X_bag, y_bag, cv=10, scoring='accuracy')
    
    accuracy_cv1 = accuracy_list1.mean()
    accuracy_cv2 = accuracy_list2.mean()
    accuracy_cv3 = accuracy_list3.mean()
    accuracy_cv4 = accuracy_list4.mean()
    accuracy_cv5 = accuracy_list5.mean()
    accuracy_cv6 = accuracy_list6.mean()
        
    # Step4 (Voting)
    curr_max_acc = [accuracy_cv1, accuracy_cv2, accuracy_cv3, accuracy_cv4, accuracy_cv5, accuracy_cv6]
    final_accuracy_list.append(max(curr_max_acc))
    
print(max(final_accuracy_list))

0.8607142857142858


## Conclusion
### Model accuracy: 0.86 with Cross Validation
### Model accuracy: 0.71 without Cross Validation