## Loading Data

In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('Student Performance.csv')

## Obtain label features

In [2]:
# convert these data into 2D numpy array
# order: math percentage, reading score percentage, writing score percentage
X = np.array([[df['math percentage'][0], 
              df['reading score percentage'][0], 
              df['writing score percentage'][0]]], 
             dtype = 'float')
for i in range(1,1000):
    X = np.append(X, [[df['math percentage'][i], 
                  df['reading score percentage'][i], 
                  df['writing score percentage'][i]]],axis=0)
print('feature matrix')
print(X)
print('size: ' + str(X.shape))
print()

# lavel 0: master's degree
if(df['parental level of education'][0] == "master's degree"):
    first_elem = 0
# lavel 1: bachelor's degree
elif(df['parental level of education'][0] == "bachelor's degree"):
    first_elem = 1
# lavel 2: associate's degree
elif(df['parental level of education'][0] == "associate's degree"):
    first_elem = 2
# lavel 3: some college
elif(df['parental level of education'][0] == 'some college'):
    first_elem = 3
# lavel 4: high school
elif(df['parental level of education'][0] == 'high school'):
    first_elem = 4
# lavel 5: some high school
else:
    first_elem = 5
    
y = np.array([first_elem], dtype = 'int')
for i in range(1,1000):
    if(df['parental level of education'][i] == "master's degree"):
        y = np.append(y, 0)
    elif(df['parental level of education'][i] == "bachelor's degree"):
        y = np.append(y, 1)
    elif(df['parental level of education'][i] == "associate's degree"):
        y = np.append(y, 2)
    elif(df['parental level of education'][i] == 'some college'):
        y = np.append(y, 3)
    elif(df['parental level of education'][i] == 'high school'):
        y = np.append(y, 4)
    else:
        y = np.append(y, 5)
print('Labels')
print(y)
print('size: ' + str(y.shape))

feature matrix
[[0.72 0.72 0.74]
 [0.69 0.9  0.88]
 [0.9  0.95 0.93]
 ...
 [0.59 0.71 0.65]
 [0.68 0.78 0.77]
 [0.77 0.86 0.86]]
size: (1000, 3)

Labels
[1 3 0 2 3 2 3 3 4 4 2 2 4 3 0 5 4 5 0 2 4 3 3 5 1 0 3 1 4 0 3 3 0 3 3 2 2
 5 2 2 2 2 2 3 2 2 2 4 2 4 3 2 3 4 5 4 2 2 3 5 1 5 2 2 5 5 5 3 2 2 3 3 2 5
 5 2 5 1 5 0 2 4 3 2 4 3 3 2 3 5 1 4 4 2 3 2 5 3 3 1 3 1 2 4 3 3 0 2 2 5 2
 4 2 3 1 4 1 1 5 3 1 2 3 4 3 4 5 3 0 1 0 5 3 3 1 1 5 4 2 3 5 3 3 4 3 3 5 2
 1 2 5 1 2 1 5 3 4 3 2 2 2 3 0 4 0 1 4 0 4 3 4 5 3 2 1 0 4 2 0 5 0 3 4 2 5
 2 4 5 5 1 2 3 5 3 0 2 5 4 3 1 2 3 2 2 3 5 1 4 3 3 5 3 4 2 4 5 2 4 4 5 4 2
 5 5 2 0 3 4 5 3 3 2 1 5 1 2 1 5 3 2 4 1 4 3 5 2 2 4 4 4 5 3 5 0 4 3 2 2 3
 0 5 3 5 4 4 5 1 4 2 3 1 3 2 3 3 1 5 4 5 1 4 4 1 3 5 2 2 5 1 5 2 5 5 1 4 2
 5 2 4 2 3 5 2 2 2 3 3 5 2 4 2 1 1 2 1 4 0 2 1 2 4 4 3 5 4 3 3 3 2 5 4 2 2
 2 1 3 5 2 5 5 4 4 4 2 3 4 3 1 4 2 1 3 3 2 3 1 2 3 3 3 4 5 3 5 3 1 4 1 5 5
 3 3 5 3 1 2 5 0 5 1 2 2 0 5 5 3 1 2 4 0 5 3 3 2 5 4 4 2 5 5 5 3 3 4 4 5 2
 2 4 2 0 3 2 5 1 4 1 2

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## Use sklearn functions to split the dataset into testing and training sets with the following parameters: test_size=0.3, random_state=6.

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

## Ensemble Learning
### Ensemble Learning uses a group of machine learning algorithms, and then combine the results of them using some techniques such as Voting to achieve higher accuracy.

### Using BootStrap

In [5]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

bootstarp_size = int(np.floor( 0.8 * len(X_train) ))
accuracy_list = list()

for i in range(0, 19):
    
    # Step1 (Bootstrapping)
    X_bag = resample(X_train, n_samples = bootstarp_size , random_state=i , replace = True)
    y_bag = resample(y_train, n_samples = bootstarp_size , random_state=i , replace = True)
    
    # Step2 (Training)
    Base_DecisionTree = DecisionTreeClassifier(random_state=3)

    Base_logreg = LogisticRegression(max_iter=150)

    Base_knn = KNeighborsClassifier(n_neighbors=57)

    Base_rf = RandomForestClassifier(n_estimators=19)
        
    Base_eclf1 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'hard')
    Base_eclf1 = Base_eclf1.fit(X_bag, y_bag)
    
    Base_eclf2 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'soft')
    Base_eclf2 = Base_eclf2.fit(X_bag, y_bag)
    
    # Step3 (Base Learner Prediction)
    y_predict1 = Base_eclf1.predict(X_test)
    y_predict2 = Base_eclf2.predict(X_test)
    
    accuracy1 = accuracy_score(y_test, y_predict1)
    accuracy2 = accuracy_score(y_test, y_predict2)

    # Step4 (Voting)
    curr_acc_list = [accuracy1, accuracy2]
    accuracy_list.append(max(curr_acc_list))
print(max(accuracy_list))

0.25666666666666665


In [6]:
highest_val = 0
highest_val_index = 0
for i in range(0, len(accuracy_list)):
    if(highest_val < accuracy_list[i]):
        highest_val = accuracy_list[i]
        highest_val_index = i
print(highest_val_index)

18


#### Using BootStrap with Cross Validation

In [7]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

bootstarp_size = int(np.floor( 0.8 * len(X_train) ))
final_accuracy_list = list()

for i in range(0, 19):
    
    # Step1 (Bootstrapping)
    X_bag = resample(X_train, n_samples = bootstarp_size , random_state=i , replace = True)
    y_bag = resample(y_train, n_samples = bootstarp_size , random_state=i , replace = True)
    
    # Step2 (Training)
    Base_DecisionTree = DecisionTreeClassifier(random_state=3)

    Base_logreg = LogisticRegression(max_iter=150)

    Base_knn = KNeighborsClassifier(n_neighbors=164)

    Base_rf = RandomForestClassifier(n_estimators=19)
        
    Base_eclf1 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'hard')
    
    Base_eclf2 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'soft')
        
    # Step3 (Base Learner Prediction)
    accuracy_list1 = cross_val_score(Base_eclf1, X_bag, y_bag, cv=10, scoring='accuracy')
    accuracy_list2 = cross_val_score(Base_eclf2, X_bag, y_bag, cv=10, scoring='accuracy')
    
    accuracy_cv1 = accuracy_list1.mean()
    accuracy_cv2 = accuracy_list2.mean()
        
    # Step4 (Voting)
    curr_max_acc = [accuracy_cv1, accuracy_cv2]
    final_accuracy_list.append(max(curr_max_acc))
    
print(max(final_accuracy_list))

0.6321428571428571


In [8]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

bootstarp_size = int(np.floor( 0.8 * len(X_train) ))
accuracy_list = list()

i = 18
    
# Step1 (Bootstrapping)
X_bag = resample(X_train, n_samples = bootstarp_size , random_state=i , replace = True)
y_bag = resample(y_train, n_samples = bootstarp_size , random_state=i , replace = True)

# Step2 (Training)
Base_DecisionTree = DecisionTreeClassifier(random_state=3)

Base_logreg = LogisticRegression(max_iter=150)

Base_knn = KNeighborsClassifier(n_neighbors=57)

Base_rf = RandomForestClassifier(n_estimators=19)

Base_eclf1 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'hard')
Base_eclf1 = Base_eclf1.fit(X_bag, y_bag)

Base_eclf2 = VotingClassifier (estimators= [('dt', Base_DecisionTree), ('lr', Base_logreg), ('KNN', Base_knn), ('rf', Base_rf)], voting = 'soft')
Base_eclf2 = Base_eclf2.fit(X_bag, y_bag)

# Step3 (Base Learner Prediction)
y_predict1 = Base_eclf1.predict(X_test)
y_predict2 = Base_eclf2.predict(X_test)

accuracy1 = accuracy_score(y_test, y_predict1)
accuracy2 = accuracy_score(y_test, y_predict2)

# Step4 (Voting)
print("Hard accuracy: " + str(accuracy1))
print("Soft accuracy: " + str(accuracy2))

Hard accuracy: 0.2733333333333333
Soft accuracy: 0.18666666666666668


## Conclusion
### Model accuracy: 0.63 with Cross Validation
### Model accuracy: 0.27 without Cross Validation