## Loading Data

In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('Student Performance.csv')

## Obtain label features

In [2]:
# convert these data into 2D numpy array
# order: math percentage, reading score percentage, writing score percentage
X = np.array([[df['math percentage'][0], 
              df['reading score percentage'][0], 
              df['writing score percentage'][0]]], 
             dtype = 'float')
for i in range(1,1000):
    X = np.append(X, [[df['math percentage'][i], 
                  df['reading score percentage'][i], 
                  df['writing score percentage'][i]]],axis=0)
print('feature matrix')
print(X)
print('size: ' + str(X.shape))
print()

# Group A to Group E labeled as 0,1,2,3,4
if(df['race/ethnicity'][0] == 'group A'):
    first_elem = 0
elif(df['race/ethnicity'][0] == 'group B'):
    first_elem = 1
elif(df['race/ethnicity'][0] == 'group C'):
    first_elem = 2
elif(df['race/ethnicity'][0] == 'group D'):
    first_elem = 3
else:
    first_elem = 4
    
y = np.array([first_elem], dtype = 'int')
for i in range(1,1000):
    if(df['race/ethnicity'][i] == 'group A'):
        y = np.append(y, 0)
    elif(df['race/ethnicity'][i] == 'group B'):
        y = np.append(y, 1)
    elif(df['race/ethnicity'][i] == 'group C'):
        y = np.append(y, 2)
    elif(df['race/ethnicity'][i] == 'group D'):
        y = np.append(y, 3)
    else:
        y = np.append(y, 4)
print('Labels')
print(y)
print('size: ' + str(y.shape))


feature matrix
[[0.72 0.72 0.74]
 [0.69 0.9  0.88]
 [0.9  0.95 0.93]
 ...
 [0.59 0.71 0.65]
 [0.68 0.78 0.77]
 [0.77 0.86 0.86]]
size: (1000, 3)

Labels
[1 2 1 0 2 1 1 1 3 1 2 3 1 0 0 2 2 1 2 2 3 1 3 2 3 0 1 2 2 3 3 1 4 3 4 4 3
 3 3 1 2 2 1 1 4 1 0 2 3 2 4 4 2 3 2 2 4 3 3 2 4 0 0 2 3 1 3 2 1 2 3 3 0 2
 2 1 4 0 3 4 1 1 0 4 3 2 2 3 0 3 2 2 2 2 1 2 1 4 3 3 1 3 3 1 2 2 3 4 1 1 3
 2 0 3 4 2 1 3 3 2 2 1 2 3 4 1 1 3 3 0 3 2 4 2 3 2 1 4 2 3 3 2 4 0 3 2 1 2
 3 4 0 0 1 3 3 2 4 1 1 3 1 4 1 2 4 2 2 1 1 2 0 4 3 2 2 2 1 2 1 3 2 2 4 3 2
 2 4 3 1 2 4 3 1 3 2 3 2 4 1 1 2 3 2 1 2 3 4 4 1 1 3 2 2 2 4 1 4 2 1 1 3 1
 2 3 1 4 2 3 0 2 3 2 1 4 2 3 3 3 1 2 3 4 3 4 3 2 4 1 1 2 0 3 1 3 3 4 2 2 1
 2 2 2 2 4 3 3 2 3 3 4 2 2 3 3 1 2 2 4 2 1 3 3 3 3 1 1 4 1 1 4 2 3 2 4 3 1
 0 4 2 3 0 3 2 1 2 0 4 2 1 3 1 1 3 2 2 2 3 2 1 3 2 4 2 2 2 2 2 0 2 1 2 2 4
 1 2 1 3 2 1 3 2 2 1 3 3 2 1 2 3 4 1 4 2 2 2 1 0 2 3 3 1 1 2 3 2 0 2 2 0 3
 4 2 3 3 3 4 3 3 0 0 1 2 2 4 0 4 4 2 3 3 4 3 4 2 2 0 1 2 1 3 2 0 0 3 2 2 1
 1 3 3 3 4 3 1 2 4 2 2

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## Use sklearn functions to split the dataset into testing and training sets with the following parameters: test_size=0.3, random_state=6.

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6)

 ## Voting

In [5]:
clf1 = KNeighborsClassifier(n_neighbors=64)
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier()
clf4 = RandomForestClassifier(n_estimators=40)

In [6]:
eclf1 = VotingClassifier (estimators= [('KNN', clf1), ('lr', clf2), ('dt', clf3), ('rf', clf4)], voting = 'hard')
eclf1 = eclf1.fit(X_train, y_train)

In [8]:
eclf2 = VotingClassifier (estimators= [('KNN', clf1), ('lr', clf2), ('dt', clf3), ('rf', clf4)], voting = 'soft')
eclf2 = eclf2.fit(X_train, y_train)

## Accuracy

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
y_predict = eclf1.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.3233333333333333


In [12]:
y_predict = eclf2.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.29333333333333333
