In [7]:
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)+
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
df = pd.read_csv('StudentsPerformance.csv', encoding='ISO-8859-1', header=0)

In [8]:
df.shape
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [9]:
df.tail(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [10]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [None]:
df.groupby('gender').size()

In [11]:
X = df.iloc[0:1001, 1:9]
print(X)
y = np.array(df.iloc[0:1001, 0])
print('\n\n',y)

    race/ethnicity parental level of education         lunch  \
0          group B           bachelor's degree      standard   
1          group C                some college      standard   
2          group B             master's degree      standard   
3          group A          associate's degree  free/reduced   
4          group C                some college      standard   
..             ...                         ...           ...   
995        group E             master's degree      standard   
996        group C                 high school  free/reduced   
997        group C                 high school  free/reduced   
998        group D                some college      standard   
999        group D                some college  free/reduced   

    test preparation course  math score  reading score  writing score  
0                      none          72             72             74  
1                 completed          69             90             88  
2              

In [25]:
train_size = 0.8
seed = 14
hot_X = np.array(pd.get_dummies(X))
X_train, X_test, y_train, y_test = train_test_split(hot_X, y, train_size=train_size, random_state=seed)

In [26]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits = 10)

In [27]:
def get_score(model, X_train, X_test, y_train, y_test): #Accuracy
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def get_precision(model, X_train, X_test, y_train, y_test):
    clftemp = model.fit(X_train, y_train)
    y_pred = clftemp.predict(X_test)
    return precision_score(y_test, y_pred, average ='micro')

def get_recall(model, X_train, X_test, y_train, y_test):
    clftemp = model.fit(X_train, y_train)
    y_pred = clftemp.predict(X_test)
    return recall_score(y_test, y_pred, average ='micro')

def get_f1score(model, X_train, X_test, y_train, y_test):
    clftemp = model.fit(X_train, y_train)
    y_pred = clftemp.predict(X_test)
    return f1_score(y_test, y_pred, average ='micro')

scores_dtc = []
precision_dtc = []
recall_dtc = []
f1score_dtc = []

scores_nn = []
precision_nn = []
recall_nn = []
f1score_nn = []

for train_index, test_index in folds.split(X_train, y_train):
    kX_train = X_train[train_index]
    kX_test = X_train[test_index]
    ky_train = y_train[train_index]
    ky_test = y_train[test_index]
    scores_dtc.append(get_score(DecisionTreeClassifier(random_state=seed,criterion="gini", splitter="best"), kX_train, kX_test, ky_train, ky_test))
    precision_dtc.append(get_precision(DecisionTreeClassifier(random_state=seed,criterion="gini", splitter="best"), kX_train, kX_test, ky_train, ky_test))
    recall_dtc.append(get_recall(DecisionTreeClassifier(random_state=seed,criterion="gini", splitter="random"), kX_train, kX_test, ky_train, ky_test))
    f1score_dtc.append(get_f1score(DecisionTreeClassifier(random_state=seed,criterion="entropy", splitter="best"), kX_train, kX_test, ky_train, ky_test))
    scores_nn.append(get_score(MLPClassifier(activation="logistic", solver="adam", learning_rate="constant"), kX_train, kX_test, ky_train, ky_test))
    precision_nn.append(get_precision(MLPClassifier(activation="logistic", solver="adam", learning_rate="constant"), kX_train, kX_test, ky_train, ky_test))
    recall_nn.append(get_recall(MLPClassifier(activation="logistic", solver="adam", learning_rate="constant"), kX_train, kX_test, ky_train, ky_test))
    f1score_nn.append(get_f1score(MLPClassifier(activation="logistic", solver="adam", learning_rate="constant"), kX_train, kX_test, ky_train, ky_test))
    



In [18]:
print("DTC Accuracy:", sum(scores_dtc)/len(scores_dtc))
print("DTC Precision:", sum(precision_dtc)/len(precision_dtc))
print("DTC Recall:", sum(recall_dtc)/len(recall_dtc))
print("DTC F1 Score:", sum(f1score_dtc)/len(f1score_dtc))

DTC Accuracy: 0.8135603219253008
DTC Precision: 0.8135603219253008
DTC Recall: 0.7697751601812783
DTC F1 Score: 0.8010595405532115


In [19]:
print("NN Accuracy:", sum(scores_nn)/len(scores_nn))
print("NN Precision:", sum(precision_nn)/len(precision_nn))
print("NN Recall:", sum(recall_nn)/len(recall_nn))
print("NN F1 Score:", sum(f1score_nn)/len(f1score_nn))

NN Accuracy: 0.8911298640412564
NN Precision: 0.8960986091576808
NN Recall: 0.8998489998437254
NN F1 Score: 0.8948177449601499


In [20]:
clf1 = DecisionTreeClassifier(random_state=seed,criterion="gini", splitter="best")

clf1.fit(X_train, y_train)

y_pred1 = clf1.predict(X_test)
results = pd.DataFrame({'Predicted label': y_pred1, 'True label': y_test})
print(results)

    Predicted label True label
0            female     female
1            female     female
2              male       male
3            female     female
4            female     female
..              ...        ...
195            male       male
196            male     female
197            male       male
198          female     female
199          female       male

[200 rows x 2 columns]


In [21]:
accuracy = accuracy_score(y_test, y_pred1)
print("Decision Tree Accuracy: " + str(accuracy))

precision = precision_score(y_test, y_pred1, average='micro')
print("Precision", precision)


recall = recall_score(y_test, y_pred1, average='micro')
print("Recall: ", recall)


fscore = f1_score(y_test, y_pred1, average='micro')
print("F-score: ", fscore)

confusion_matrix(y_test, y_pred1)

Decision Tree Accuracy: 0.825
Precision 0.825
Recall:  0.825
F-score:  0.825


array([[80, 15],
       [20, 85]])

In [22]:
clf2 = MLPClassifier(activation="logistic", solver="adam", learning_rate="constant")

clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_test)
results = pd.DataFrame({'Predicted label': y_pred2, 'True label': y_test})
print(results)

    Predicted label True label
0            female     female
1            female     female
2              male       male
3            female     female
4            female     female
..              ...        ...
195            male       male
196          female     female
197          female       male
198          female     female
199            male       male

[200 rows x 2 columns]


In [24]:
accuracy = accuracy_score(y_test, y_pred2)
print("Decision Tree Accuracy: " + str(accuracy))

precision = precision_score(y_test, y_pred2, average='macro')
print("Precision: ", precision)


recall = recall_score(y_test, y_pred2, average='macro')
print("Recall: ", recall)


fscore = f1_score(y_test, y_pred2, average='macro')
print("F-score: ", fscore)

confusion_matrix(y_test, y_pred2)

Decision Tree Accuracy: 0.915
Precision:  0.9146634615384616
Recall:  0.9150375939849624
F-score:  0.9148275257396227


array([[87,  8],
       [ 9, 96]])