In [None]:
# Import / load the dataset used in this notebook from the file student-por.csv.
import pandas as pd
from google.colab import files
# Interactive upload step (google.colab helper); the uploaded file is then read from the
# working directory under the name student-por.csv.
uploaded = files.upload()
# The CSV is parsed with ';' as the field separator.
d = pd.read_csv('student-por.csv', sep=';')

Saving student-por.csv to student-por.csv


In [None]:
# Number of student records (rows) in the imported dataset.
d.shape[0]

649

In [None]:
# Add a new binary column 'pass': 1 when G1 + G2 + G3 is at least 35, otherwise 0.
# The sum and comparison are done column-wise (vectorized) instead of calling a
# Python lambda once per row; a missing grade still yields 0 because NaN >= 35 is False.
d['pass'] = ((d['G1'] + d['G2'] + d['G3']) >= 35).astype(int)
# Drop the raw grade columns now that the label has been derived from them.
d = d.drop(columns=['G1', 'G2', 'G3'])
d.head()  # --> display the first 5 rows of the dataset

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,6,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,1


In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator (boolean) column per category value.
d = pd.get_dummies(
    d,
    columns=[
        'sex', 'school', 'address', 'famsize', 'Pstatus',
        'Mjob', 'Fjob', 'reason', 'guardian',
        'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic',
    ],
)
d.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,True,False,False,True,False,True,True,False,True,False
1,17,1,1,1,2,0,5,3,3,1,...,True,False,True,False,False,True,False,True,True,False
2,15,1,1,1,2,0,4,3,2,2,...,True,False,False,True,False,True,False,True,True,False
3,15,4,2,1,3,0,3,2,2,1,...,False,True,False,True,False,True,False,True,False,True
4,16,3,3,1,2,0,4,3,2,1,...,True,False,False,True,False,True,True,False,True,False


In [None]:
# Shuffle all rows. The seed is fixed so the train/test split below -- and every
# score computed from it -- is identical on each run of the notebook.
d = d.sample(frac=1, random_state=42)

# Split the shuffled data: first 500 rows for training, the remainder for testing.
d_train = d[:500]
d_test = d[500:]

# Training features (every column except the label) and training labels.
d_train_att = d_train.drop(['pass'], axis=1)
d_train_pass = d_train['pass']

# Test features and test labels.
d_test_att = d_test.drop(['pass'], axis=1)
d_test_pass = d_test['pass']

# Features and labels of the full dataset (used for cross-validation).
d_att = d.drop(['pass'], axis=1)
d_pass = d['pass']

import numpy as np
# Report how many students carry the 'pass' label and the corresponding percentage.
print("Passing %d out of %d (%.2f%%)" % (np.sum(d_pass), len(d_pass), 100*float(np.sum(d_pass)) / len(d_pass)))

Passing 328 out of 649 (50.54%)


Model Decision Tree

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree using the entropy criterion, fitted on the training split.
# random_state is fixed (same seed as decision_tree below) so that splits with
# equal gain are resolved the same way on every run.
t = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
t = t.fit(d_train_att, d_train_pass)

# Second tree with default hyper-parameters (no depth limit set), same training data.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(d_train_att, d_train_pass)
# Predictions on the training features themselves; the evaluation cell
# recomputes y_pred on the test split before measuring accuracy.
y_pred = decision_tree.predict(d_train_att)

In [None]:
from sklearn.metrics import accuracy_score

# Model evaluation: predict the held-out test features with decision_tree
# and compare against the true test labels.
y_pred = decision_tree.predict(d_test_att)
accuracy = accuracy_score(y_true=d_test_pass, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6778523489932886


In [None]:
# Export the fitted tree `t` as a Graphviz DOT file (a drawing of the tree, not a
# reloadable model). class_names are given in ascending label order, so label 0
# is "fail" and label 1 is "pass".
tree.export_graphviz(t, out_file="student-performance.dot", label="all", impurity=False, proportion=True,
                     feature_names=list(d_train_att), class_names=["fail", "pass"],
                     filled=True, rounded=True)

In [None]:
# Score of the fitted tree `t` on the test split (d_test_att against d_test_pass).
t.score(X=d_test_att, y=d_test_pass)

0.738255033557047

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `t` over the full feature/label set.
scores = cross_val_score(estimator=t, X=d_att, y=d_pass, cv=5)
# Show the mean of the fold scores together with two standard deviations.
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (scores.mean(), scores.std() * 2)
)

Accuracy: 0.70 (+/- 0.08)


In [None]:
# Cross-validate entropy trees for max_depth = 1..19 and print mean accuracy with
# two standard deviations. Each candidate gets its own name so the fitted tree `t`
# from the training cell is not overwritten by an unfitted estimator, and a fixed
# random_state so the table is the same every time the sweep is run.
for max_depth in range(1, 20):
  candidate = tree.DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=42)
  scores = cross_val_score(candidate, d_att, d_pass, cv=5)
  print("Max depth: %d, Accuracy: %0.2f (+/- %0.2f)" % (max_depth, scores.mean(), scores.std() * 2))

Max depth: 1, Accuracy: 0.64 (+/- 0.02)
Max depth: 2, Accuracy: 0.69 (+/- 0.02)
Max depth: 3, Accuracy: 0.68 (+/- 0.07)
Max depth: 4, Accuracy: 0.69 (+/- 0.06)
Max depth: 5, Accuracy: 0.69 (+/- 0.08)
Max depth: 6, Accuracy: 0.68 (+/- 0.04)
Max depth: 7, Accuracy: 0.66 (+/- 0.09)
Max depth: 8, Accuracy: 0.67 (+/- 0.09)
Max depth: 9, Accuracy: 0.66 (+/- 0.08)
Max depth: 10, Accuracy: 0.68 (+/- 0.05)
Max depth: 11, Accuracy: 0.66 (+/- 0.09)
Max depth: 12, Accuracy: 0.67 (+/- 0.08)
Max depth: 13, Accuracy: 0.67 (+/- 0.05)
Max depth: 14, Accuracy: 0.64 (+/- 0.09)
Max depth: 15, Accuracy: 0.65 (+/- 0.10)
Max depth: 16, Accuracy: 0.65 (+/- 0.11)
Max depth: 17, Accuracy: 0.65 (+/- 0.07)
Max depth: 18, Accuracy: 0.64 (+/- 0.09)
Max depth: 19, Accuracy: 0.65 (+/- 0.08)


In [None]:
# Same depth sweep as a table: one row per depth with columns
# [max_depth, mean CV accuracy, 2 * std of CV accuracy].
depths = range(1, 20)
# Row count is derived from the depth range instead of a separately hard-coded 19.
depth_acc = np.empty((len(depths), 3), float)
for i, max_depth in enumerate(depths):
  # A separate name keeps the fitted tree `t` intact; the fixed random_state makes
  # the scores repeatable between runs.
  candidate = tree.DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=42)
  scores = cross_val_score(candidate, d_att, d_pass, cv=5)
  depth_acc[i] = (max_depth, scores.mean(), scores.std() * 2)

depth_acc

array([[1.00000000e+00, 6.37889088e-01, 2.26681213e-02],
       [2.00000000e+00, 6.87215265e-01, 1.80791494e-02],
       [3.00000000e+00, 6.81109123e-01, 7.28939736e-02],
       [4.00000000e+00, 6.90316041e-01, 6.28977925e-02],
       [5.00000000e+00, 6.98079905e-01, 8.89856407e-02],
       [6.00000000e+00, 6.74847943e-01, 3.82151110e-02],
       [7.00000000e+00, 6.57841383e-01, 8.79637160e-02],
       [8.00000000e+00, 6.71806798e-01, 6.97905929e-02],
       [9.00000000e+00, 6.53285629e-01, 4.94052484e-02],
       [1.00000000e+01, 6.74824091e-01, 7.35905416e-02],
       [1.10000000e+01, 6.67107931e-01, 8.46354179e-02],
       [1.20000000e+01, 6.56314848e-01, 7.22865782e-02],
       [1.30000000e+01, 6.65569469e-01, 6.79608970e-02],
       [1.40000000e+01, 6.43947525e-01, 1.05125293e-01],
       [1.50000000e+01, 6.43971377e-01, 7.88888564e-02],
       [1.60000000e+01, 6.51687537e-01, 7.67419908e-02],
       [1.70000000e+01, 6.45521765e-01, 8.96771271e-02],
       [1.80000000e+01, 6.40942

In [None]:
import joblib

# Persist the fitted decision_tree to the file "student_performance" in the working directory.
joblib.dump(value=decision_tree, filename="student_performance")

['student_performance']

In [None]:
# Load the persisted model back from disk. joblib files are pickle-based, so only
# load files that this notebook (or another trusted source) produced.
prediksijl = joblib.load(filename='student_performance')

In [None]:
from joblib import dump

# Writes decision_tree to 'student_performance' again, this time through the
# directly imported dump function.
dump(value=decision_tree, filename='student_performance')


['student_performance']