In [1]:
# required imports

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

sns.set_style('whitegrid')
%matplotlib inline

In [2]:
# load and preview dataset

dataset = pd.read_csv('turkiye-student-evaluation_generic.csv')
dataset.head()

Unnamed: 0,instr,class,nb.repeat,attendance,difficulty,Q1,Q2,Q3,Q4,Q5,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Q28
0,1,2,1,0,4,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1,1,2,1,1,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,1,2,1,2,4,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
3,1,2,1,1,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,1,2,1,0,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [3]:
# summary

dataset.describe()

Unnamed: 0,instr,class,nb.repeat,attendance,difficulty,Q1,Q2,Q3,Q4,Q5,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Q28
count,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,...,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0,5820.0
mean,2.485567,7.276289,1.214089,1.675601,2.783505,2.929897,3.073883,3.178694,3.082474,3.105842,...,3.261684,3.285395,3.307388,3.317526,3.20189,3.166838,3.312543,3.222165,3.154811,3.308076
std,0.718473,3.688175,0.532376,1.474975,1.348987,1.341077,1.285251,1.253567,1.284594,1.278989,...,1.268442,1.276848,1.269974,1.268358,1.27259,1.275909,1.257286,1.270695,1.291872,1.278709
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,4.0,1.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,...,3.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0
50%,3.0,7.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
75%,3.0,10.0,1.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,3.0,13.0,3.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


#### Lets begin to cluster the students based on the questionaire data

In [4]:
# Dataset of questions

dataset_questions = dataset.iloc[:,5:33]
dataset_questions.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Q28
0,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [5]:
# lets do a PCA for dimensional reduction and get a 2D vector

pca = PCA(n_components = 2)
dataset_questions_pca = pca.fit_transform(dataset_questions)

In [6]:
# Separate the 2D vector and add each vector as an individual feature column in the Dataframe
PCA1 = dataset_questions_pca[:,0]
PCA2 = dataset_questions_pca[:,1]

dataset['PCA1'] = PCA1
dataset['PCA2'] = PCA2

In [7]:
# Drop all Questions columns and preview

dataset = dataset.drop(dataset.iloc[:,5:33], axis=1)
dataset.head()

Unnamed: 0,instr,class,nb.repeat,attendance,difficulty,PCA1,PCA2
0,1,2,1,0,4,0.989015,0.522798
1,1,2,1,1,3,0.989015,0.522798
2,1,2,1,2,4,-9.591289,0.640802
3,1,2,1,1,3,0.989015,0.522798
4,1,2,1,0,1,11.569319,0.404794


In [8]:
# Separate features and target columns

X = dataset.drop(columns='nb.repeat')
y = dataset['nb.repeat']

In [9]:
# Train Test split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

In [10]:
# Implement Logistic Regression

lr = LogisticRegression(max_iter=200)
lr.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Evaluating on Test Data

y_pred = lr.predict(Xtest)
print(metrics.classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           1       0.84      1.00      0.91      1465
           2       0.00      0.00      0.00       170
           3       1.00      0.01      0.02       111

    accuracy                           0.84      1746
   macro avg       0.61      0.34      0.31      1746
weighted avg       0.77      0.84      0.77      1746



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Implement Decision Trees

dt = DecisionTreeClassifier(class_weight='balanced')
dt.fit(Xtrain, ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
# Evaluation on Test Data

y_pred = dt.predict(Xtest)
print(metrics.classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           1       0.88      0.78      0.83      1465
           2       0.21      0.30      0.25       170
           3       0.12      0.23      0.16       111

    accuracy                           0.70      1746
   macro avg       0.40      0.44      0.41      1746
weighted avg       0.77      0.70      0.73      1746



In [14]:
# Implement Random Forest

rfc = RandomForestClassifier(class_weight='balanced')
rfc.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
# Evaluation

y_pred = rfc.predict(Xtest)
print(metrics.classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           1       0.86      0.89      0.88      1465
           2       0.24      0.17      0.20       170
           3       0.14      0.14      0.14       111

    accuracy                           0.77      1746
   macro avg       0.41      0.40      0.40      1746
weighted avg       0.76      0.77      0.76      1746



##### Looks like PCA doesn't help