In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the student performance CSV from the working directory and preview it.
csv_path = 'StudentsPerformance.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# One-hot encode each categorical column, dropping the first level of each so
# the indicator columns are not perfectly collinear.
dummies_par, dummies_lunch, dummies_race, dummies_gender, dummies_test = (
    pd.get_dummies(df[column], drop_first=True)
    for column in (
        'parental level of education',
        'lunch',
        'race/ethnicity',
        'gender',
        'test preparation course',
    )
)

# Append the indicator columns to the right of the original frame.
df = pd.concat(
    [df, dummies_par, dummies_gender, dummies_lunch, dummies_race, dummies_test],
    axis=1,
)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,bachelor's degree,high school,master's degree,some college,some high school,male,standard,group B,group C,group D,group E,none
0,female,group B,bachelor's degree,standard,none,72,72,74,1,0,0,0,0,0,1,1,0,0,0,1
1,female,group C,some college,standard,completed,69,90,88,0,0,0,1,0,0,1,0,1,0,0,0
2,female,group B,master's degree,standard,none,90,95,93,0,0,1,0,0,0,1,1,0,0,0,1
3,male,group A,associate's degree,free/reduced,none,47,57,44,0,0,0,0,0,1,0,0,0,0,0,1
4,male,group C,some college,standard,none,76,78,75,0,0,0,1,0,1,1,0,1,0,0,1


In [5]:
# List the column labels to confirm the indicator columns were appended.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'bachelor's degree', 'high school', 'master's degree',
       'some college', 'some high school', 'male', 'standard', 'group B',
       'group C', 'group D', 'group E', 'none'],
      dtype='object')

In [6]:
# The indicator columns now carry this information, so remove the raw
# categorical columns they were derived from.
categorical_columns = [
    'parental level of education',
    'race/ethnicity',
    'lunch',
    'test preparation course',
    'gender',
]
df = df.drop(columns=categorical_columns)

In [7]:
# Preview the frame after the raw categorical columns were dropped.
df.head()

Unnamed: 0,math score,reading score,writing score,bachelor's degree,high school,master's degree,some college,some high school,male,standard,group B,group C,group D,group E,none
0,72,72,74,1,0,0,0,0,0,1,1,0,0,0,1
1,69,90,88,0,0,0,1,0,0,1,0,1,0,0,0
2,90,95,93,0,0,1,0,0,0,1,1,0,0,0,1
3,47,57,44,0,0,0,0,0,1,0,0,0,0,0,1
4,76,78,75,0,0,0,1,0,1,1,0,1,0,0,1


In [8]:
# Average the three subject scores per student; skipna=False keeps a missing
# score propagating to a missing average, as plain addition would.
score_columns = ['math score', 'reading score', 'writing score']
df['average score'] = df[score_columns].sum(axis=1, skipna=False) / 3

In [9]:
def average_to_grade(x):
    """Map an average score to an ordinal grade bucket from 0 to 6.

    Buckets (upper bounds are inclusive):
        0: [0, 40]    1: (40, 50]   2: (50, 60]   3: (60, 70]
        4: (70, 80]   5: (80, 90]   6: above 90

    Parameters
    ----------
    x : int or float
        Average score; must be a non-negative number.

    Returns
    -------
    int
        The grade bucket index, 0 through 6.

    Raises
    ------
    ValueError
        If ``x`` is negative or NaN. Previously such values failed every
        range test and were silently assigned the top grade, 6.
    """
    # NaN compares False against everything, so this single test rejects both
    # negative numbers and NaN.
    if not x >= 0:
        raise ValueError(f"average score must be a non-negative number, got {x!r}")

    # Inclusive upper bound of grades 0..5; anything above the last is grade 6.
    upper_bounds = (40, 50, 60, 70, 80, 90)
    for grade, bound in enumerate(upper_bounds):
        if x <= bound:
            return grade
    return len(upper_bounds)

In [11]:
# Convert each student's average score into its grade bucket.
df['grade'] = df['average score'].map(average_to_grade)

In [12]:
# Count the rows in each grade bucket to see how the classes are distributed.
df['grade'].value_counts()

3    260
4    253
2    184
5    144
1     77
6     50
0     32
Name: grade, dtype: int64

In [13]:
from imblearn.over_sampling import SMOTE

In [14]:
# 'grade' is computed from these columns, so drop them before modelling.
score_derived_columns = ['reading score', 'writing score', 'math score', 'average score']
df = df.drop(columns=score_derived_columns)

In [15]:
# Preview the frame after the score columns were dropped.
df.head()

Unnamed: 0,bachelor's degree,high school,master's degree,some college,some high school,male,standard,group B,group C,group D,group E,none,grade
0,1,0,0,0,0,0,1,1,0,0,0,1,4
1,0,0,0,1,0,0,1,0,1,0,0,0,5
2,0,0,1,0,0,0,1,1,0,0,0,1,6
3,0,0,0,0,0,1,0,0,0,0,0,1,1
4,0,0,0,1,0,1,1,0,1,0,0,1,4


In [16]:
# Oversample with SMOTE; 'grade' is the last column and is the target.
#
# NOTE(review): resampling happens before the later train_test_split, so
# synthetic rows interpolated from rows that end up in the test set can land in
# the training set (and vice versa). Consider splitting first and resampling
# only the training portion.

# Force a single integer dtype: newer pandas emits boolean dummy columns, and
# mixing them with the integer target would otherwise yield an object array.
data = df.to_numpy(dtype="int64")
X = data[:, :-1]
y = data[:, -1]
X_columns = df.columns[:-1]
y_columns = df.columns[-1]

# A fixed seed makes the synthetic samples repeatable across runs.
oversample = SMOTE(random_state=101)
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
X, y = oversample.fit_resample(X, y)

In [19]:
# Wrap the resampled arrays back into labelled frames and join them side by side.
X_sampled = pd.DataFrame(data=X, columns=X_columns)
y_sampled = pd.DataFrame(data=y, columns=[y_columns])
df_sampled = pd.concat(objs=[X_sampled, y_sampled], axis=1)

In [20]:
# Check the per-grade row counts after oversampling.
df_sampled['grade'].value_counts()

6    260
5    260
4    260
3    260
2    260
1    260
0    260
Name: grade, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 25% of the rows for
# evaluation; the fixed seed keeps the split repeatable.
y = df_sampled['grade']
X = df_sampled.drop(columns='grade')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with the library's default settings on the training split.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

LogisticRegression()

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out rows, then print the confusion matrix followed by
# the per-class precision/recall report.
log_model_preds = log_model.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=log_model_preds))
print("\n")
print(classification_report(y_true=y_test, y_pred=log_model_preds))

[[44  3  5  0  1  7  2]
 [26 17  2  8  5  9  5]
 [16  8  7 10  9  4  9]
 [ 7 14  3 10 13 14 11]
 [ 5  6  2  7 17 12 16]
 [ 3  3  3  4 11 16 21]
 [ 0  1  0  1  1  8 49]]


              precision    recall  f1-score   support

           0       0.44      0.71      0.54        62
           1       0.33      0.24      0.27        72
           2       0.32      0.11      0.16        63
           3       0.25      0.14      0.18        72
           4       0.30      0.26      0.28        65
           5       0.23      0.26      0.24        61
           6       0.43      0.82      0.57        60

    accuracy                           0.35       455
   macro avg       0.33      0.36      0.32       455
weighted avg       0.33      0.35      0.32       455



In [27]:
# Mean accuracy of the fitted model on the held-out split.
print(log_model.score(X=X_test, y=y_test))

0.3516483516483517
