# Predicting Academic Performance

In [32]:
#import library
import numpy as np
import pandas as pd
import plotly.express as ps
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [33]:
#loading the dataset
df=pd.read_csv('../input/xAPI-Edu-Data/xAPI-Edu-Data.csv')
df

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,F,Jordan,Jordan,MiddleSchool,G-08,A,Chemistry,S,Father,5,4,5,8,No,Bad,Above-7,L
476,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,F,Father,50,77,14,28,No,Bad,Under-7,M
477,F,Jordan,Jordan,MiddleSchool,G-08,A,Geology,S,Father,55,74,25,29,No,Bad,Under-7,M
478,F,Jordan,Jordan,MiddleSchool,G-08,A,History,F,Father,30,17,14,57,No,Bad,Above-7,L


In [34]:
#getting information abut the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    480 non-null    object
 1   NationalITy               480 non-null    object
 2   PlaceofBirth              480 non-null    object
 3   StageID                   480 non-null    object
 4   GradeID                   480 non-null    object
 5   SectionID                 480 non-null    object
 6   Topic                     480 non-null    object
 7   Semester                  480 non-null    object
 8   Relation                  480 non-null    object
 9   raisedhands               480 non-null    int64 
 10  VisITedResources          480 non-null    int64 
 11  AnnouncementsView         480 non-null    int64 
 12  Discussion                480 non-null    int64 
 13  ParentAnsweringSurvey     480 non-null    object
 14  ParentschoolSatisfaction  

In [35]:
#checking whether any null values
df.isna().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

# Encoding

In [36]:
#creating a function to get unique values in a column
def get_unique(df,columns):
    return {column: list(df[column].unique()) for column in columns}
def get_categorical_columns(df):
    return [column for column in df.columns if df.dtypes[column]=='object']


In [37]:
#calling the function 
get_unique(df,get_categorical_columns(df))

{'gender': ['M', 'F'],
 'NationalITy': ['KW',
  'lebanon',
  'Egypt',
  'SaudiArabia',
  'USA',
  'Jordan',
  'venzuela',
  'Iran',
  'Tunis',
  'Morocco',
  'Syria',
  'Palestine',
  'Iraq',
  'Lybia'],
 'PlaceofBirth': ['KuwaIT',
  'lebanon',
  'Egypt',
  'SaudiArabia',
  'USA',
  'Jordan',
  'venzuela',
  'Iran',
  'Tunis',
  'Morocco',
  'Syria',
  'Iraq',
  'Palestine',
  'Lybia'],
 'StageID': ['lowerlevel', 'MiddleSchool', 'HighSchool'],
 'GradeID': ['G-04',
  'G-07',
  'G-08',
  'G-06',
  'G-05',
  'G-09',
  'G-12',
  'G-11',
  'G-10',
  'G-02'],
 'SectionID': ['A', 'B', 'C'],
 'Topic': ['IT',
  'Math',
  'Arabic',
  'Science',
  'English',
  'Quran',
  'Spanish',
  'French',
  'History',
  'Biology',
  'Chemistry',
  'Geology'],
 'Semester': ['F', 'S'],
 'Relation': ['Father', 'Mum'],
 'ParentAnsweringSurvey': ['Yes', 'No'],
 'ParentschoolSatisfaction': ['Good', 'Bad'],
 'StudentAbsenceDays': ['Under-7', 'Above-7'],
 'Class': ['M', 'L', 'H']}

In [38]:
#Categorising different feature 
binary_feature=['gender','Semester','Relation','ParentAnsweringSurvey','StudentAbsenceDays','ParentschoolSatisfaction']
ordinal_feature=['StageID','GradeID']
nominal_feature=['NationalIty','PlaceofBirth','SectionID','Topic']
target_column='Class'

In [39]:
#getting positive values
binary_positive_values=['M','S','Father','Yes','Above-7','Good']
stage_ordering=['lowerlevel', 'MiddleSchool', 'HighSchool']
grade_ordering=['G-02','G-04','G-05','G-06',
  'G-07',
  'G-08',
  'G-09',  'G-10','G-11',
  'G-12',
  ]

In [40]:
#creating encoding function
def binary_encode(df,column,positive_value):
    df=df.copy()
    df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    return df
def ordinal_encode(df,column,ordering):
    df=df.copy()
    df[column]=df[column].apply(lambda x:ordering.index(x))
    return df
def onehot_encode(df,column,prefix):
    df=df.copy()
    dummies=pd.get_dummies(df[column],prefix=prefix)
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [41]:
#using for loop to iterate through the column
for feature,positive_value in zip(binary_feature,binary_positive_values):
    df=binary_encode(df,feature,positive_value)

In [42]:
df

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,KW,KuwaIT,lowerlevel,G-04,A,IT,0,1,15,16,2,20,1,1,0,M
1,1,KW,KuwaIT,lowerlevel,G-04,A,IT,0,1,20,20,3,25,1,1,0,M
2,1,KW,KuwaIT,lowerlevel,G-04,A,IT,0,1,10,7,0,30,0,0,1,L
3,1,KW,KuwaIT,lowerlevel,G-04,A,IT,0,1,30,25,5,35,0,0,1,L
4,1,KW,KuwaIT,lowerlevel,G-04,A,IT,0,1,40,50,12,50,0,0,1,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0,Jordan,Jordan,MiddleSchool,G-08,A,Chemistry,1,1,5,4,5,8,0,0,1,L
476,0,Jordan,Jordan,MiddleSchool,G-08,A,Geology,0,1,50,77,14,28,0,0,0,M
477,0,Jordan,Jordan,MiddleSchool,G-08,A,Geology,1,1,55,74,25,29,0,0,0,M
478,0,Jordan,Jordan,MiddleSchool,G-08,A,History,0,1,30,17,14,57,0,0,1,L


In [43]:
#using ordinal encode to encode stageID and gradeID
df=ordinal_encode(df,'StageID',stage_ordering)
df=ordinal_encode(df,'GradeID',grade_ordering)

In [44]:
df

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,KW,KuwaIT,0,1,A,IT,0,1,15,16,2,20,1,1,0,M
1,1,KW,KuwaIT,0,1,A,IT,0,1,20,20,3,25,1,1,0,M
2,1,KW,KuwaIT,0,1,A,IT,0,1,10,7,0,30,0,0,1,L
3,1,KW,KuwaIT,0,1,A,IT,0,1,30,25,5,35,0,0,1,L
4,1,KW,KuwaIT,0,1,A,IT,0,1,40,50,12,50,0,0,1,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0,Jordan,Jordan,1,5,A,Chemistry,1,1,5,4,5,8,0,0,1,L
476,0,Jordan,Jordan,1,5,A,Geology,0,1,50,77,14,28,0,0,0,M
477,0,Jordan,Jordan,1,5,A,Geology,1,1,55,74,25,29,0,0,0,M
478,0,Jordan,Jordan,1,5,A,History,0,1,30,17,14,57,0,0,1,L


In [45]:
#Onehotencode
#creating list in which onehotencode to execute
nominal_prefixes=['N','P','S','T']
nominal_features=['NationalITy','PlaceofBirth','SectionID','Topic']

In [46]:
#using for loop through feature and prefix
for feature,prefix in zip(nominal_features,nominal_prefixes):
    df=onehot_encode(df,feature,prefix)
    

In [60]:
target_ordering=['L','M','H']
df=ordinal_encode(df,target_column,target_ordering)

In [62]:
df['Class'].unique()

array([1, 0, 2])

In [63]:
#Spliting and Scaling the data
y=df[target_column]
x=df.drop(target_column,axis=1)

In [64]:
#scaling the data
scaler=StandardScaler()
x=scaler.fit_transform(x)
x

array([[ 0.7574764 , -1.08466911, -0.77436363, ..., -0.21916874,
        -0.34479141, -0.23440362],
       [ 0.7574764 , -1.08466911, -0.77436363, ..., -0.21916874,
        -0.34479141, -0.23440362],
       [ 0.7574764 , -1.08466911, -0.77436363, ..., -0.21916874,
        -0.34479141, -0.23440362],
       ...,
       [-1.32017315,  0.5734238 ,  0.85053054, ..., -0.21916874,
        -0.34479141, -0.23440362],
       [-1.32017315,  0.5734238 ,  0.85053054, ..., -0.21916874,
        -0.34479141, -0.23440362],
       [-1.32017315,  0.5734238 ,  0.85053054, ..., -0.21916874,
        -0.34479141, -0.23440362]])

In [65]:
#spliting the data
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)


In [66]:
x.shape

(480, 55)

In [70]:
#setting up tensorflow model
inputs=tf.keras.Input(shape=(55,))
x=tf.keras.layers.Dense(64,activation='relu')(inputs)
x=tf.keras.layers.Dense(64,activation='relu')(x)
outputs=tf.keras.layers.Dense(3,activation='softmax')(x)
model=tf.keras.Model(inputs=inputs,outputs=outputs)

In [74]:
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
batch_size=64
epochs=100
model.fit(x_train,y_train,validation_split=0.2,
                  batch_size=batch_size,
                 epochs=epochs,verbose=0)

<keras.callbacks.History at 0x7f589c569390>

In [75]:
model.evaluate(x_test,y_test)



[1.5034328699111938, 0.7361111044883728]