# Student Performance Prediction

## Exploring datasets

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [2]:
from opendatasets import download_kaggle_dataset

In [3]:
download_kaggle_dataset(dataset_url='https://www.kaggle.com/datasets/souradippal/student-performance-prediction',
                        data_dir='.')

Skipping, found downloaded files in ".\student-performance-prediction" (use force=True to force download)


In [4]:
import pandas as pd

In [5]:
raw_df = pd.read_csv('student-performance-prediction/student_performance_prediction.csv')


In [6]:
raw_df

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,,75.0,Yes,Master,Yes
1,S00002,9.3,95.3,60.6,No,High School,No
2,S00003,13.2,,64.0,No,Associate,No
3,S00004,17.6,76.8,62.4,Yes,Bachelor,No
4,S00005,8.8,89.3,72.7,No,Master,No
...,...,...,...,...,...,...,...
39995,S39996,15.6,93.8,51.4,Yes,Master,No
39996,S39997,11.3,66.4,64.2,No,Doctorate,Yes
39997,S39998,13.1,65.6,38.1,No,Bachelor,No
39998,S39999,14.1,74.9,,Yes,Master,No


In [7]:
raw_df['Passed'].value_counts(dropna=False)

Passed
Yes    19011
No     18989
NaN     2000
Name: count, dtype: int64

In [8]:
raw_df.dropna(subset=['Passed'], inplace=True)
raw_df

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
0,S00001,12.5,,75.0,Yes,Master,Yes
1,S00002,9.3,95.3,60.6,No,High School,No
2,S00003,13.2,,64.0,No,Associate,No
3,S00004,17.6,76.8,62.4,Yes,Bachelor,No
4,S00005,8.8,89.3,72.7,No,Master,No
...,...,...,...,...,...,...,...
39995,S39996,15.6,93.8,51.4,Yes,Master,No
39996,S39997,11.3,66.4,64.2,No,Doctorate,Yes
39997,S39998,13.1,65.6,38.1,No,Bachelor,No
39998,S39999,14.1,74.9,,Yes,Master,No


In [9]:
raw_df.isna().sum()

Student ID                                        0
Study Hours per Week                           1905
Attendance Rate                                1888
Previous Grades                                1877
Participation in Extracurricular Activities    1902
Parent Education Level                         1898
Passed                                            0
dtype: int64

### Study Hours per Week

In [10]:
raw_df['Study Hours per Week'].describe()

count    36095.000000
mean         9.963665
std          5.031397
min        -12.300000
25%          6.600000
50%         10.000000
75%         13.400000
max         32.400000
Name: Study Hours per Week, dtype: float64

In [11]:
import numpy as np

In [12]:
raw_df['Study Hours per Week'] = raw_df['Study Hours per Week'].apply(lambda x: x if x>=0 else np.nan)

In [13]:
raw_df['Study Hours per Week'].isna().sum()

2803

### Attendance Rate

In [14]:
raw_df['Attendance Rate'].describe()

count    36112.000000
mean        75.269913
std         20.426990
min        -14.300000
25%         61.500000
50%         75.200000
75%         88.800000
max        150.200000
Name: Attendance Rate, dtype: float64

In [15]:
raw_df['Attendance Rate'] = raw_df['Attendance Rate'].apply(lambda x: x if (x>=0 and x<=100) else np.nan)

In [16]:
raw_df['Attendance Rate'].describe()

count    32169.000000
mean        70.954145
std         16.820118
min          0.500000
25%         59.700000
50%         72.400000
75%         83.900000
max        100.000000
Name: Attendance Rate, dtype: float64

### Previous Grades

In [17]:
raw_df['Previous Grades'].describe()

count    36123.000000
mean        65.447142
std         16.531608
min          8.300000
25%         55.100000
50%         65.200000
75%         75.200000
max        200.000000
Name: Previous Grades, dtype: float64

In [18]:
raw_df['Previous Grades'] = raw_df['Previous Grades'].apply(lambda x: x if x<=100 else np.nan)

In [19]:
raw_df['Previous Grades'].describe()

count    35660.000000
mean        64.672925
std         14.509373
min          8.300000
25%         54.900000
50%         64.900000
75%         74.700000
max        100.000000
Name: Previous Grades, dtype: float64

### Participation in Extracurricular Activities

In [20]:
raw_df['Participation in Extracurricular Activities'].value_counts(dropna=False)

Participation in Extracurricular Activities
No     18066
Yes    18032
NaN     1902
Name: count, dtype: int64

In [21]:
total = raw_df['Participation in Extracurricular Activities'].value_counts()
yes = raw_df[raw_df['Passed']=='Yes']['Participation in Extracurricular Activities'].value_counts()

In [22]:
raw_df.drop(columns='Participation in Extracurricular Activities', inplace=True)
raw_df

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Parent Education Level,Passed
0,S00001,12.5,,75.0,Master,Yes
1,S00002,9.3,95.3,60.6,High School,No
2,S00003,13.2,,64.0,Associate,No
3,S00004,17.6,76.8,62.4,Bachelor,No
4,S00005,8.8,89.3,72.7,Master,No
...,...,...,...,...,...,...
39995,S39996,15.6,93.8,51.4,Master,No
39996,S39997,11.3,66.4,64.2,Doctorate,Yes
39997,S39998,13.1,65.6,38.1,Bachelor,No
39998,S39999,14.1,74.9,,Master,No


### Parent Education Level

In [23]:
raw_df['Parent Education Level'].value_counts(dropna=False)

Parent Education Level
Bachelor       7290
High School    7264
Doctorate      7250
Associate      7241
Master         7057
NaN            1898
Name: count, dtype: int64

In [24]:
total = raw_df['Parent Education Level'].value_counts()
yes = raw_df[raw_df['Passed']=='Yes']['Parent Education Level'].value_counts()

In [25]:
yes/total*100

Parent Education Level
Associate      49.233531
Bachelor       50.150892
Doctorate      50.800000
High School    50.151432
Master         49.695338
Name: count, dtype: float64

In [26]:
raw_df.drop(columns='Parent Education Level', inplace=True)
raw_df

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Passed
0,S00001,12.5,,75.0,Yes
1,S00002,9.3,95.3,60.6,No
2,S00003,13.2,,64.0,No
3,S00004,17.6,76.8,62.4,No
4,S00005,8.8,89.3,72.7,No
...,...,...,...,...,...
39995,S39996,15.6,93.8,51.4,No
39996,S39997,11.3,66.4,64.2,Yes
39997,S39998,13.1,65.6,38.1,No
39998,S39999,14.1,74.9,,No


In [27]:
raw_df.drop(columns='Student ID', inplace=True)
raw_df

Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades,Passed
0,12.5,,75.0,Yes
1,9.3,95.3,60.6,No
2,13.2,,64.0,No
3,17.6,76.8,62.4,No
4,8.8,89.3,72.7,No
...,...,...,...,...
39995,15.6,93.8,51.4,No
39996,11.3,66.4,64.2,Yes
39997,13.1,65.6,38.1,No
39998,14.1,74.9,,No


## Preprocessing

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [29]:
numeric_cols = raw_df.select_dtypes(include=np.number).columns.tolist()
numeric_cols

['Study Hours per Week', 'Attendance Rate', 'Previous Grades']

In [30]:
imputer = SimpleImputer()

In [31]:
imputer.fit(raw_df[numeric_cols])

In [32]:
raw_df[numeric_cols] = imputer.transform(raw_df[numeric_cols])

In [33]:
raw_df.isna().sum()

Study Hours per Week    0
Attendance Rate         0
Previous Grades         0
Passed                  0
dtype: int64

In [34]:
scaler = StandardScaler()

In [35]:
scaler.fit(raw_df[numeric_cols])

In [36]:
raw_df[numeric_cols] = scaler.transform(raw_df[numeric_cols])

In [37]:
raw_df[numeric_cols].describe()

Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades
count,38000.0,38000.0,38000.0
mean,-2.9917590000000004e-17,2.185854e-16,-4.4502410000000006e-17
std,1.000013,1.000013,1.000013
min,-2.263491,-4.552577,-4.010783
25%,-0.6980721,-0.55921,-0.6383996
50%,0.0,0.0,0.0
75%,0.6468653,0.6943712,0.6635964
max,4.880111,1.876873,2.513427


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
inputs = raw_df[numeric_cols]
targets = raw_df['Passed']

In [40]:
targets = targets.apply(lambda x: 1 if x=='Yes' else 0)

In [41]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(inputs, targets,
                                                                        test_size=0.2,
                                                                        random_state=42)

In [42]:
train_inputs.shape, val_inputs.shape

((30400, 3), (7600, 3))

## Model

### ML Model

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [44]:
classifier = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'XGB': XGBClassifier()
}
result = {'Classifier': [], 'Accurary': []}

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
for clf_name, clf in classifier.items():
    clf.fit(train_inputs, train_targets)
    preds = clf.predict(val_inputs)
    acc = accuracy_score(val_targets, preds)
    result['Classifier'].append(clf_name)
    result['Accurary'].append(acc)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,Classifier,Accurary
0,LogisticRegression,0.491842
1,DecisionTree,0.506053
2,RandomForest,0.503816
3,XGB,0.498026


In [49]:
def test_params(**params):
    model = RandomForestClassifier(**params)
    model.fit(train_inputs, train_targets)
    train_acc = model.score(train_inputs, train_targets)
    val_acc = model.score(val_inputs, val_targets)
    return train_acc, val_acc

In [50]:
test_params(n_jobs=-1)

(0.9949342105263158, 0.5011842105263158)

In [51]:
no_preds = pd.Series(np.zeros(len(val_targets)))
no_preds

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7595    0.0
7596    0.0
7597    0.0
7598    0.0
7599    0.0
Length: 7600, dtype: float64

In [52]:
accuracy_score(val_targets, no_preds)

0.5051315789473684

In [53]:
for i in [5,10,30,50,100,200,300,600]:
    train_acc, val_acc = test_params(n_jobs=-1, n_estimators=i)
    print("Test {}: train_acc = {:6f}, val_acc = {:6f}".format(i, train_acc, val_acc))

Test 5: train_acc = 0.941941, val_acc = 0.492895
Test 10: train_acc = 0.972829, val_acc = 0.496447
Test 30: train_acc = 0.993980, val_acc = 0.503158
Test 50: train_acc = 0.994803, val_acc = 0.500921
Test 100: train_acc = 0.994934, val_acc = 0.497500
Test 200: train_acc = 0.994934, val_acc = 0.499079
Test 300: train_acc = 0.994934, val_acc = 0.502105
Test 600: train_acc = 0.994934, val_acc = 0.500000


### Feed Forward NN Model

In [54]:
import torch.nn as nn
import torch.nn.functional as F
import torch

In [55]:
def accuracy(output, target):
    _, pred = torch.max(output, dim=1)
    return torch.tensor(torch.sum(pred == target).item()/len(target))

In [56]:
class BaseModel(nn.Module):
    def training_step(self, batch):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs[:,0])
        loss = F.binary_cross_entropy(probs, targets)
        return loss

    def validation_step(self, batch):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs[:,0])
        loss = F.binary_cross_entropy(probs, targets)
        preds = (probs > 0.5).int()
        acc = accuracy_score(targets, preds)
        return {'val_loss': loss.item(), 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = np.mean(batch_losses)
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = np.mean(batch_accs)
        return {'val_loss': epoch_loss, 'val_acc': epoch_acc}

    def epoch_end(self, epoch, result):
        print('Epoch {}, val_loss: {:4f}, val_acc: {:4f}'.format(epoch,
                                                                 result['val_loss'],
                                                                 result['val_acc']))

In [57]:
input_tensors = torch.tensor(inputs.values, dtype=torch.float32)
target_tensors = torch.tensor(targets.values, dtype=torch.float32)

In [58]:
input_tensors.shape, target_tensors.shape

(torch.Size([38000, 3]), torch.Size([38000]))

In [59]:
from torch.utils.data import TensorDataset, DataLoader
raw_ds = TensorDataset(input_tensors, target_tensors)
raw_ds[0]

(tensor([0.4925, 0.0000, 0.7347]), tensor(1.))

In [60]:
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x29c19399db0>

In [61]:
val_size = 7600
train_size = len(raw_ds) - val_size
from torch.utils.data import random_split
train_ds, val_ds = random_split(raw_ds, [train_size, val_size])

In [63]:
len(train_ds), len(val_ds)

(30400, 7600)

In [64]:
batch_size = 32

In [65]:
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size*2)

In [66]:
class TwoLayerFeedForwardModel(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.linear1 = nn.Linear(3, 10)
        self.linear2 = nn.Linear(10, 1)
    
    def forward(self, xb):
        out = self.linear1(xb)
        out = F.relu(out)
        out = self.linear2(out)
        return out

In [67]:
two_layer_model = TwoLayerFeedForwardModel()

In [68]:
def evaluate(model: BaseModel, val_dl: DataLoader):
    outputs = [model.validation_step(batch) for batch in val_dl]
    return model.validation_epoch_end(outputs)

In [69]:
def fit(epochs: int, lr: float, model: BaseModel, 
        train_dl: DataLoader, val_dl: DataLoader, opt_func=torch.optim.SGD):
    history = []
    opt = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        for batch in train_dl:
            loss = model.training_step(batch)
            loss.backward()
            opt.step()
            opt.zero_grad()
        result = evaluate(model, val_dl)
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [70]:
history = [evaluate(two_layer_model, val_dl)]
history

[{'val_loss': 0.7059972922341162, 'val_acc': 0.5046393557422969}]

In [71]:
history += fit(5, 0.1, two_layer_model, train_dl, val_dl)

Epoch 0, val_loss: 0.693577, val_acc: 0.503501
Epoch 1, val_loss: 0.693465, val_acc: 0.501094
Epoch 2, val_loss: 0.693212, val_acc: 0.503720
Epoch 3, val_loss: 0.693545, val_acc: 0.500744
Epoch 4, val_loss: 0.693301, val_acc: 0.501488


In [72]:
history += fit(5, 0.05, two_layer_model, train_dl, val_dl)

Epoch 0, val_loss: 0.693252, val_acc: 0.501532
Epoch 1, val_loss: 0.693693, val_acc: 0.500963
Epoch 2, val_loss: 0.693231, val_acc: 0.493479
Epoch 3, val_loss: 0.693243, val_acc: 0.504289
Epoch 4, val_loss: 0.693295, val_acc: 0.500525
