## Imports

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub


import os

from sklearn.model_selection import train_test_split


# Download Dataset

In [68]:
# Fetch the latest version of the dataset through kagglehub;
# the call returns the directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "kundanbedmutha/exam-score-prediction-dataset"
)

# Full location of the CSV file inside that directory
PATH = os.path.join(path, "Exam_Score_Prediction.csv")
print("Path to dataset files:", PATH)

Path to dataset files: /home/franio/.cache/kagglehub/datasets/kundanbedmutha/exam-score-prediction-dataset/versions/2/Exam_Score_Prediction.csv


# Getting Data, tidying up data

In [69]:
# Load the CSV and discard the identifier column, which is not needed
data = pd.read_csv(PATH).drop(columns='student_id')

# Remove every row whose gender is recorded as 'other';
# the surviving rows keep their original index labels.
data = data.drop(index=data.index[data['gender'] == 'other'])

data

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
2,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
4,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7
5,23,male,b.tech,3.48,65.4,yes,4.2,good,mixed,low,moderate,58.2
6,17,female,b.tech,1.35,69.0,yes,7.4,average,online videos,high,hard,53.7
...,...,...,...,...,...,...,...,...,...,...,...,...
19992,17,male,b.com,5.74,81.4,yes,7.5,good,online videos,high,hard,95.0
19993,19,female,ba,3.28,73.3,yes,9.5,good,group study,high,easy,93.9
19996,18,male,b.com,3.71,41.6,no,5.9,average,coaching,medium,moderate,60.9
19998,19,male,bba,4.60,76.3,no,6.1,good,self-study,medium,moderate,79.0


# Feature Engineering
Some data is qualitative, so we need to make it quantitative

## Qualitative data

In [70]:
# Columns that hold category labels rather than numbers
qualitative_features_name = [
    'gender',
    'course',
    'internet_access',
    'sleep_quality',
    'study_method',
    'facility_rating',
    'exam_difficulty',
]

# One row per qualitative column, listing the distinct labels found in it
unique_qualitative_features = pd.DataFrame.from_dict(
    {name: data[name].unique() for name in qualitative_features_name},
    orient='index',
)
unique_qualitative_features

Unnamed: 0,0,1,2,3,4,5,6
gender,male,female,,,,,
course,diploma,b.sc,b.tech,bba,ba,b.com,bca
internet_access,yes,no,,,,,
sleep_quality,poor,good,average,,,,
study_method,coaching,mixed,online videos,self-study,group study,,
facility_rating,low,high,medium,,,,
exam_difficulty,hard,moderate,easy,,,,


## Encoding qualitative features

In [71]:
# Encode on a copy so `data` keeps its original string labels
data_copy = data.copy()

# Integer code for each category label, keyed by column name.
# A label absent from its mapping would come out of `Series.map` as NaN.
category_codes = {
    # Binary features
    'gender': {'male': 1, 'female': 0},
    'internet_access': {'yes': 1, 'no': 0},
    # Ordinal features: the codes follow the natural order of the labels
    'sleep_quality': {'poor': 0, 'average': 1, 'good': 2},
    'facility_rating': {'low': 0, 'medium': 1, 'high': 2},
    'exam_difficulty': {'easy': 0, 'moderate': 1, 'hard': 2},
}

for column, codes in category_codes.items():
    data_copy[column] = data[column].map(codes)

# `course` and `study_method` are not encoded in this cell and stay as strings
data_copy

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,17,1,diploma,2.78,92.9,1,7.4,0,coaching,0,2,58.9
2,22,1,b.sc,7.88,76.8,1,8.5,0,coaching,2,1,90.3
4,20,0,diploma,0.89,71.6,1,9.8,0,coaching,0,1,43.7
5,23,1,b.tech,3.48,65.4,1,4.2,2,mixed,0,1,58.2
6,17,0,b.tech,1.35,69.0,1,7.4,1,online videos,2,2,53.7
...,...,...,...,...,...,...,...,...,...,...,...,...
19992,17,1,b.com,5.74,81.4,1,7.5,2,online videos,2,2,95.0
19993,19,0,ba,3.28,73.3,1,9.5,2,group study,2,0,93.9
19996,18,1,b.com,3.71,41.6,0,5.9,1,coaching,1,1,60.9
19998,19,1,bba,4.60,76.3,0,6.1,2,self-study,1,1,79.0


# Data Split

In [72]:
# Predictors are every column except the target; the target is the exam score
X = data_copy.drop(columns='exam_score')
y = data_copy['exam_score']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

x_train

Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty
1747,20,1,b.sc,5.57,43.4,0,5.4,1,self-study,2,0
2351,23,0,diploma,3.52,61.1,1,4.6,0,coaching,2,1
7551,20,0,b.tech,1.83,58.6,1,9.3,1,group study,1,1
13665,19,1,b.tech,7.17,44.1,1,4.5,0,coaching,1,2
10989,23,1,b.tech,5.06,86.1,1,8.7,0,coaching,2,0
...,...,...,...,...,...,...,...,...,...,...,...
18021,20,1,bca,0.39,58.0,1,6.0,2,online videos,0,1
7776,23,0,b.tech,2.44,74.7,0,5.1,2,self-study,2,1
8106,20,0,bba,0.90,92.3,0,9.3,2,group study,2,2
1304,21,1,bba,7.51,72.8,0,4.7,0,coaching,2,0


# Data Exploration and Visualisations

## Target Data

In [None]:
# Distribution of the target in the training split, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(y_train)
ax.grid(False)
ax.set(
    title='Histogram of exam score',
    xlabel='Exam Score',
    ylabel='Number of Student',
)
plt.show()

## Correlation matrix

In [None]:
# Column labels and feature frame of the training split (all columns)
names = x_train.columns
df_features = pd.DataFrame(x_train, columns=names)

# Pearson correlation is only defined for numeric columns, and `x_train` still
# carries the string-valued `course` and `study_method` columns. Restrict the
# frame to numeric dtypes explicitly: without this, `.corr()` raises on
# pandas >= 2.0, and on older versions it silently drops those columns, which
# leaves the tick labels below out of sync with the matrix.
numeric_features = df_features.select_dtypes(include='number')
corr_mtx = numeric_features.corr(method='pearson')

# Take the tick labels from the matrix itself so they always match its shape
corr_labels = corr_mtx.columns

plt.matshow(corr_mtx, cmap="RdBu_r")
plt.colorbar()

plt.xticks(range(len(corr_labels)), corr_labels, rotation=90)
plt.yticks(range(len(corr_labels)), corr_labels)
plt.grid(False)
plt.show()