## Import required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## Understanding the data

In [None]:
# Location of the raw CSV (presumably a Colab session path - adjust when running elsewhere)
DATA_PATH = '/content/StudentPerformanceFactors.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Count the missing (null) values in each column
df.isnull().sum()

Observations:
- Teacher Quality -> 78
- Parental Education Level -> 90
- Distance From Home -> 67

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

Observations:
- No duplicates are present

In [None]:
# Check each column's data type and non-null count
df.info()

In [None]:
# Number of distinct values in each column (helps tell continuous from categorical features)
df.nunique()

Observations:

Most unique values are contained in:
- Hours Studied
- Attendance
- Previous Scores
- Exam Scores

In [None]:
# Summary statistics of the dataset
df.describe()

(Measures of Central Tendency)
- Hours Studied: mean, median ~ 20 hours, most students study around 20 hours
- Attendace: mean, median ~ 80%, most students have good attendace
- Sleep Hours: mean, median ~ 7 hours, most students sleep 7 hours on avg
- Previous Scores: mean, median ~ 75, min ~ 50, student performance is decent at worst
- Physical Activity: mean, median ~3, most students are decently active
- Exam Scores: mean, median = 67, scores are centered around 67

(Measures of Dispersion)
- Hours Studied: std ~ 6, range: 1-44, High Variability
- Attendance: Std ~ 11.5, range: 60-100, Few students have low attendance
- Sleep Hours: Std ~ 1.4, range: 4-10, some students tend to get less hours of sleep
- Previous Scores: Std ≈ 14.4, Range: 50–100 → Wide range in past scores.
- Tutoring Sessions: Std ≈ 1.23, Range: 0–8 → Most students attend few sessions; some take many.
- Physical Activity: Std ≈ 1.03, Range: 0–6 → Most students do 2–4 hours of activity.
- Exam Score: Std ≈ 3.89, Range 55–101 → Exam scores are relatively concentrated but outliers exist

Observations:
Hours Studied, Previous Scores, Tutoring Sessions show greater variability


## Exploring Data

In [None]:
# Print the distinct levels of every categorical column.
# A single loop replaces the copy-pasted print pairs; the output format is unchanged.
# School_Type is included because it is label-encoded later as a categorical
# feature but was missing from this inspection.
categorical_columns_to_inspect = [
    'Parental_Involvement',
    'Access_to_Resources',
    'Extracurricular_Activities',
    'Motivation_Level',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'Peer_Influence',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
    'School_Type',
]

for column in categorical_columns_to_inspect:
  print(f"Categories in '{column}' variable:", df[column].unique())

In [None]:
#defining numerical and categorical columns
# select_dtypes is used instead of comparing against the object dtype ('O'):
# with the old test, any column stored with a non-object, non-numeric dtype
# (e.g. a pandas string or category dtype) was reported as numeric.
numeric_features = df.select_dtypes(include='number').columns.tolist()
# Everything that is not numeric is treated as categorical, in column order
categorical_features = [feature for feature in df.columns if feature not in numeric_features]

print(f"We have {len(numeric_features)} numeric features: {numeric_features}")
print(f"We have {len(categorical_features)} categorical features: {categorical_features}")

## Dealing with missing values

In [None]:
# Names of the columns that contain at least one missing value
has_missing = df.isnull().any()
null_columns = list(df.columns[has_missing])
print(null_columns)

From previous observations:
- Teacher Quality -> 78, Categorical
- Parental Education Level -> 90, Categorical
- Distance From Home -> 67, Categorical

As all features with null values are categorical, and only make up a small fraction of the dataset, they can be filled with the most recurring values



In [None]:
# Most frequent Parental_Education_Level value (first entry of the mode Series)
df['Parental_Education_Level'].mode()[0]

In [None]:
# Impute every column that has missing values with its most frequent value (mode).
for col in df.columns:
  if df[col].isnull().any():
    mode_value = df[col].mode()[0]
    # Assign the filled column back to the frame. Calling
    # df[col].fillna(..., inplace=True) works on a temporary Series selected
    # from df (chained assignment), which pandas warns about and which leaves
    # df unchanged under Copy-on-Write.
    df[col] = df[col].fillna(mode_value)

In [None]:
# Re-count missing values per column to confirm the imputation step
df.isnull().sum()

## Encoding Categorical Columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the nominal columns into new `<column>_encoded` columns,
# then remove the original text columns from the frame.
le = LabelEncoder()

drop_features = ['Extracurricular_Activities', 'Internet_Access', 'Learning_Disabilities', 'Gender', 'School_Type']

# The single encoder is re-fitted on each column in turn
for column in drop_features:
  df[f'{column}_encoded'] = le.fit_transform(df[column])

df.drop(drop_features, axis=1, inplace=True)


In [None]:
# Ordinal Encoding
# Each ordinal feature gets a new `<column>_Encode` column holding its rank;
# the original text columns are dropped only after every encoded column exists.
#
# Fixes:
# - Distance_from_Home and Parental_Education_Level used to be mapped back into
#   their own column names and were then removed by the drop below, so both
#   features disappeared from the dataset. They now get `_Encode` columns like
#   the other ordinal features.
# - Peer_Influence is ranked Negative < Neutral < Positive so the codes follow
#   a real order (it was Neutral=0, Negative=1, Positive=2).
Parental_Involvement_map = {'Low':0, 'Medium':1, 'High':2}
Access_to_Resources_map = {'Low':0, 'Medium':1, 'High':2}
Motivation_Level_map = {'Low':0, 'Medium':1, 'High':2}
Family_Income_map = {'Low':0, 'Medium':1, 'High':2}
Teacher_Quality_map = {'Low':0, 'Medium':1, 'High':2}
Peer_Influence_map = {'Negative':0, 'Neutral':1, 'Positive':2}
Distance_from_Home_map = {'Near':0, 'Moderate':1, 'Far':2}
Parental_Education_Level_map = {'High School':0, 'College':1, 'Postgraduate':2}

# Source column -> rank mapping, in the order the columns are encoded and dropped
ordinal_maps = {
    'Parental_Involvement': Parental_Involvement_map,
    'Access_to_Resources': Access_to_Resources_map,
    'Motivation_Level': Motivation_Level_map,
    'Family_Income': Family_Income_map,
    'Teacher_Quality': Teacher_Quality_map,
    'Peer_Influence': Peer_Influence_map,
    'Distance_from_Home': Distance_from_Home_map,
    'Parental_Education_Level': Parental_Education_Level_map,
}

for column, mapping in ordinal_maps.items():
  # Labels missing from the mapping become NaN in the encoded column
  df[column + '_Encode'] = df[column].map(mapping)

drop_features_ord = list(ordinal_maps)
df.drop(drop_features_ord, axis=1, inplace = True)


In [None]:
# Inspect the frame after the encoding steps
df.head()

## Data Visualizations

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

Observations:
-

In [None]:
# Correlation of every feature with the target, leaving out the target's own entry
exam_score_corr = corr_matrix['Exam_Score']
target_corr = exam_score_corr.drop(labels='Exam_Score')
target_corr

Using a threshold of 0.1 to find what features to focus on:
- Hours Studied
- Attendance
- Previous_Scores
- Tutoring_Sessions
- Parental_Involvement_Encode
- Access_to_Resources_Encode
- Family_Income_Encode
- Learning_Disabilities_encoded

In [None]:
# Histogram & KDE of the exam score: overall (left) and stacked by tutoring sessions (right)
fig, (ax_overall, ax_tutoring) = plt.subplots(1, 2, figsize=(10, 6))

sns.histplot(data=df, x='Exam_Score', bins=30, kde=True, color='g', ax=ax_overall)
ax_overall.set_title("Exam Score Distribution")

sns.histplot(
    data=df,
    x='Exam_Score',
    hue='Tutoring_Sessions',
    multiple="stack",
    bins=30,
    kde=True,
    ax=ax_tutoring,
)
ax_tutoring.set_title("Exam Score by Tutoring Sessions")

fig.tight_layout()
plt.show()

Observations:
- Most students have scored between 60 and 70

Most students cluster near the average, with fewer at the very low or very high end

Scores are fairly normally distributed; no extreme skewness. Most students perform within a narrow band

- Students with more tutoring sessions (higher numbers) have exam scores slightly shifted to the right
- Students with 0 sessions cluster slightly left

In [None]:
# Exam score distributions split by parental involvement (left)
# and stacked by learning disabilities (right)
fig, (ax_involvement, ax_disabilities) = plt.subplots(1, 2, figsize=(10, 6))

sns.histplot(
    data=df,
    x='Exam_Score',
    hue='Parental_Involvement_Encode',
    bins=30,
    kde=True,
    ax=ax_involvement,
)
ax_involvement.set_title("Exam Score Distribution by Parental Involvement")

sns.histplot(
    data=df,
    x='Exam_Score',
    hue='Learning_Disabilities_encoded',
    multiple="stack",
    bins=30,
    kde=True,
    ax=ax_disabilities,
)
ax_disabilities.set_title("Exam Score by Learning Disabilities")

fig.tight_layout()
plt.show()

- Students with high parental involvement (2) tend to score higher, with their curve shifted slightly right.
- Low parental involvement (0) has a curve shifted left (lower average scores).
- Students without disabilities (blue) score higher and are more concentrated around the mid-to-high 70s
- Students with disabilities (orange) tend to score lower and are spread out more towards the lower 60s

In [None]:
# Box plots of the numeric features, one panel per feature, to show spread and outliers.
# The grid is sized from the number of features instead of a fixed 3x3, so the
# cell no longer fails once `numeric_features` holds more than nine columns.
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numeric_features):
  sns.boxplot(df[col],
              color = '#ff4252',
              ax = ax,
              )
  ax.set_title(col)

# Hide grid slots left over when the feature count is not a multiple of n_cols
for ax in axes[len(numeric_features):]:
  ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# NOTE: disabled experiment - z-score outlier removal. Nothing in this cell executes.
# If re-enabled it would (1) rebind `numeric_features` to every numeric column of the
# current frame, (2) keep only rows where all numeric columns have |z| < 3, and
# (3) replace `df` with the filtered frame.

# numeric_features = df.select_dtypes(include = np.number).columns
# data_numeric = df[numeric_features]

# z = np.abs((data_numeric - data_numeric.mean())/ data_numeric.std())
# threshold = 3
# df_clean = df[(z < threshold).all(axis=1)]


# print(f"Original rows: {df.shape[0]}")
# print(f"Rows after outlier removal: {df_clean.shape[0]}")
# print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")

# df = df_clean