# Academic Performance Prediction

Dataset: `academic_performance_dataset.csv`


## 1. Imports & Setup

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, mean_absolute_error, mean_squared_error, r2_score


## 2. Load & Preview Data

In [25]:
# Read the raw dataset from the working directory.
csv_path = 'academic_performance_dataset.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) shape of the dataset.
n_rows_cols = df.shape
print("Dataset shape:", n_rows_cols)

# Preview the first rows; as the cell's last expression this renders as a table.
df.head()

Dataset shape: (1000, 11)


Unnamed: 0,age,study_hours,attendance_rate,assignments_submitted,parental_support,internet_access,gender,school_type,extracurricular_hours,sleep_hours,final_grade
0,19.0,12.606539,99.932696,0.0,1.0,1.0,Female,Private,4.675619,9.71085,4.656857
1,15.0,1.081419,97.223448,,1.0,1.0,Female,Private,6.494556,6.488781,1.463881
2,17.0,14.880645,50.13355,1.0,4.0,0.0,Male,Private,0.48431,9.298099,13.195475
3,18.0,11.292144,89.768916,9.0,,,Male,Public,,6.689408,24.201369
4,18.0,23.444111,67.680264,4.0,5.0,0.0,Female,Public,8.276423,9.200973,20.357069


## 3. Exploratory Data Analysis (EDA)

In [26]:
# Exploratory Data Analysis
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() only appends a stray "None".
df.info()

# Count of missing values per column.
print("\nMissing values:")
print(df.isnull().sum())

# Summary statistics for the numeric columns. This is the only describe()
# table in the cell: an unlabelled df.describe() ahead of it would repeat the
# same numeric summary, since describe() analyses numeric columns by default.
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("\nBasic stats:")
print(df[numeric_cols].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    912 non-null    float64
 1   study_hours            889 non-null    float64
 2   attendance_rate        888 non-null    float64
 3   assignments_submitted  899 non-null    float64
 4   parental_support       902 non-null    float64
 5   internet_access        899 non-null    float64
 6   gender                 903 non-null    object 
 7   school_type            894 non-null    object 
 8   extracurricular_hours  898 non-null    float64
 9   sleep_hours            903 non-null    float64
 10  final_grade            1000 non-null   float64
dtypes: float64(9), object(2)
memory usage: 86.1+ KB
None
              age  study_hours  attendance_rate  assignments_submitted  \
count  912.000000   889.000000       888.000000             899.000000   
mean    17.6

## 4. Data Cleaning & Preprocessing

In [19]:
# Separate the predictors from the target before building any transformers.
X = df.drop('final_grade', axis=1)
y = df['final_grade']

# Split the predictor columns by dtype. np.number (the same selector the EDA
# cell uses) matches every numeric width, not only int64/float64; a numeric
# column missed here would be listed in neither group, and ColumnTransformer
# (default remainder='drop') would discard it without any warning.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes a category not seen during fit as
# all zeros instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])


## 5. Modeling & Evaluation

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Chain preprocessing and the random forest into a single estimator, so the
# imputers, scaler and encoder are fitted on the training rows only.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

# Fit on the training split, then predict the held-out split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report RMSE and R^2 on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("\nModel Performance:")
print(f"Grade predictions/RMSE: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")


Model Performance:
Grade predictions/RMSE: 5.16
R^2 Score: 0.28
