<a href="https://colab.research.google.com/github/vermanaman419-prog/Predicting-Student-Test-Scores-With-The-Help-Of-ML/blob/main/Predicting_Student_Test_Scores_With_The_Help_Of_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Student Test Scores With The Help Of ML



Setup - Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor


ðŸ”¹ STEP 1: Loading the data

In [2]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

print(train.shape)
print(test.shape)

train.head()


(630000, 13)
(270000, 12)


Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


ðŸ”¹ STEP 2: Identifying target & features

In [3]:
TARGET = "exam_score"

X = train.drop(columns=[TARGET])
y = train[TARGET]


ðŸ”¹ STEP 3: Identifying numeric & categorical columns

In [4]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_features, categorical_features


(Index(['id', 'age', 'study_hours', 'class_attendance', 'sleep_hours'], dtype='object'),
 Index(['gender', 'course', 'internet_access', 'sleep_quality', 'study_method',
        'facility_rating', 'exam_difficulty'],
       dtype='object'))

ðŸ”¹ STEP 4: Preprocessing pipeline

In [7]:
numeric_transformer = "passthrough"

categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


ðŸ”¹ STEP 5: Model (Strong baseline)

In [8]:
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)


ðŸ”¹ STEP 6: Full pipeline (preprocessing + model)

In [9]:
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)


ðŸ”¹ STEP 7: Trainâ€“validation split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ðŸ”¹ STEP 8: Train the model

In [11]:
pipeline.fit(X_train, y_train)


ðŸ”¹ STEP 9: Validate using RMSE (competition metric)

In [14]:
val_preds = pipeline.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
rmse

np.float64(9.129401230849538)

ðŸ”¹ STEP 10: Train on full data

In [15]:
pipeline.fit(X, y)

ðŸ”¹ STEP 11: Predict on test set

In [16]:
test_preds = pipeline.predict(test)

ðŸ”¹ STEP 12: Creating file (csv)

In [17]:
submission = pd.DataFrame({
    "id": test["id"],
    "exam_score": test_preds
})

submission.head()

Unnamed: 0,id,exam_score
0,630000,70.300805
1,630001,70.639795
2,630002,88.539982
3,630003,49.553807
4,630004,46.96551


ðŸ”¹ STEP 13: Saving CSV

In [18]:
submission.to_csv("submission.csv", index=False)

In [19]:
import joblib

joblib.dump(pipeline, "student_score_model.joblib")


['student_score_model.joblib']