Predict students’ exam scores using academic habits, lifestyle factors, and learning resources.
Submissions are evaluated using Root Mean Squared Error (RMSE).

Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor


Load the Dataset

In [2]:
# Load the train and test CSVs from the Kaggle input directory in one pass;
# tuple unpacking consumes the map, so both files are read right here.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/exam-score-prediction/train.csv",
        "/kaggle/input/exam-score-prediction/test.csv",
    ),
)

# Preview the first rows of the training frame as the cell output.
train_df.head()


Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


Basic Data Check

In [3]:
# Print the (rows, columns) shape of each split.
for caption, frame in zip(("Train shape:", "Test shape:"), (train_df, test_df)):
    print(caption, frame.shape)

# Per-column dtypes, non-null counts, and memory footprint of the training frame.
train_df.info()


Train shape: (630000, 13)
Test shape: (270000, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                630000 non-null  int64  
 1   age               630000 non-null  int64  
 2   gender            630000 non-null  object 
 3   course            630000 non-null  object 
 4   study_hours       630000 non-null  float64
 5   class_attendance  630000 non-null  float64
 6   internet_access   630000 non-null  object 
 7   sleep_hours       630000 non-null  float64
 8   sleep_quality     630000 non-null  object 
 9   study_method      630000 non-null  object 
 10  facility_rating   630000 non-null  object 
 11  exam_difficulty   630000 non-null  object 
 12  exam_score        630000 non-null  float64
dtypes: float64(4), int64(2), object(7)
memory usage: 62.5+ MB


Separate Features and Target

In [4]:
# The regression target is the exam_score column ...
y = train_df["exam_score"]
# ... and the feature frame is every remaining column (this still includes `id`).
X = train_df.drop("exam_score", axis="columns")


Train–Validation Split

In [5]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Preprocessing Setup

In [6]:
# Numeric columns are handed to the model unchanged.
numeric_features = ["age", "study_hours", "class_attendance", "sleep_hours"]

# Object-dtype (string) columns are one-hot encoded.
categorical_features = [
    "gender",
    "course",
    "internet_access",
    "sleep_quality",
    "study_method",
    "facility_rating",
    "exam_difficulty",
]

# Only the columns named above reach the model; anything else in the frame
# (e.g. `id`) is left out by ColumnTransformer's default remainder handling.
# handle_unknown="ignore" keeps transform from raising on categories that were
# not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    [
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


Model Selection

Use a Random Forest regressor as a strong baseline model.

In [7]:
# Baseline regressor: 300 trees capped at depth 20, seeded for repeatability,
# with n_jobs=-1 so fitting and prediction run in parallel.
model = RandomForestRegressor(n_estimators=300, max_depth=20, n_jobs=-1, random_state=42)


Build Pipeline

In [8]:
# Chain preprocessing and the regressor so the encoder is always fitted on
# exactly the rows the model is fitted on.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])


Train the Model

In [9]:
# Fit the encoder and the forest on the training split only.
pipeline.fit(X=X_train, y=y_train)


Validate Using RMSE

In [10]:
# Score the held-out rows and report RMSE, the competition metric.
val_preds = pipeline.predict(X=X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_preds))

print("Validation RMSE:", rmse)

# A lower RMSE means a better model.

Validation RMSE: 9.018192594659899


Train on Full Data

In [11]:
# Refit the same pipeline object on all labelled rows (training + validation).
# NOTE(review): this replaces the fit from the training split, so re-running
# the validation cell after this one would score a model that has seen X_val.
pipeline.fit(X=X, y=y)


Predict on Test Set

In [12]:
# Predict exam scores for the test rows; the whole frame is passed in and the
# pipeline's preprocessor picks out the columns it was configured with.
test_preds = pipeline.predict(X=test_df)
# Keep the row ids so each prediction can be labelled in the submission.
test_ids = test_df["id"]


Submission File

In [13]:
# Pair each test id with its predicted score.
submission = pd.DataFrame(
    data={"id": test_ids, "exam_score": test_preds},
)

# Write the submission without the DataFrame index, then preview it.
submission.to_csv(path_or_buf="submission.csv", index=False)
submission.head()


Unnamed: 0,id,exam_score
0,630000,68.241174
1,630001,71.499445
2,630002,85.05013
3,630003,53.161024
4,630004,48.491653
