In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [3]:
# Read the raw student-performance data; the path is relative to the
# notebook's working directory.
raw_csv_path = "../data/raw/studentsPerformance.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows to confirm the file loaded as expected.
df.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Combined score: element-wise sum of the three exam columns.
df["total_score"] = (
    df["math score"]
    .add(df["reading score"])
    .add(df["writing score"])
)

# Mean score across the three exams.
df["average_score"] = df["total_score"].div(3)

In [11]:
# Columns that are exam scores or derived from them; leaving any of these in
# the feature matrix would leak the target.
score_derived_columns = [
    "math score",
    "reading score",
    "writing score",
    "average_score",
    "total_score",
]

# Target: the combined score across the three exams.
y = df["total_score"]

# Features: every remaining column.
X = df.drop(columns=score_derived_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [12]:
# Pick numeric columns explicitly instead of "everything that is not object":
# the latter would also label bool, category, datetime and pandas string
# columns as numerical and route them to StandardScaler.
numerical_cols = X.select_dtypes(include="number").columns.tolist()

# Every non-numeric column is treated as categorical (original column order is
# preserved) and will be one-hot encoded.
categorical_cols = [col for col in X.columns if col not in numerical_cols]

categorical_cols, numerical_cols


(['gender',
  'race/ethnicity',
  'parental level of education',
  'lunch',
  'test preparation course'],
 [])

In [13]:
# Standardises numerical features.
numerical_transformer = StandardScaler()

# One-hot encodes categorical features into a dense array;
# handle_unknown="ignore" keeps transform() from raising on categories that
# were not seen during fit.
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")


In [14]:
# Each entry is (name, transformer, columns it applies to).
column_steps = [
    ("cat", categorical_transformer, categorical_cols),
    ("num", numerical_transformer, numerical_cols),
]

# Columns not listed in any step are dropped from the output.
preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")


In [15]:
# Learn the encoding from the training split only, so nothing about the test
# split influences the fitted preprocessor.
preprocessor.fit(X_train)

# Apply the fitted preprocessor to both splits.
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Both splits should end up with the same number of feature columns.
(X_train_processed.shape, X_test_processed.shape)


((800, 17), (200, 17))

In [16]:
# Persist the fitted preprocessor so it can be reloaded later.
preprocessor_path = "../models/preprocessor.pkl"

# joblib.dump does not create missing directories; make sure the target
# folder exists so this cell also works on a fresh checkout.
Path(preprocessor_path).parent.mkdir(parents=True, exist_ok=True)

# The path is passed as a plain string so the returned list of written
# filenames is displayed exactly as before.
joblib.dump(preprocessor, preprocessor_path)


['../models/preprocessor.pkl']

## Feature Engineering Summary

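- Target variable: `total_score` (the sum of the math, reading, and writing scores)
- Dropped all score-related columns (the three subject scores, `total_score`, and `average_score`) from the features to prevent data leakage
- Applied one-hot encoding to the categorical features; no numerical features remain after dropping the scores, so the `StandardScaler` branch is currently unused
- Built a reusable preprocessing pipeline using `ColumnTransformer`, fitted on the training split only
- Saved the fitted preprocessor to `../models/preprocessor.pkl` using joblib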
