# FEATURE ENGINEERING

In [1]:
import pandas as pd

# Load the cleaned interim dataset that the rest of the notebook works from.
df = pd.read_csv(filepath_or_buffer='../data/interim/data_cleaned.csv')

In [2]:
# `df` was already loaded from '../data/interim/data_cleaned.csv' by the previous
# cell, so reading the same file a second time is redundant. Instead, verify up
# front that every column this notebook drops, engineers from, encodes, or uses
# as the target is present, so a schema problem fails here with a clear message
# rather than as a KeyError further down.
required_cols = {
    # dropped as irrelevant
    'age', 'gender', 'internet_access', 'course', 'exam_difficulty',
    # numeric inputs
    'study_hours', 'class_attendance', 'sleep_hours',
    # categorical inputs
    'study_method', 'sleep_quality', 'facility_rating',
    # target
    'exam_score',
}
missing_cols = required_cols - set(df.columns)
assert not missing_cols, f"data_cleaned.csv is missing columns: {sorted(missing_cols)}"

#### Drop irrelevant columns

In [3]:
# Columns this notebook treats as irrelevant and removes before feature engineering.
drop_cols = ['age', 'gender', 'internet_access', 'course', 'exam_difficulty']

# DataFrame.drop returns a new frame, so the loaded `df` is left untouched.
df_fe = df.drop(labels=drop_cols, axis='columns')


#### Define target and feature groups

In [4]:
# Column the processed dataset carries as the prediction target.
target = 'exam_score'

# Raw numeric inputs (this list is extended with engineered columns in a later cell).
num_features = ['study_hours', 'class_attendance', 'sleep_hours']

# Categorical column that is one-hot encoded.
cat_onehot = ['study_method']

# Categorical columns that are ordinal-encoded.
cat_ordinal = ['sleep_quality', 'facility_rating']


#### Create new features

In [5]:
# Pull out the three raw numeric columns the engineered features are built from.
hours_studied = df_fe['study_hours']
attendance = df_fe['class_attendance']
hours_slept = df_fe['sleep_hours']

# Add the engineered columns; keyword order fixes the resulting column order.
df_fe = df_fe.assign(
    # interaction: study time x attendance
    study_attend_product=hours_studied * attendance,
    # 8 minus hours slept (negative when more than 8 hours were slept);
    # presumably 8 is the reference amount of sleep — TODO confirm
    sleep_deficit=8 - hours_slept,
    # squared study time
    study_hours_sq=hours_studied ** 2,
    # interaction: sleep time x attendance
    sleep_attend_product=hours_slept * attendance,
)

# Numeric feature group = raw numeric columns followed by the engineered ones.
raw_numeric = ['study_hours', 'class_attendance', 'sleep_hours']
engineered_numeric = [
    'study_attend_product',
    'sleep_deficit',
    'study_hours_sq',
    'sleep_attend_product',
]
num_features = raw_numeric + engineered_numeric

# Preview the model inputs: numeric, then one-hot, then ordinal columns.
df_fe[num_features + cat_onehot + cat_ordinal].head()

Unnamed: 0,study_hours,class_attendance,sleep_hours,study_attend_product,sleep_deficit,study_hours_sq,sleep_attend_product,study_method,sleep_quality,facility_rating
0,7.91,98.8,4.9,781.508,3.1,62.5681,484.12,online videos,average,low
1,4.95,94.8,4.7,469.26,3.3,24.5025,445.56,self-study,poor,medium
2,4.68,92.6,5.8,433.368,2.2,21.9024,537.08,coaching,poor,high
3,2.0,49.5,8.3,99.0,-0.3,4.0,410.85,group study,average,high
4,7.65,86.9,9.6,664.785,-1.6,58.5225,834.24,self-study,good,high


## Feature Encoding (categorical) and Scaling (numerical)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import joblib

# Explicit category orders for the ordinal columns; a category's position in the
# list is the integer code OrdinalEncoder assigns to it.
sleep_quality_order = ['poor', 'average', 'good']
facility_rating_order = ['low', 'medium', 'high']

# One transformer per feature group:
#   num    -> standardise the numeric columns
#   ord    -> encode the ordered categoricals using the orders above
#   onehot -> one-hot encode study_method, dropping the first category
# Columns not listed in any group are discarded (remainder='drop').
# NOTE(review): with drop='first' and handle_unknown='ignore', an unseen
# study_method value is encoded as all zeros, i.e. the same as the dropped
# first category — confirm this is acceptable for downstream use.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('ord', OrdinalEncoder(
            categories=[sleep_quality_order, facility_rating_order]
        ), cat_ordinal),
        ('onehot', OneHotEncoder(
            drop='first',
            handle_unknown='ignore'
        ), cat_onehot)
    ],
    remainder='drop'
)

# Fit BEFORE persisting. Dumping the freshly constructed transformer would save
# an unfitted object: it carries no scaling statistics or learned categories, so
# calling .transform() on it after joblib.load() fails. Fitting here on the same
# feature columns used later makes the saved artifact directly usable.
preprocessor.fit(df_fe[num_features + cat_onehot + cat_ordinal])

joblib.dump(preprocessor, '../data/processed/preprocessor.pkl')

['../data/processed/preprocessor.pkl']

#### Apply Feature Engineering (Fit & Transform)

In [7]:
# Model inputs in group order (numeric, one-hot, ordinal) and the target series.
feature_columns = num_features + cat_onehot + cat_ordinal
X, y = df_fe[feature_columns], df_fe[target]

# Fit the preprocessor on the inputs and transform them in one step.
X_processed = preprocessor.fit_transform(X)

In [8]:
# Output column names as reported by the fitted ColumnTransformer; each has the
# form "<transformer>__<feature>" (e.g. "num__study_hours").
feature_names = preprocessor.get_feature_names_out()

# Strip only the leading "<transformer>__" part by splitting on the FIRST "__",
# then replace spaces with underscores. Plain str methods are used so that
#   * a "num__" / "ord__" / "onehot__" sequence inside a feature name is kept, and
#   * the result does not depend on pandas' `regex` default for `.str.replace`.
clean_feature_names = [
    name.split('__', 1)[-1].replace(' ', '_')
    for name in feature_names
]

# Wrap the transformed array in a DataFrame aligned with the rows of df_fe.
X_processed_df = pd.DataFrame(
    X_processed,
    columns=clean_feature_names,
    index=df_fe.index
)

#### Create the final training dataset

In [9]:
# Processed features plus the target as the last column. `.values` hands over a
# bare array, so the target is attached by row position rather than index label.
train_df = X_processed_df.assign(**{target: y.values})

In [10]:
# Show the first five rows of the final training table.
train_df.head(n=5)

Unnamed: 0,study_hours,class_attendance,sleep_hours,study_attend_product,sleep_deficit,study_hours_sq,sleep_attend_product,sleep_quality,facility_rating,study_method_group_study,study_method_mixed,study_method_online_videos,study_method_self-study,exam_score
0,1.655875,1.538302,-1.245269,2.535425,1.245269,2.093542,-0.143196,1.0,0.0,0.0,0.0,1.0,0.0,78.3
1,0.401573,1.308814,-1.359895,0.919045,1.359895,0.148905,-0.356236,0.0,1.0,0.0,0.0,0.0,1.0,46.7
2,0.28716,1.182595,-0.729454,0.733247,0.729454,0.016075,0.149401,0.0,2.0,0.0,0.0,0.0,0.0,99.0
3,-0.848492,-1.290141,0.703367,-0.997639,-0.703367,-0.898495,-0.548004,1.0,2.0,1.0,0.0,0.0,0.0,63.9
4,1.545699,0.855575,1.448434,1.931198,-1.448434,1.886867,1.791175,2.0,2.0,0.0,0.0,0.0,1.0,100.0


In [11]:
import json

# Write the processed training table (features + target) without the index column.
train_df.to_csv('../data/processed/dataset_train.csv', index=False)

# Save the feature-group definitions and target name next to the dataset.
feature_config = dict(
    num_features=num_features,
    cat_onehot=cat_onehot,
    cat_ordinal=cat_ordinal,
    target=target,
)
with open('../data/processed/feature_config.json', 'w') as config_file:
    json.dump(feature_config, config_file, indent=2)