# FEATURE ENGINEERING

In [9]:
from pathlib import Path

import pandas as pd

# Load the interim cleaned dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='../data/interim/data_cleaned.csv')

#### Drop irrelevant columns

In [10]:
# Columns removed before feature engineering; every other column of `df`
# is carried over unchanged.
drop_cols = ['age', 'gender', 'internet_access', 'course', 'exam_difficulty']

# `drop` returns a new frame, so the loaded `df` itself is not modified.
df_fe = df.drop(drop_cols, axis='columns')


#### Define target and feature groups

In [11]:
# Name of the column to be predicted.
target = 'exam_score'

# Numeric inputs (passed to the scaler in the preprocessing step).
num_features = ['study_hours', 'class_attendance', 'sleep_hours']

# Categorical inputs that get one-hot encoded.
cat_onehot = ['study_method']

# Categorical inputs that get ordinal (rank) encoded; the order of this list
# must match the order of the category lists given to the ordinal encoder.
cat_ordinal = ['sleep_quality', 'facility_rating']


## Feature Encoding (categorical) and Scaling (numerical)

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Explicit low-to-high label orderings, so the ordinal codes (0, 1, 2) follow
# the ranking written here instead of an order inferred from the data.
sleep_quality_order = ['poor', 'average', 'good']
facility_rating_order = ['low', 'medium', 'high']

# Column-wise preprocessing:
#   num    -> standardise the numeric features
#   ord    -> integer-encode the ranked categoricals; the `categories` lists
#             are positional and must stay in the same order as `cat_ordinal`
#   onehot -> dummy-encode the nominal categoricals, dropping the first level
# Columns not listed in any transformer are discarded (remainder='drop').
#
# NOTE(review): with drop='first', handle_unknown='ignore' encodes a category
# unseen during fit as all zeros, i.e. identically to the dropped baseline
# level -- confirm this is the intended behaviour for new data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('ord', OrdinalEncoder(
            categories=[sleep_quality_order, facility_rating_order]
        ), cat_ordinal),
        ('onehot', OneHotEncoder(
            drop='first',
            handle_unknown='ignore'
        ), cat_onehot)
    ],
    remainder='drop',
    # Always return a dense array. The one-hot encoder produces sparse output,
    # and with the default threshold the stacked result becomes a sparse
    # matrix whenever its density is low, which the later
    # `pd.DataFrame(X_processed, columns=...)` step does not expect.
    sparse_threshold=0
)


#### Apply Feature Engineering (Fit & Transform)

In [36]:
# Feature matrix: only the columns that are routed through the preprocessor.
X = df_fe.loc[:, num_features + cat_onehot + cat_ordinal]

# Target vector, left unscaled.
y = df_fe.loc[:, target]

# Learn the scaling / encoding parameters from X and apply them in one call.
# NOTE(review): the preprocessor is fitted on every row of `df_fe`; if a
# validation or test split is later taken from this same data, fit on the
# training portion only to avoid leakage.
X_processed = preprocessor.fit_transform(X)

In [27]:
# Output column names reported by the fitted transformer. Each name carries
# its transformer's prefix ("num__", "ord__" or "onehot__").
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed array in a DataFrame that keeps the source row labels.
X_processed_df = pd.DataFrame(
    X_processed,
    columns=feature_names,
    index=df_fe.index
)

# Tidy the column names:
#   1. strip the transformer prefix -- anchored at the start of the name so a
#      feature or category that merely contains e.g. "num__" is left intact;
#   2. replace spaces (coming from category labels) with underscores.
# `regex` is passed explicitly so the result does not depend on the pandas
# version's default for `str.replace`.
X_processed_df.columns = (
    X_processed_df.columns
    .str.replace(r'^(?:num|ord|onehot)__', '', regex=True)
    .str.replace(' ', '_', regex=False)
)

#### Create final train dataset

In [38]:
# Build the final table as a new frame: processed features plus the raw
# target as the last column. `.values` bypasses index alignment and assigns
# the target positionally, row for row.
train_df = X_processed_df.assign(**{target: y.values})

In [39]:
# Preview the first five rows of the assembled table.
train_df.head(n=5)

Unnamed: 0,study_hours,class_attendance,sleep_hours,sleep_quality,facility_rating,study_method_group_study,study_method_mixed,study_method_online_videos,study_method_self-study,exam_score
0,1.655875,1.538302,-1.245269,1.0,0.0,0.0,0.0,1.0,0.0,78.3
1,0.401573,1.308814,-1.359895,0.0,1.0,0.0,0.0,0.0,1.0,46.7
2,0.28716,1.182595,-0.729454,0.0,2.0,0.0,0.0,0.0,0.0,99.0
3,-0.848492,-1.290141,0.703367,1.0,2.0,1.0,0.0,0.0,0.0,63.9
4,1.545699,0.855575,1.448434,2.0,2.0,0.0,0.0,0.0,1.0,100.0


In [44]:
# Persist the processed table without the index column.
# The parent directory is created first so the export does not fail with an
# OSError when `../data/processed` does not exist yet (e.g. a fresh checkout).
output_path = Path('../data/processed/dataset_train.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
train_df.to_csv(output_path, index=False)