## Feature Creation

In [1]:
import pandas as pd
df = pd.DataFrame({
    "study hours": [2, 4, 6, 8, 10],
    "attendance": [60, 70, 80, 90, 95],
    "maths": [40, 55, 65, 75, 80],
    "science": [42, 58, 68, 78, 88]
})
df

Unnamed: 0,study hours,attendance,maths,science
0,2,60,40,42
1,4,70,55,58
2,6,80,65,68
3,8,90,75,78
4,10,95,80,88


## Domain-Based Features (Total Marks)

In [2]:
df["total-marks"] = df["maths"] + df["science"]
df

Unnamed: 0,study hours,attendance,maths,science,total-marks
0,2,60,40,42,82
1,4,70,55,58,113
2,6,80,65,68,133
3,8,90,75,78,153
4,10,95,80,88,168


## Mathematical Feature (Marks per Hour)

In [3]:
df["marks_per_hour"] = df["total-marks"] / df["study hours"]
df


Unnamed: 0,study hours,attendance,maths,science,total-marks,marks_per_hour
0,2,60,40,42,82,41.0
1,4,70,55,58,113,28.25
2,6,80,65,68,133,22.166667
3,8,90,75,78,153,19.125
4,10,95,80,88,168,16.8


## Interaction Feature

In [4]:
df["study_attendance_interaction"] = df["study hours"] + df["attendance"]
df


Unnamed: 0,study hours,attendance,maths,science,total-marks,marks_per_hour,study_attendance_interaction
0,2,60,40,42,82,41.0,62
1,4,70,55,58,113,28.25,74
2,6,80,65,68,133,22.166667,86
3,8,90,75,78,153,19.125,98
4,10,95,80,88,168,16.8,105


## Polynomial Feature

In [5]:
df["study_hour_squared"] = df["study hours"] ** 2
df


Unnamed: 0,study hours,attendance,maths,science,total-marks,marks_per_hour,study_attendance_interaction,study_hour_squared
0,2,60,40,42,82,41.0,62,4
1,4,70,55,58,113,28.25,74,16
2,6,80,65,68,133,22.166667,86,36
3,8,90,75,78,153,19.125,98,64
4,10,95,80,88,168,16.8,105,100


## Feature Selection (X and y)

In [6]:
x = df.drop("science", axis=1)
y = df["science"]

x


Unnamed: 0,study hours,attendance,maths,total-marks,marks_per_hour,study_attendance_interaction,study_hour_squared
0,2,60,40,82,41.0,62,4
1,4,70,55,113,28.25,74,16
2,6,80,65,133,22.166667,86,36
3,8,90,75,153,19.125,98,64
4,10,95,80,168,16.8,105,100


In [7]:
y

0    42
1    58
2    68
3    78
4    88
Name: science, dtype: int64

## Filter Method (Correlation â€“ Statistical Based)

In [8]:
df.corr()["science"].sort_values(ascending=False)


science                         1.000000
total-marks                     0.998981
maths                           0.995467
study_attendance_interaction    0.995048
study hours                     0.994309
attendance                      0.994110
study_hour_squared              0.958102
marks_per_hour                 -0.968737
Name: science, dtype: float64

## Selecting Important Features (Absolute Correlation)

In [9]:
corr = df.corr()["science"].abs()   # converts negative values to positive
corr


study hours                     0.994309
attendance                      0.994110
maths                           0.995467
science                         1.000000
total-marks                     0.998981
marks_per_hour                  0.968737
study_attendance_interaction    0.995048
study_hour_squared              0.958102
Name: science, dtype: float64