# Feature Creation

In [1]:
import pandas as pd
df=pd.DataFrame({
    "Study_Hours":[2,4,6,8,10],
    "Attendance":[60,70,80,90,95],
    "Maths":[40,55,65,75,80],
    "Science":[42,58,68,78,88]
})
df

Unnamed: 0,Study_Hours,Attendance,Maths,Science
0,2,60,40,42
1,4,70,55,58
2,6,80,65,68
3,8,90,75,78
4,10,95,80,88


# Domain-Based Feature

In [2]:
df["Total_Marks"]=df["Maths"]+df["Science"]
df["Total_Marks"]

0     82
1    113
2    133
3    153
4    168
Name: Total_Marks, dtype: int64

# Mathematical Features

In [3]:
df["Marks_per_Hour"]=df["Total_Marks"]/df["Study_Hours"]
df["Marks_per_Hour"]

0    41.000000
1    28.250000
2    22.166667
3    19.125000
4    16.800000
Name: Marks_per_Hour, dtype: float64

# Interaction Feature

In [4]:
df["Study_Attendance_Interaction"]=df["Study_Hours"]*df["Attendance"]
df

Unnamed: 0,Study_Hours,Attendance,Maths,Science,Total_Marks,Marks_per_Hour,Study_Attendance_Interaction
0,2,60,40,42,82,41.0,120
1,4,70,55,58,113,28.25,280
2,6,80,65,68,133,22.166667,480
3,8,90,75,78,153,19.125,720
4,10,95,80,88,168,16.8,950


# Polynomial Feature

In [5]:
df["Study_Hours_Squared"]=df["Study_Hours"]**2
df["Study_Hours_Squared"]

0      4
1     16
2     36
3     64
4    100
Name: Study_Hours_Squared, dtype: int64

In [6]:
df

Unnamed: 0,Study_Hours,Attendance,Maths,Science,Total_Marks,Marks_per_Hour,Study_Attendance_Interaction,Study_Hours_Squared
0,2,60,40,42,82,41.0,120,4
1,4,70,55,58,113,28.25,280,16
2,6,80,65,68,133,22.166667,480,36
3,8,90,75,78,153,19.125,720,64
4,10,95,80,88,168,16.8,950,100


In [7]:
df

Unnamed: 0,Study_Hours,Attendance,Maths,Science,Total_Marks,Marks_per_Hour,Study_Attendance_Interaction,Study_Hours_Squared
0,2,60,40,42,82,41.0,120,4
1,4,70,55,58,113,28.25,280,16
2,6,80,65,68,133,22.166667,480,36
3,8,90,75,78,153,19.125,720,64
4,10,95,80,88,168,16.8,950,100


In [8]:
X=df.drop("Science",axis=1)
y=df["Science"]

In [9]:
X

Unnamed: 0,Study_Hours,Attendance,Maths,Total_Marks,Marks_per_Hour,Study_Attendance_Interaction,Study_Hours_Squared
0,2,60,40,82,41.0,120,4
1,4,70,55,113,28.25,280,16
2,6,80,65,133,22.166667,480,36
3,8,90,75,153,19.125,720,64
4,10,95,80,168,16.8,950,100


In [10]:
y

0    42
1    58
2    68
3    78
4    88
Name: Science, dtype: int64

# Filter Methods (Stastistics-Based)
# Correlation Methods

In [11]:
df.corr()["Science"].sort_values(ascending=False)

Science                         1.000000
Total_Marks                     0.998981
Maths                           0.995467
Study_Hours                     0.994309
Attendance                      0.994110
Study_Attendance_Interaction    0.983804
Study_Hours_Squared             0.958102
Marks_per_Hour                 -0.968737
Name: Science, dtype: float64

# Selecting Important Features

In [13]:
corr=df.corr()["Science"].abs()

In [14]:
corr

Study_Hours                     0.994309
Attendance                      0.994110
Maths                           0.995467
Science                         1.000000
Total_Marks                     0.998981
Marks_per_Hour                  0.968737
Study_Attendance_Interaction    0.983804
Study_Hours_Squared             0.958102
Name: Science, dtype: float64

In [15]:
selected_features=corr[corr>0.8].index
selected_features

Index(['Study_Hours', 'Attendance', 'Maths', 'Science', 'Total_Marks',
       'Marks_per_Hour', 'Study_Attendance_Interaction',
       'Study_Hours_Squared'],
      dtype='object')