### Feature Engineering Techniques
- Imputation: Filling missing values with the mean.
- Normalization/Scaling: Scaling numerical features using Min-Max scaling.
- One-Hot Encoding: Encoding categorical variables.
- Label Encoding: Encoding categorical variables into numerical format.
- Binning/Discretization: Grouping numerical variables into bins.
- Feature Interactions: Creating new features from existing ones.
- Feature Selection: Selecting the best features using SelectKBest.
- Dimensionality Reduction: Reducing dimensionality using PCA.

In [33]:
# Import necessary library
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [8]:

# Sample data
data = {
    'Age': [25, 30, np.nan, 35, 40],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male'],
    'Salary': [50000, 60000, 55000, np.nan, 70000],
    'Productivity': [10, 8, 9, 7, 11],
    'Text': ['hello world', 'python is great', 'machine learning', 'data science', 'feature engineering'],
    'Religion':["religion1","religion2", "religion3", "religion3", "religion1"]
}

# Create a dataframe using the data
df = pd.DataFrame(data)
df.head()


Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,50000.0,10,hello world,religion1
1,30.0,Female,60000.0,8,python is great,religion2
2,,Female,55000.0,9,machine learning,religion3
3,35.0,Male,,7,data science,religion3
4,40.0,Male,70000.0,11,feature engineering,religion1


In [9]:
# Imputation
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,50000.0,10,hello world,religion1
1,30.0,Female,60000.0,8,python is great,religion2
2,32.5,Female,55000.0,9,machine learning,religion3
3,35.0,Male,,7,data science,religion3
4,40.0,Male,70000.0,11,feature engineering,religion1


In [10]:
# Normalization/Scaling
scaler = MinMaxScaler()
df[['Salary', 'Productivity']] = scaler.fit_transform(df[['Salary', 'Productivity']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,0.0,0.75,hello world,religion1
1,30.0,Female,0.5,0.25,python is great,religion2
2,32.5,Female,0.25,0.5,machine learning,religion3
3,35.0,Male,,0.0,data science,religion3
4,40.0,Male,1.0,1.0,feature engineering,religion1


In [11]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=['Gender'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male
0,25.0,0.0,0.75,hello world,religion1,False,True
1,30.0,0.5,0.25,python is great,religion2,True,False
2,32.5,0.25,0.5,machine learning,religion3,True,False
3,35.0,,0.0,data science,religion3,False,True
4,40.0,1.0,1.0,feature engineering,religion1,False,True


In [12]:
# Label Encoding
label_encoder = LabelEncoder()
df['Religion_LabelEncoded'] = label_encoder.fit_transform(df['Religion'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded
0,25.0,0.0,0.75,hello world,religion1,False,True,0
1,30.0,0.5,0.25,python is great,religion2,True,False,1
2,32.5,0.25,0.5,machine learning,religion3,True,False,2
3,35.0,,0.0,data science,religion3,False,True,2
4,40.0,1.0,1.0,feature engineering,religion1,False,True,0


In [13]:
# Binning/Discretization
df['Age_Bin'] = pd.cut(df['Age'], bins=3, labels=['Young', 'Middle-aged', 'Old'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin
0,25.0,0.0,0.75,hello world,religion1,False,True,0,Young
1,30.0,0.5,0.25,python is great,religion2,True,False,1,Young
2,32.5,0.25,0.5,machine learning,religion3,True,False,2,Middle-aged
3,35.0,,0.0,data science,religion3,False,True,2,Middle-aged
4,40.0,1.0,1.0,feature engineering,religion1,False,True,0,Old


In [19]:
# Feature Interactions
df['Salary_Productivity'] = df['Salary'] * df['Productivity']
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin,Salary_Productivity
0,25.0,0.0,0.75,hello world,religion1,False,True,0,Young,0.0
1,30.0,0.5,0.25,python is great,religion2,True,False,1,Young,0.125
2,32.5,0.25,0.5,machine learning,religion3,True,False,2,Middle-aged,0.125
3,35.0,,0.0,data science,religion3,False,True,2,Middle-aged,
4,40.0,1.0,1.0,feature engineering,religion1,False,True,0,Old,1.0


In [21]:
# Fill the missing value of the salary column
df['Salary'].fillna(0, inplace=True)
df['Salary_Productivity'].fillna(0, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary_Productivity'].fillna(0, inplace=True)


Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin,Salary_Productivity
0,25.0,0.0,0.75,hello world,religion1,False,True,0,Young,0.0
1,30.0,0.5,0.25,python is great,religion2,True,False,1,Young,0.125
2,32.5,0.25,0.5,machine learning,religion3,True,False,2,Middle-aged,0.125
3,35.0,0.0,0.0,data science,religion3,False,True,2,Middle-aged,0.0
4,40.0,1.0,1.0,feature engineering,religion1,False,True,0,Old,1.0


In [24]:

# Feature Selection
selector = SelectKBest(score_func=f_regression, k=2)
X = df.drop(['Religion', 'Text', 'Age_Bin'], axis=1)  # dropping non-numeric columns for feature selection
X_new = selector.fit_transform(X, df['Productivity'])
X_new

array([[0.75, 0.  ],
       [0.25, 1.  ],
       [0.5 , 2.  ],
       [0.  , 2.  ],
       [1.  , 0.  ]])

In [25]:

# Dimensionality Reduction
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
df['PCA1'] = principal_components[:, 0]
df['PCA2'] = principal_components[:, 1]
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_LabelEncoded,Age_Bin,Salary_Productivity,PCA1,PCA2
0,25.0,0.0,0.75,hello world,religion1,False,True,0,Young,0.0,-7.505865,1.161722
1,30.0,0.5,0.25,python is great,religion2,True,False,1,Young,0.125,-2.51753,-0.374557
2,32.5,0.25,0.5,machine learning,religion3,True,False,2,Middle-aged,0.125,-0.017055,-1.222036
3,35.0,0.0,0.0,data science,religion3,False,True,2,Middle-aged,0.0,2.489574,-0.883621
4,40.0,1.0,1.0,feature engineering,religion1,False,True,0,Old,1.0,7.550876,1.318492


In [31]:

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Productivity'], test_size=0.5, random_state=42)


In [35]:

# Model training and testing
model = LinearRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)


Train Score: 1.0
Test Score: -5.415038241923456
