#Import neccesary libraries


In [2]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [3]:
#sample data

data= {
    'Age':[25,30,np.nan,45,50],
    'Gender':['Male','Female','Female','Male','Male'],
    'Salary':[50000,60000,55000,np.nan,75000],
    'Productivity':[10,8,9,7,11],
    'Text':['Hello','Hi','Hey','Hoi','Heiy'],
    'Religion':['religion1','relegion2','rerlegion3','religion3','relegion1']
}

#create a dataframe using the data set
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,50000.0,10,Hello,religion1
1,30.0,Female,60000.0,8,Hi,relegion2
2,,Female,55000.0,9,Hey,rerlegion3
3,45.0,Male,,7,Hoi,religion3
4,50.0,Male,75000.0,11,Heiy,relegion1


In [4]:
#Imputation

imputer = SimpleImputer(strategy = 'mean')
df['Age'] = imputer.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,50000.0,10,Hello,religion1
1,30.0,Female,60000.0,8,Hi,relegion2
2,37.5,Female,55000.0,9,Hey,rerlegion3
3,45.0,Male,,7,Hoi,religion3
4,50.0,Male,75000.0,11,Heiy,relegion1


In [5]:
#Normalization/ Scalling

scaler = MinMaxScaler()
df[['Salary','Productivity']] = scaler.fit_transform(df[['Salary','Productivity']])
df.head()

Unnamed: 0,Age,Gender,Salary,Productivity,Text,Religion
0,25.0,Male,0.0,0.75,Hello,religion1
1,30.0,Female,0.4,0.25,Hi,relegion2
2,37.5,Female,0.2,0.5,Hey,rerlegion3
3,45.0,Male,,0.0,Hoi,religion3
4,50.0,Male,1.0,1.0,Heiy,relegion1


In [7]:
#One-Hot Encoding

df = pd.get_dummies(df, columns = ['Gender'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male
0,25.0,0.0,0.75,Hello,religion1,False,True
1,30.0,0.4,0.25,Hi,relegion2,True,False
2,37.5,0.2,0.5,Hey,rerlegion3,True,False
3,45.0,,0.0,Hoi,religion3,False,True
4,50.0,1.0,1.0,Heiy,relegion1,False,True


In [9]:
#LABEL ENCODING
label_encoder = LabelEncoder()
df['Religion_Labelencoded'] = label_encoder.fit_transform(df['Religion'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_Labelencoded
0,25.0,0.0,0.75,Hello,religion1,False,True,2
1,30.0,0.4,0.25,Hi,relegion2,True,False,1
2,37.5,0.2,0.5,Hey,rerlegion3,True,False,4
3,45.0,,0.0,Hoi,religion3,False,True,3
4,50.0,1.0,1.0,Heiy,relegion1,False,True,0


In [11]:
#BINNING/DISCRETIZATION
df['Age_Bin'] = pd.cut(df['Age'], bins = 3, labels = ['Young','Middle-Aged','Old'])
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_Labelencoded,Age_Bin
0,25.0,0.0,0.75,Hello,religion1,False,True,2,Young
1,30.0,0.4,0.25,Hi,relegion2,True,False,1,Young
2,37.5,0.2,0.5,Hey,rerlegion3,True,False,4,Middle-Aged
3,45.0,,0.0,Hoi,religion3,False,True,3,Old
4,50.0,1.0,1.0,Heiy,relegion1,False,True,0,Old


In [12]:
#FEATURE INTERACTIONS
df['Salary_Productivity'] = df['Salary'] * df['Productivity']
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_Labelencoded,Age_Bin,Salary_Productivity
0,25.0,0.0,0.75,Hello,religion1,False,True,2,Young,0.0
1,30.0,0.4,0.25,Hi,relegion2,True,False,1,Young,0.1
2,37.5,0.2,0.5,Hey,rerlegion3,True,False,4,Middle-Aged,0.1
3,45.0,,0.0,Hoi,religion3,False,True,3,Old,
4,50.0,1.0,1.0,Heiy,relegion1,False,True,0,Old,1.0


In [14]:
#FILL THE MISSING VALUE OF THE SALARY COLOUMN
df['Salary'].fillna(0, inplace= True)
df['Salary_Productivity'].fillna(0, inplace = True)
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_Labelencoded,Age_Bin,Salary_Productivity
0,25.0,0.0,0.75,Hello,religion1,False,True,2,Young,0.0
1,30.0,0.4,0.25,Hi,relegion2,True,False,1,Young,0.1
2,37.5,0.2,0.5,Hey,rerlegion3,True,False,4,Middle-Aged,0.1
3,45.0,0.0,0.0,Hoi,religion3,False,True,3,Old,0.0
4,50.0,1.0,1.0,Heiy,relegion1,False,True,0,Old,1.0


In [19]:
#FEATURE SELECTION
selector = SelectKBest(score_func = f_regression, k=2)
X = df.drop(['Religion','Text','Age_Bin'], axis=1)
X_new = selector.fit_transform(X,df['Productivity'])
X_new

array([[0.75, 0.  ],
       [0.25, 0.1 ],
       [0.5 , 0.1 ],
       [0.  , 0.  ],
       [1.  , 1.  ]])

In [21]:
#DIMENSIONALITY REDUCTION
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
df['PCA1'] = principal_components[:, 0]
df['PCA2'] = principal_components[:, 1]
df.head()

Unnamed: 0,Age,Salary,Productivity,Text,Religion,Gender_Female,Gender_Male,Religion_Labelencoded,Age_Bin,Salary_Productivity,PCA1,PCA2
0,25.0,0.0,0.75,Hello,religion1,False,True,2,Young,0.0,-12.483189,-0.455256
1,30.0,0.4,0.25,Hi,relegion2,True,False,1,Young,0.1,-7.490425,-1.06347
2,37.5,0.2,0.5,Hey,rerlegion3,True,False,4,Middle-Aged,0.1,-0.076565,2.060047
3,45.0,0.0,0.0,Hoi,religion3,False,True,3,Old,0.0,7.465069,1.304399
4,50.0,1.0,1.0,Heiy,relegion1,False,True,0,Old,1.0,12.58511,-1.84572


In [22]:
#SPLIT DATA INTO TEST AND TRAIN DATA
X_train, X_test, y_train, y_test = train_test_split(X, df['Productivity'], test_size=0.5, random_state=42)

In [23]:
# MODEL TRAINING AND TESTING

model = LinearRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 1.0
Test Score: -4.278425065373542
