In [16]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [17]:
chem = pd.read_csv(r"D:\CDAC\PML\Datasets\ChemicalProcess.csv")
chem.shape

(176, 58)

In [18]:
chem.columns

Index(['Yield', 'BiologicalMaterial01', 'BiologicalMaterial02',
       'BiologicalMaterial03', 'BiologicalMaterial04', 'BiologicalMaterial05',
       'BiologicalMaterial06', 'BiologicalMaterial07', 'BiologicalMaterial08',
       'BiologicalMaterial09', 'BiologicalMaterial10', 'BiologicalMaterial11',
       'BiologicalMaterial12', 'ManufacturingProcess01',
       'ManufacturingProcess02', 'ManufacturingProcess03',
       'ManufacturingProcess04', 'ManufacturingProcess05',
       'ManufacturingProcess06', 'ManufacturingProcess07',
       'ManufacturingProcess08', 'ManufacturingProcess09',
       'ManufacturingProcess10', 'ManufacturingProcess11',
       'ManufacturingProcess12', 'ManufacturingProcess13',
       'ManufacturingProcess14', 'ManufacturingProcess15',
       'ManufacturingProcess16', 'ManufacturingProcess17',
       'ManufacturingProcess18', 'ManufacturingProcess19',
       'ManufacturingProcess20', 'ManufacturingProcess21',
       'ManufacturingProcess22', 'ManufacturingProce

In [19]:
np.sum(chem.isnull())

Yield                      0
BiologicalMaterial01       0
BiologicalMaterial02       0
BiologicalMaterial03       0
BiologicalMaterial04       0
BiologicalMaterial05       0
BiologicalMaterial06       0
BiologicalMaterial07       0
BiologicalMaterial08       0
BiologicalMaterial09       0
BiologicalMaterial10       0
BiologicalMaterial11       0
BiologicalMaterial12       0
ManufacturingProcess01     1
ManufacturingProcess02     3
ManufacturingProcess03    15
ManufacturingProcess04     1
ManufacturingProcess05     1
ManufacturingProcess06     2
ManufacturingProcess07     1
ManufacturingProcess08     1
ManufacturingProcess09     0
ManufacturingProcess10     9
ManufacturingProcess11    10
ManufacturingProcess12     1
ManufacturingProcess13     0
ManufacturingProcess14     1
ManufacturingProcess15     0
ManufacturingProcess16     0
ManufacturingProcess17     0
ManufacturingProcess18     0
ManufacturingProcess19     0
ManufacturingProcess20     0
ManufacturingProcess21     0
ManufacturingP

### Mean Imputation

In [20]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Assuming 'chem' is your DataFrame with missing values
imp = SimpleImputer(strategy='mean')
imputed_array = imp.fit_transform(chem)
imputed_df = pd.DataFrame(imputed_array, columns=chem.columns)


In [21]:
X = imputed_df.drop('Yield', axis=1)
y = imputed_df['Yield']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=23)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((123, 57), (53, 57), (123,), (53,))

In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_test, ycap))

0.4071816827272374


In [23]:
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline([('POLY',poly), ('LR', lr)])
pipe.fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-13.931544327005039


In [24]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('POLY',poly), ('LR', lr)])
pipe.fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-13.31399396219208


### Median Imputation

In [26]:
# Assuming 'chem' is your DataFrame with missing values
imp = SimpleImputer(strategy='median')
imputed_array = imp.fit_transform(chem)
imputed_df = pd.DataFrame(imputed_array, columns=chem.columns)

In [28]:
X = imputed_df.drop('Yield', axis=1)
y = imputed_df['Yield']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=23)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((123, 57), (53, 57), (123,), (53,))

In [29]:
lr = LinearRegression()
lr.fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_test, ycap))

0.3947982597867259


In [30]:
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline([('POLY',poly), ('LR', lr)])
pipe.fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-13.252078970928933


In [31]:
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline([('POLY',poly), ('LR', lr)])
pipe.fit(X_train, y_train)
ycap = pipe.predict(X_test)
print(r2_score(y_test, ycap))

-12.38405245330541
