In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split 
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np 
import os 

# Change directory to your dataset location (if necessary)
os.chdir(r"D:\Datasets")

# Read the CSV data
chem = pd.read_csv("ChemicalProcess.csv")

# Optional: Print missing value counts
# print(chem.isnull().sum()) 

# Separate features (X) and target variable (y)
X = chem.drop('Yield', axis=1)
y = chem['Yield']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                   test_size=0.3,
                                   random_state=24)

# Create the imputer
imp = SimpleImputer(strategy='mean').set_output(transform='pandas')  
#The set_output method in scikit-learn is used to configure the output data 
#container of transformers to be a pandas DataFrame, instead of the default NumPy array.

# Create a model with linear regression 
X_trn_imp = imp.fit_transform(X_train)
lr = LinearRegression()
lr.fit(X_trn_imp, y_train)

# Test set Operations
X_tst_imp = imp.transform(X_test)
y_pred = lr.predict(X_tst_imp)
print('R-squared score (Linear Regression)',r2_score(y_test, y_pred))


R-squared score (Linear Regression) 0.23930185859422692


In [21]:

#another way to solve this problem with Pipeline  (Linear_Regression_with_Polynomial_Pipeline)
poly = PolynomialFeatures(degree = 2)
pipe =Pipeline([('IMP',imp),('POLY',imp),('LR',lr)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('R-squared score (Linear Regression) using Pipeline: ',r2_score(y_test, y_pred))

R-squared score (Linear Regression) using Pipeline:  0.23930185859422692
