In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
import pickle

# Read the manufacturing samples from a CSV in the current working directory.
df = pd.read_csv("manufacturing_dataset_1000_samples.csv")

# Columns are grouped by how the preprocessor has to treat them:
# categorical columns get one-hot encoded, numeric columns are only imputed.
cat_feats = [
    'Shift',
    'Machine_Type',
    'Material_Grade',
    'Day_of_Week',
]
num_feats = [
    'Injection_Temperature',
    'Injection_Pressure',
    'Cycle_Time',
    'Cooling_Time',
    'Material_Viscosity',
    'Ambient_Temperature',
    'Machine_Age',
    'Operator_Experience',
    'Maintenance_Hours',
    'Temperature_Pressure_Ratio',
    'Total_Cycle_Time',
    'Efficiency_Score',
    'Machine_Utilization',
]

# Feature matrix (categorical columns first, then numeric) and regression target.
X = df[[*cat_feats, *num_feats]]
y = df['Parts_Per_Hour']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing: every branch imputes missing values before the model sees them.

# Numeric branch: replace NaN with the column mean.
num_transformer = SimpleImputer(strategy='mean')

# Categorical branch: fill NaN with the most frequent value, then one-hot
# encode while dropping the first category of each feature.
# NOTE(review): with the encoder's default settings a category that was not
# seen during fit raises at predict time -- confirm that is the desired
# behaviour for whatever consumes the saved model.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_feats),
    ('cat', cat_transformer, cat_feats),
])

# Pipeline: preprocessing followed by a plain linear regression, so the same
# transformations are applied at fit and at predict time.
pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('lr', LinearRegression()),
])

# Train: fit the whole pipeline (preprocessing + regressor) on the training split.
pipeline.fit(X_train, y_train)

# Save model: persist the fitted pipeline as a single pickle artifact.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving an open handle to the garbage collector.
with open("linear_pipeline.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

# Evaluate: coefficient of determination (R^2) on the held-out test split.
y_pred = pipeline.predict(X_test)
print("R2:", r2_score(y_test, y_pred))


R2: 0.905880008998877


2025-09-07 21:18:59.599 
  command:

    streamlit run C:\Users\Namtha\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-09-07 21:18:59.654 Session state does not function when running a script without `streamlit run`
