In [1]:
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Deserialize the previously saved model from the "model" directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
model_path = os.path.join("model", "linear_regression_model.pkl")
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)
    
# Score a single hand-written record with the loaded model.
sample_row = {
    'TESTER_ID': 'ADVT2000-10',
    'handler_id': 'NS8160-18',
    'product_no': 'UTIC*TT3920D',
    'QTY_IN': 10608,
    'QTY_OUT': 8234,
}
# A list holding one dict produces a one-row frame with the dict keys as columns.
new_data = pd.DataFrame([sample_row])
print(new_data)

prediction = model.predict(new_data)
print(prediction)

     TESTER_ID handler_id    product_no  QTY_IN  QTY_OUT
0  ADVT2000-10  NS8160-18  UTIC*TT3920D   10608     8234
[1891.7415]


In [None]:
# Record the notebook's start time so the total runtime can be reported later.
import time

# tstart holds the wall-clock start time in seconds since the epoch.
tstart = time.time()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Read the dataset from the CSV file in the working directory.
data = pd.read_csv('TMC_cleaning.csv')

# Model inputs are three identifier columns plus the two quantity columns;
# the value to predict is the UPH column.
feature_columns = ['TESTER_ID', 'handler_id', 'product_no', 'QTY_IN', 'QTY_OUT']
X = data[feature_columns]
y = data['UPH']

# Columns that hold identifiers and therefore need one-hot encoding.
categorical_features = ['TESTER_ID', 'handler_id', 'product_no']

# One-hot encode the categorical columns; handle_unknown='ignore' is set so
# categories not seen during fit do not raise at predict time.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Apply the encoder to the categorical columns only; every other column is
# passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

# Full estimator: preprocessing followed by a seeded random forest regressor.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then score the held-out rows
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the held-out mean squared error.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Sanity check: predict UPH for one hand-written record using the pipeline.
example_record = {
    'TESTER_ID': 'ADVT2000-10',
    'handler_id': 'NS8160-18',
    'product_no': 'UTIC*TT3920D',
    'QTY_IN': 10608,
    'QTY_OUT': 8234,
}
# A list holding one dict produces a one-row frame with the dict keys as columns.
new_data = pd.DataFrame([example_record])

predicted_uph = pipeline.predict(new_data)
print(f'Predicted UPH: {predicted_uph[0]}')

In [None]:
# Both modules are imported here so the cell does not depend on an import
# made in a different cell.
import os
import pickle

# Serialize the `pipeline` object to disk with pickle.
# Joining "" with the filename yields the bare filename, so the file is
# written relative to the current working directory.
model_path = os.path.join("", "linear_regression_model.pkl")
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

In [None]:
# Both modules are imported here so the cell does not depend on an import
# made in a different cell.
import os
import pickle

# Read the pickled model back from the current working directory
# (joining "" with the filename yields the bare filename).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
model_path = os.path.join("", "linear_regression_model.pkl")
with open(model_path, 'rb') as f:
    model = pickle.load(f)

In [None]:
# Score every row of X and store the result as a new column on `data`.
# X contains the rows the model was trained on as well as the held-out rows,
# so these predictions are not an out-of-sample estimate.
uph_predictions = model.predict(X)
data['UPH_predicted'] = uph_predictions

In [None]:
# Work on an independent copy of `data` from here on.
new_data = data.copy()

# Preview the first five rows (the default for head()) as plain text.
print(new_data.head(5))

# Optionally persist the copy to CSV, without writing the index column.
new_data.to_csv('data_with_predicted_uph.csv', index=False)

In [None]:
# Bare last expression: Jupyter displays new_data as this cell's output.
new_data

In [None]:
# Keep only rows where both the actual and the predicted UPH are below 60000.
below_cap = (new_data["UPH"] < 60000) & (new_data["UPH_predicted"] < 60000)
df2 = new_data[below_cap]

In [None]:
import seaborn as sns

# Joint kernel-density plot of actual vs. predicted UPH, coloured by TEST_CODE.
# NOTE(review): assumes df2 carries a TEST_CODE column; it is not one of the
# model's feature columns, so confirm it exists in the CSV.
g = sns.jointplot(data=df2, x="UPH", y="UPH_predicted", hue="TEST_CODE", kind="kde")

In [None]:
# Scatter of actual vs. predicted UPH for the filtered rows
# (uses the `sns` alias imported in the joint-plot cell).
sns.scatterplot(data=df2, x='UPH', y='UPH_predicted')

In [None]:
tstop = time.time()

# Elapsed wall-clock time since tstart, converted from seconds to hours.
elapsed_seconds = tstop - tstart
print(elapsed_seconds / 3600, "hrs")