In [46]:
import plotly.express as px
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [47]:
df = pd.read_excel('./dataset/RSW-dataset.xlsx')

In [49]:
df.head()

Unnamed: 0,NUGGET.WIDTH.1,MATERIAL.1,THICKNESS.1,COATING.EG.1,COATING.HDG.1,COATING.WEIGHT.1,SURFACE.CLASS.1,MATERIAL.2,THICKNESS.2,COATING.EG.2,COATING.HDG.2,COATING.WEIGHT.2,SURFACE.CLASS.2,WELD.FORCE,WELD.CURRENT,WELD.TIME
0,3.6,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,0,3,674,7.3,720.0
1,4.4,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,0,3,674,7.3,720.0
2,4.6,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,0,3,674,7.3,720.0
3,4.1,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,0,3,674,7.6,720.0
4,5.1,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,0,3,674,7.6,720.0


In [50]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [51]:
print('train_df.shape:', train_df.shape)
print('test_df.shape:', test_df.shape)

train_df.shape: (900, 16)
test_df.shape: (226, 16)


In [52]:
input_cols = list(train_df.columns)[1:]
target_col = 'NUGGET.WIDTH.1'

In [53]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [54]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [55]:
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

In [56]:
train_inputs[numeric_cols].describe()

Unnamed: 0,THICKNESS.1,COATING.EG.1,COATING.HDG.1,COATING.WEIGHT.1,SURFACE.CLASS.1,THICKNESS.2,COATING.EG.2,COATING.HDG.2,COATING.WEIGHT.2,SURFACE.CLASS.2,WELD.FORCE,WELD.CURRENT,WELD.TIME
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,1.197944,0.008889,0.567778,40.195556,1.864444,1.348167,0.0,0.567778,40.195556,1.864444,884.811111,7.960633,357.078822
std,0.16121,0.093913,0.49566,36.312045,0.991321,0.315293,0.0,0.49566,36.312045,0.991321,123.369869,1.738287,161.560156
min,0.8,0.0,0.0,0.0,1.0,0.8,0.0,0.0,0.0,1.0,499.0,4.2,166.7
25%,1.2,0.0,0.0,0.0,1.0,1.2,0.0,0.0,0.0,1.0,899.0,6.7,250.05
50%,1.2,0.0,1.0,60.0,1.0,1.2,0.0,1.0,60.0,1.0,910.0,8.1,300.06
75%,1.2,0.0,1.0,70.0,3.0,1.4,0.0,1.0,70.0,3.0,910.0,9.1,400.0
max,1.5,1.0,1.0,94.0,3.0,2.0,0.0,1.0,94.0,3.0,1124.0,13.5,800.0


In [57]:
train_inputs[categorical_cols].nunique()

MATERIAL.1    5
MATERIAL.2    5
dtype: int64

In [58]:
df[categorical_cols].nunique()

MATERIAL.1    5
MATERIAL.2    5
dtype: int64

In [59]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [60]:
encoder.fit(df[categorical_cols])

In [61]:
encoder.categories_

[array(['Material_A', 'Material_B', 'Material_C', 'Material_D',
        'Material_E'], dtype=object),
 array(['Material_A', 'Material_B', 'Material_C', 'Material_D',
        'Material_E'], dtype=object)]

In [62]:
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
print(encoded_cols)

['MATERIAL.1_Material_A', 'MATERIAL.1_Material_B', 'MATERIAL.1_Material_C', 'MATERIAL.1_Material_D', 'MATERIAL.1_Material_E', 'MATERIAL.2_Material_A', 'MATERIAL.2_Material_B', 'MATERIAL.2_Material_C', 'MATERIAL.2_Material_D', 'MATERIAL.2_Material_E']


In [63]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [64]:
train_inputs

Unnamed: 0,MATERIAL.1,THICKNESS.1,COATING.EG.1,COATING.HDG.1,COATING.WEIGHT.1,SURFACE.CLASS.1,MATERIAL.2,THICKNESS.2,COATING.EG.2,COATING.HDG.2,...,MATERIAL.1_Material_A,MATERIAL.1_Material_B,MATERIAL.1_Material_C,MATERIAL.1_Material_D,MATERIAL.1_Material_E,MATERIAL.2_Material_A,MATERIAL.2_Material_B,MATERIAL.2_Material_C,MATERIAL.2_Material_D,MATERIAL.2_Material_E
875,Material_C,1.2,0,0,0,3,Material_C,1.2,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
712,Material_C,1.2,0,0,0,3,Material_C,2.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
554,Material_C,1.2,0,0,0,3,Material_C,2.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
688,Material_C,1.2,0,0,0,3,Material_C,2.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
780,Material_C,1.2,0,0,0,3,Material_C,1.2,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,Material_B,1.2,0,1,70,1,Material_B,1.2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
121,Material_A,1.4,0,1,70,1,Material_A,1.4,0,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1044,Material_D,1.2,0,1,94,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1095,Material_D,1.2,0,1,94,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [65]:
test_inputs

Unnamed: 0,MATERIAL.1,THICKNESS.1,COATING.EG.1,COATING.HDG.1,COATING.WEIGHT.1,SURFACE.CLASS.1,MATERIAL.2,THICKNESS.2,COATING.EG.2,COATING.HDG.2,...,MATERIAL.1_Material_A,MATERIAL.1_Material_B,MATERIAL.1_Material_C,MATERIAL.1_Material_D,MATERIAL.1_Material_E,MATERIAL.2_Material_A,MATERIAL.2_Material_B,MATERIAL.2_Material_C,MATERIAL.2_Material_D,MATERIAL.2_Material_E
1090,Material_D,1.2,0,1,94,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1083,Material_D,1.2,0,1,94,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
739,Material_C,1.2,0,0,0,3,Material_C,1.2,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
140,Material_A,1.4,0,1,70,1,Material_A,1.4,0,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1018,Material_D,1.2,0,1,70,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,Material_B,0.8,1,1,50,1,Material_B,0.8,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
429,Material_B,1.2,0,1,70,1,Material_B,1.2,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12,Material_E,0.8,0,0,0,3,Material_E,0.8,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1035,Material_D,1.2,0,1,94,1,Material_D,1.2,0,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [66]:
print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)

train_inputs: (900, 25)
train_targets: (900,)
test_inputs: (226, 25)
test_targets: (226,)


In [67]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [68]:
def evalute(targets, predictions,  name=''):
    mae = mean_absolute_error(targets, predictions)
    mse = mean_squared_error(targets, predictions)
    r2 =  r2_score(targets, predictions)
    
    print(name)
    print("mae:", mae)
    print("mse:", mse)
    print("r2=", r2)

# Linear Regression


In [69]:
lr_model = LinearRegression()
lr_model.fit(X_train, train_targets)

In [70]:
train_preds = lr_model.predict(X_train)
evalute(train_targets, train_preds, "Training")

Training
mae: 1.3636699000091175
mse: 2.9151529558501537
r2= 0.539099220564379


In [36]:
test_preds = lr_model.predict(X_test)
evalute(test_targets, test_preds, "Testing")

Testing
mae: 1.2323923886142745
mse: 2.493733735880014
r2= 0.5556668090084464


# K-Nearest Neighbors (KNN) 

In [71]:
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, train_targets)

In [72]:
train_preds = knn_model.predict(X_train)
evalute(train_targets, train_preds, "Training")

Training
mae: 1.0081555555555555
mse: 2.1207871111111114
r2= 0.6646925745125893


In [38]:
test_preds = knn_model.predict(X_test)
evalute(test_targets, test_preds, "Testing")

Testing
mae: 1.1983185840707966
mse: 2.993633628318584
r2= 0.46659470351953425


# Random Forest​

In [73]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, train_targets)

In [75]:
train_preds = rf_model.predict(X_train)
evalute(train_targets, train_preds, "Training")

Training
mae: 0.8269864199010866
mse: 1.5586086599955864
r2= 0.7535758990671636


In [76]:
test_preds = rf_model.predict(X_test)
evalute(test_targets, test_preds, "Testing")

Testing
mae: 1.3673557551531552
mse: 4.178042521863546
r2= 0.25555686273662803
