In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/Lijn 1 laaste 12 maanden.csv')

In [17]:
df.head()

Unnamed: 0,DateTime,EL04_Process_Recipe_Working.MESQuality.Density_Max,EL04_Process_Recipe_Working.MESQuality.Density_Min,EL04_Process_Recipe_Working.MESQuality.Density_Target,EL04_Dry_Feed_Rate_PID.OUT,EL04_Dry_Feed_Rate_PID.PV,EL04_Dry_Feed_Rate_PID.SP
0,11/1/2022 3:38:15 PM,472,408,440,40.103806,6995.532227,7000
1,11/1/2022 3:53:15 PM,488,424,456,40.182373,7022.82666,7000
2,11/1/2022 4:08:15 PM,488,424,456,39.94931,7024.14209,7000
3,11/1/2022 4:23:15 PM,488,424,456,40.241974,6956.40332,7000
4,11/1/2022 4:38:15 PM,488,424,456,40.27462,7009.181641,7000


In [18]:
features = [
    'EL04_Process_Recipe_Working.MESQuality.Density_Target',
    'EL04_Dry_Feed_Rate_PID.SP',
    'EL04_Dry_Feed_Rate_PID.PV'
]

X = sm.add_constant(df[features])  # Adding a constant for the intercept
y = df['EL04_Dry_Feed_Rate_PID.OUT']

# Data preprocessing
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df[features])  # Fix the DataFrame reference

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2023)

In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae

kf = KFold(n_splits=5,shuffle=True, random_state=2024)

#create a list to store validation scores for each fold

cv_lm_r2s = []
cv_lm_mae = []

#loop through each fold in x and y\
for train_ind, val_ind in kf.split(X,y):
    #Subset data based on CV folds
    X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
    X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]
    #fit the model on folds training data
    model = sm.OLS(y_train, X_train).fit()
    #append Validation score to list
    cv_lm_r2s.append(r2(y_val, model.predict(X_val),))
    cv_lm_mae.append(mae(y_val, model.predict(X_val),))
    
print("All validation r2: ",[round(x,3 ) for x in cv_lm_r2s])   
print(f"Cross Val R2s: {round(np.mean(cv_lm_r2s), 3)} +- {round(np.std(cv_lm_r2s), 3)} ")

print("All validation MAEs: ",[round(x,3 ) for x in cv_lm_mae])   
print(f"Cross Val MAEs: {round(np.mean(cv_lm_mae), 3)} +- {round(np.std(cv_lm_mae), 3)} ")
    

All validation r2:  [0.893, 0.897, 0.885, 0.893, 0.894]
Cross Val R2s: 0.892 +- 0.004 
All validation MAEs:  [2.583, 2.483, 2.582, 2.509, 2.453]
Cross Val MAEs: 2.522 +- 0.052 


In [20]:
model = sm.OLS(y,X).fit()

print(f"Test r2:{r2(y_test,model.predict(X_test))}")
print(f"Test r2:{mae(y_test,model.predict(X_test))}")

Test r2:0.8910155957525681
Test r2:2.4971119078304302


In [21]:
model.summary()

summary_str = str(model.summary())
print(summary_str)

                                OLS Regression Results                                
Dep. Variable:     EL04_Dry_Feed_Rate_PID.OUT   R-squared:                       0.892
Model:                                    OLS   Adj. R-squared:                  0.892
Method:                         Least Squares   F-statistic:                 3.787e+04
Date:                        Sat, 11 Nov 2023   Prob (F-statistic):               0.00
Time:                                23:25:22   Log-Likelihood:                -35111.
No. Observations:                       13714   AIC:                         7.023e+04
Df Residuals:                           13710   BIC:                         7.026e+04
Df Model:                                   3                                         
Covariance Type:                    nonrobust                                         
                                                            coef    std err          t      P>|t|      [0.025      0.975]
--------

In [22]:
from sklearn.metrics import r2_score as r2

r2(y_test, model.predict((X_test)))


0.8910155957525681