In [71]:
%load_ext cuml.accel

The cuml.accel extension is already loaded. To reload it, use:
  %reload_ext cuml.accel


In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

Load the dataset

In [73]:
df = pd.read_csv('vegemite.csv')

Shuffle the dataset

In [74]:
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)


Check class distribution

In [75]:
target_col = df_shuffled.columns[-1]
print(f"Class distribution in full dataset:")
print(df_shuffled[target_col].value_counts().sort_index())

Class distribution in full dataset:
Class
0    2642
1    5047
2    7548
Name: count, dtype: int64


Extract balanced test set: at least 300 samples from each class

In [76]:
test_samples = []
train_samples = []

for class_label in sorted(df_shuffled[target_col].unique()):
    class_data = df_shuffled[df_shuffled[target_col] == class_label]

    n_test = min(334, len(class_data))

    class_train, class_test = train_test_split(
        class_data,
        test_size=n_test,
        random_state=42
    )

    test_samples.append(class_test)
    train_samples.append(class_train)

df_test = pd.concat(test_samples, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
df_train = pd.concat(train_samples, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Test set shape: {df_test.shape}")
print(f"Test set class distribution:\n{df_test[target_col].value_counts().sort_index()}")
print(f"\nTraining set shape: {df_train.shape}")
print(f"Training set class distribution:\n{df_train[target_col].value_counts().sort_index()}")

Test set shape: (1002, 47)
Test set class distribution:
Class
0    334
1    334
2    334
Name: count, dtype: int64

Training set shape: (14235, 47)
Training set class distribution:
Class
0    2308
1    4713
2    7214
Name: count, dtype: int64


In [77]:
df_test.to_csv('vegemite_test_1000.csv', index=False)

Check for constant value columns

In [78]:
constant_cols = []
for col in df_train.columns[:-1]:
    if df_train[col].nunique() == 1:
        constant_cols.append(col)

if constant_cols:
    print(f"Constant value columns found: {constant_cols}")
    df_train = df_train.drop(columns=constant_cols)
    print(f"Removed {len(constant_cols)} constant columns")
else:
    print("No constant value columns found")

Constant value columns found: ['TFE Steam temperature SP', 'TFE Product out temperature']
Removed 2 constant columns


Check for columns with few integer values (categorical)

In [79]:
categorical_cols = []
for col in df_train.columns[:-1]:
    if df_train[col].dtype in ['int64', 'float64']:
        unique_vals = df_train[col].nunique()
        if unique_vals <= 10:
            categorical_cols.append(col)
            df_train[col] = df_train[col].astype('category')

if categorical_cols:
    print(f"Columns converted to categorical: {categorical_cols}")
else:
    print("No columns need categorical conversion")

Columns converted to categorical: ['FFTE Feed tank level SP', 'FFTE Pump 1', 'FFTE Pump 1 - 2', 'FFTE Pump 2', 'TFE Motor speed']


Check class balance

In [80]:
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

print(f"Class distribution before balancing:")
print(y.value_counts().sort_index())
print(f"Class balance ratio: {y.value_counts().min() / y.value_counts().max():.2f}")

if y.value_counts().min() / y.value_counts().max() < 0.8:
    print("Applying SMOTE and undersampling...")
    cat_features = X.select_dtypes(include=['category']).columns
    for col in cat_features:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    print(f"Class distribution after SMOTE:")
    print(pd.Series(y_balanced).value_counts().sort_index())
else:
    X_balanced, y_balanced = X, y
    print("Classes are already balanced")

Class distribution before balancing:
Class
0    2308
1    4713
2    7214
Name: count, dtype: int64
Class balance ratio: 0.32
Applying SMOTE and undersampling...


Class distribution after SMOTE:
Class
0    7214
1    7214
2    7214
Name: count, dtype: int64


Create composite features

In [81]:
composite_features = pd.DataFrame()

sp_cols = [col for col in X_balanced.columns if 'SP' in col.upper()]
pv_cols = [col for col in X_balanced.columns if 'PV' in col.upper() or ('SP' not in col.upper())]

print(f"Found {len(sp_cols)} SP (setpoint) columns")
print(f"Found {len(pv_cols)} PV (process variable) columns")

numeric_cols = X_balanced.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 1:
    composite_features['mean_all'] = X_balanced[numeric_cols].mean(axis=1)
    composite_features['std_all'] = X_balanced[numeric_cols].std(axis=1)

    sp_numeric = [col for col in sp_cols if col in numeric_cols]
    if len(sp_numeric) > 1:
        composite_features['mean_sp'] = X_balanced[sp_numeric].mean(axis=1)
        composite_features['std_sp'] = X_balanced[sp_numeric].std(axis=1)

    print(f"Created {len(composite_features.columns)} composite features")

X_final = pd.concat([X_balanced.reset_index(drop=True), composite_features], axis=1)

Found 10 SP (setpoint) columns
Found 34 PV (process variable) columns
Created 4 composite features


Final feature count

In [82]:
print(f"Total features in final dataset: {X_final.shape[1]}")
print(f"Original features: {X_balanced.shape[1]}")
print(f"Composite features: {composite_features.shape[1]}")

Total features in final dataset: 48
Original features: 44
Composite features: 4


In [None]:
df_train_processed = pd.concat([X_final, pd.Series(y_balanced, name=target_col)], axis=1)
df_train_processed.to_csv('vegemite_train_processed.csv', index=False)

Processed training data saved as 'vegemite_train_processed.csv'


Feature selection

In [84]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_final)

selector = SelectKBest(score_func=f_classif, k=min(20, X_final.shape[1]))
X_selected = selector.fit_transform(X_scaled, y_balanced)

selected_mask = selector.get_support()
selected_features = X_final.columns[selected_mask].tolist()

print(f"Selected {len(selected_features)} features using SelectKBest (f_classif)")
print(f"Selected features: {selected_features[:10]}...")
print(f"Justification: Reduced features from {X_final.shape[1]} to {len(selected_features)}")
print(f"This reduces overfitting and improves model generalization")

Selected 20 features using SelectKBest (f_classif)
Selected features: ['FFTE Feed tank level SP', 'FFTE Production solids SP', 'TFE Out flow SP', 'TFE Vacuum pressure SP', 'FFTE Feed flow SP', 'FFTE Discharge density', 'FFTE Feed flow rate PV', 'FFTE Heat temperature 1', 'FFTE Temperature 1 - 1', 'FFTE Temperature 1 - 2']...
Justification: Reduced features from 48 to 20
This reduces overfitting and improves model generalization


Split data for training and validation

In [85]:
X_train, X_val, y_train, y_val = train_test_split(
    X_selected, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

print(f"\nTraining set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")


Training set: (17313, 20)
Validation set: (4329, 20)


Train multiple ML models

In [86]:
models = {
    'DecisionTree': DecisionTreeClassifier(max_depth=10, min_samples_split=20, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'SVM': SVC(kernel='rbf', C=1.0, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

trained_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    trained_models[name] = model
    print(f"{name} trained successfully")

Training DecisionTree...
DecisionTree trained successfully
Training RandomForest...
RandomForest trained successfully
Training GradientBoosting...
GradientBoosting trained successfully
Training SVM...
SVM trained successfully
Training LogisticRegression...
LogisticRegression trained successfully
Training KNN...
KNN trained successfully


Evaluate models and create comparison table

In [87]:
results = []
for name, model in trained_models.items():
    y_pred = model.predict(X_val)

    print(f"Model: {name}")

    print("Classification Report:")
    print(classification_report(y_val, y_pred))

    print("Confusion Matrix:")
    cm = confusion_matrix(y_val, y_pred)
    print(f"{cm}\n\n")

    acc = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)

    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

comparison_df = pd.DataFrame(results)
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)
print(comparison_df.to_string(index=False))
comparison_df.to_csv('model_comparison.csv', index=False)

Model: DecisionTree
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1443
           1       0.83      0.81      0.82      1443
           2       0.87      0.85      0.86      1443

    accuracy                           0.85      4329
   macro avg       0.85      0.85      0.85      4329
weighted avg       0.85      0.85      0.85      4329

Confusion Matrix:
[[1277   88   78]
 [ 171 1162  110]
 [  69  152 1222]]


Model: RandomForest
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      1443
           1       0.97      0.95      0.96      1443
           2       0.98      0.96      0.97      1443

    accuracy                           0.96      4329
   macro avg       0.96      0.96      0.96      4329
weighted avg       0.96      0.96      0.96      4329

Confusion Matrix:
[[1426   14    3]
 [  55 1366   22]
 [  28   33 1382]]


Mo

Select best model

In [88]:
best_model_name = comparison_df.iloc[0]['Model']
best_model = trained_models[best_model_name]

print(f"Best performing model: {best_model_name}")
print(f"Justification: Highest F1-Score ({comparison_df.iloc[0]['F1-Score']:.4f})")
print(f"This model provides the best balance between precision and recall")

Best performing model: GradientBoosting
Justification: Highest F1-Score (0.9778)
This model provides the best balance between precision and recall


Save the best model

In [89]:
joblib.dump(best_model, 'best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(selector, 'feature_selector.pkl')
joblib.dump(selected_features, 'selected_features.pkl')

['selected_features.pkl']

ML TO AI

In [90]:
df_test_load = pd.read_csv('vegemite_test_1000.csv')
print(f"Loaded test set: {df_test_load.shape}")

loaded_model = joblib.load('best_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
loaded_selector = joblib.load('feature_selector.pkl')
print("Model and preprocessors loaded successfully")

Loaded test set: (1002, 47)
Model and preprocessors loaded successfully


Process each row and predict

In [91]:
predictions = []
actuals = []

for idx, row in df_test_load.iterrows():
    X_row = row[:-1].values.reshape(1, -1)
    y_actual = row.iloc[-1]

    if constant_cols:
        X_row = np.delete(X_row, [df_test_load.columns.get_loc(c) for c in constant_cols if c in df_test_load.columns], axis=1)

    X_row_df = pd.DataFrame(X_row, columns=X_final.columns[:X_row.shape[1]])

    if len(composite_features.columns) > 0:
        composite_row = pd.DataFrame()
        numeric_cols_test = X_row_df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols_test) > 1:
            composite_row['mean_all'] = [X_row_df[numeric_cols_test].mean(axis=1).values[0]]
            composite_row['std_all'] = [X_row_df[numeric_cols_test].std(axis=1).values[0]]

            sp_numeric_test = [col for col in sp_numeric if col in numeric_cols_test]
            if len(sp_numeric_test) > 1:
                composite_row['mean_sp'] = [X_row_df[sp_numeric_test].mean(axis=1).values[0]]
                composite_row['std_sp'] = [X_row_df[sp_numeric_test].std(axis=1).values[0]]

        X_row_final = pd.concat([X_row_df, composite_row], axis=1)
    else:
        X_row_final = X_row_df

    X_row_scaled = loaded_scaler.transform(X_row_final)
    X_row_selected = loaded_selector.transform(X_row_scaled)

    y_pred = loaded_model.predict(X_row_selected)[0]

    predictions.append(y_pred)
    actuals.append(y_actual)

    if idx % 200 == 0:
        print(f"Processed {idx} rows...")

print(f"Total predictions made: {len(predictions)}")


Processed 0 rows...
Processed 200 rows...
Processed 400 rows...
Processed 600 rows...
Processed 800 rows...
Processed 1000 rows...
Total predictions made: 1002


Measure performance on test set

In [92]:
print("Classification Report:")
print(classification_report(actuals, predictions))
print("Confusion Matrix:")
print(confusion_matrix(actuals, predictions))

test_accuracy = accuracy_score(actuals, predictions)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97       334
         1.0       0.96      0.95      0.95       334
         2.0       0.96      0.96      0.96       334

    accuracy                           0.96      1002
   macro avg       0.96      0.96      0.96      1002
weighted avg       0.96      0.96      0.96      1002

Confusion Matrix:
[[326   3   5]
 [ 10 317   7]
 [  3  11 320]]

Test Accuracy: 0.9611


Compare all models on test set

In [93]:
test_results = []
for name, model in trained_models.items():
    X_test_processed = []
    for idx, row in df_test_load.iterrows():
        X_row = row[:-1].values.reshape(1, -1)
        if constant_cols:
            X_row = np.delete(X_row, [df_test_load.columns.get_loc(c) for c in constant_cols if c in df_test_load.columns], axis=1)
        X_row_df = pd.DataFrame(X_row, columns=X_final.columns[:X_row.shape[1]])

        if len(composite_features.columns) > 0:
            composite_row = pd.DataFrame()
            numeric_cols_test = X_row_df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols_test) > 1:
                composite_row['mean_all'] = [X_row_df[numeric_cols_test].mean(axis=1).values[0]]
                composite_row['std_all'] = [X_row_df[numeric_cols_test].std(axis=1).values[0]]
                sp_numeric_test = [col for col in sp_numeric if col in numeric_cols_test]
                if len(sp_numeric_test) > 1:
                    composite_row['mean_sp'] = [X_row_df[sp_numeric_test].mean(axis=1).values[0]]
                    composite_row['std_sp'] = [X_row_df[sp_numeric_test].std(axis=1).values[0]]
            X_row_final = pd.concat([X_row_df, composite_row], axis=1)
        else:
            X_row_final = X_row_df

        X_row_scaled = loaded_scaler.transform(X_row_final)
        X_row_selected = loaded_selector.transform(X_row_scaled)
        X_test_processed.append(X_row_selected[0])

    X_test_array = np.array(X_test_processed)
    y_test_pred = model.predict(X_test_array)

    acc = accuracy_score(actuals, y_test_pred)
    report = classification_report(actuals, y_test_pred, output_dict=True)

    test_results.append({
        'Model': name,
        'Test Accuracy': acc,
        'Test F1-Score': report['weighted avg']['f1-score']
    })

    print(f"{name}: Accuracy = {acc:.4f}, F1-Score = {report['weighted avg']['f1-score']:.4f}")

test_comparison_df = pd.DataFrame(test_results).sort_values('Test F1-Score', ascending=False)
print("\nTest Set Model Comparison:")
print(test_comparison_df.to_string(index=False))

if test_comparison_df.iloc[0]['Model'] == best_model_name:
    print(f"\n{best_model_name} is still the best performer on test data")
else:
    print(f"\n{test_comparison_df.iloc[0]['Model']} performed best on test data")
    print(f"Original selection {best_model_name} ranked #{test_comparison_df[test_comparison_df['Model']==best_model_name].index[0]+1}")


DecisionTree: Accuracy = 0.7764, F1-Score = 0.7761
RandomForest: Accuracy = 0.9421, F1-Score = 0.9421
GradientBoosting: Accuracy = 0.9611, F1-Score = 0.9610
SVM: Accuracy = 0.3333, F1-Score = 0.1667
LogisticRegression: Accuracy = 0.4202, F1-Score = 0.3239
KNN: Accuracy = 0.8782, F1-Score = 0.8782

Test Set Model Comparison:
             Model  Test Accuracy  Test F1-Score
  GradientBoosting       0.961078       0.961047
      RandomForest       0.942116       0.942074
               KNN       0.878244       0.878227
      DecisionTree       0.776447       0.776141
LogisticRegression       0.420160       0.323858
               SVM       0.333333       0.166667

GradientBoosting is still the best performer on test data


DEVELOP RULES FROM ML MODEL

In [94]:
sp_columns = [col for col in X_final.columns if 'SP' in col.upper()]
print(f"{len(sp_columns)} SP (setpoint) columns: {sp_columns}")

if len(sp_columns) > 0:
    X_sp = X_final[sp_columns]

    dt_sp = DecisionTreeClassifier(max_depth=5, min_samples_split=50, random_state=42)
    dt_sp.fit(X_sp, y_balanced)

    tree_rules = export_text(dt_sp, feature_names=sp_columns)
    print("\nDecision Tree Rules (SP Features):")
    print(tree_rules)

    with open('decision_tree_rules.txt', 'w') as f:
        f.write(tree_rules)

    importances = pd.DataFrame({
        'Feature': sp_columns,
        'Importance': dt_sp.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 5 Most Important SP Features:")
    print(importances.head().to_string(index=False))

12 SP (setpoint) columns: ['FFTE Feed tank level SP', 'FFTE Production solids SP', 'FFTE Steam pressure SP', 'TFE Out flow SP', 'TFE Production solids SP', 'TFE Vacuum pressure SP', 'TFE Steam pressure SP', 'FFTE Feed flow SP', 'FFTE Out steam temp SP', 'TFE Motor speed', 'mean_sp', 'std_sp']

Decision Tree Rules (SP Features):
|--- std_sp <= 3234.26
|   |--- std_sp <= 2958.06
|   |   |--- FFTE Steam pressure SP <= 123.00
|   |   |   |--- mean_sp <= 1165.11
|   |   |   |   |--- TFE Out flow SP <= 2212.14
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- TFE Out flow SP >  2212.14
|   |   |   |   |   |--- class: 0
|   |   |   |--- mean_sp >  1165.11
|   |   |   |   |--- FFTE Out steam temp SP <= 49.32
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- FFTE Out steam temp SP >  49.32
|   |   |   |   |   |--- class: 0
|   |   |--- FFTE Steam pressure SP >  123.00
|   |   |   |--- TFE Out flow SP <= 2009.74
|   |   |   |   |--- FFTE Production solids SP <= 40.20
|   |   |   |   | 