In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm

import imblearn

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


  from pandas import MultiIndex, Int64Index


# Load data

In [3]:
df = pd.read_csv('../data/output/df_ml.csv', sep='\t')

In [4]:
df.head()

Unnamed: 0,material_number,material_number_text,brand,product_area,core_segment,component,component_text,manufactoring_location,characteristic_value,material_weight,b_code,packaging_category
0,06159975BT,Counter Display,Bosch,PA5,Metal Grinding,6035765C21,Corrugated carton,Distribution Center,CORRUGATED,85.0,153664,D1 - Countertop display
1,06159975BT,Counter Display,Bosch,PA5,Metal Grinding,6035940565,Label SB,Distribution Center,WOOD FREE,0.54,204102,D1 - Countertop display
2,06159975BT,Counter Display,Bosch,PA5,Metal Grinding,6035822768,Tight -Pack label RB - 1ER,Distribution Center,MCB/GT2,22.9,303917,D1 - Countertop display
3,06159975BT,Counter Display,Bosch,PA5,Metal Grinding,6035822768,Tight -Pack label RB - 1ER,Distribution Center,MCB/GT2,22.9,303917,D1 - Countertop display
4,06159975BT,Counter Display,Bosch,PA5,Metal Grinding,6035765P54,Corrugated carton,Distribution Center,CORRUGATED,85.0,153664,D1 - Countertop display


# Prep data for machine learning

## Clean data

In [5]:
df.b_code = df.b_code.astype('object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82969 entries, 0 to 82968
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   material_number         82969 non-null  object 
 1   material_number_text    82969 non-null  object 
 2   brand                   82969 non-null  object 
 3   product_area            82969 non-null  object 
 4   core_segment            82969 non-null  object 
 5   component               82969 non-null  object 
 6   component_text          82969 non-null  object 
 7   manufactoring_location  82969 non-null  object 
 8   characteristic_value    82969 non-null  object 
 9   material_weight         75906 non-null  float64
 10  b_code                  82969 non-null  object 
 11  packaging_category      82969 non-null  object 
dtypes: float64(1), object(11)
memory usage: 7.6+ MB


In [7]:
df_sub = df[[
    'material_number',
    'brand',
    'product_area',
    'core_segment',
    'component',
    'manufactoring_location',
    'characteristic_value',
    'material_weight', 
    'b_code',
    'packaging_category',
]]

## Split data into train/test

In [8]:
# Define features and target
X = df_sub.iloc[:, :-1]
y = df_sub.iloc[:, -1]  # the last column is the target

In [9]:
# Generate train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

## RandomForestClassifier pipeline

In [12]:
# DEFINE PIPELINE

# DEFINE PREPROCESSING PIPELINE
# define numerical feature processing
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
# print(f'There are {len(numerical_features)} numerical features:', '\n')
# print(numerical_features)
numeric_feature_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('log_transform', PowerTransformer()),
    # ('scale', MinMaxScaler())
])
# define categorical feature processing
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
# print(f'There are {len(categorical_features)} categorical features:', '\n')
# print(categorical_features)
categorical_feature_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    # ('one_hot', OneHotEncoder(handle_unknown='ignore', max_categories=None, sparse=False))
])
# apply both pipeline on seperate columns using "ColumnTransformer"
preprocess_pipeline = ColumnTransformer(transformers=[
    ('number', numeric_feature_pipeline, numerical_features),
    ('category', categorical_feature_pipeline, categorical_features)
])

X_train_processed = preprocess_pipeline.fit_transform(X_train)

# Training
label_ecoder = LabelEncoder()
y_train_encoded = pd.DataFrame(label_ecoder.fit_transform(y_train))
#y_train_encoded = label_ecoder.fit_transform(y_train)

# class_weights
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_encoded[0].to_numpy()),
    y=y_train_encoded[0].to_numpy()
)
class_weight_dict = dict(enumerate(class_weights))

# DEFINE MODEL PIPELINE
# model
rf_clf = RandomForestClassifier(
    # n_estimators=1000,
    class_weight=class_weight_dict,
    # random_state=42
)

rf_clf.fit(X_train_processed, y_train_encoded)

  rf_clf.fit(X_train_processed, y_train_encoded)


In [18]:
# preprocess & make predictions for test data
X_test_transformed  = preprocess_pipeline.transform(X_test)
preds_y_test_rf = rf_clf.predict(X_test_transformed)

y_test_transformed = label_ecoder.transform(y_test)

# print(classification_report(y_test_transformed, preds_y_test_rf))
rf_clf_classification_report = classification_report(y_test_transformed, preds_y_test_rf)
print(rf_clf_classification_report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1582
           1       0.88      0.92      0.90      1749
           2       0.86      0.89      0.87      1644
           3       0.92      0.84      0.88       161
           4       0.92      0.93      0.92       438
           5       0.00      0.00      0.00         2
           6       0.58      0.26      0.35        86
           7       0.82      0.76      0.79       229
           8       0.73      0.70      0.72       135
           9       0.94      0.91      0.92        80
          10       0.89      0.91      0.90       696
          11       0.77      0.65      0.71       130
          12       0.50      0.22      0.31         9
          13       0.92      0.85      0.88       774
          14       1.00      0.85      0.92        59
          15       0.76      0.83      0.79        30
          16       1.00      0.33      0.50         3
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Deeper analysis of RandomForest Classifier

In [14]:
# Create df with y_test, preds_y_test & proba_preds_y_test 

# prediction
preds_y_test_rf = rf_clf.predict(X_test_transformed)
preds_y_test_inverse = label_ecoder.inverse_transform(preds_y_test_rf)

# get proba of predicted class (determine max proba value from all classes per row)
preds_y_test_rf = rf_clf.predict_proba(X_test_transformed)
df_proba = pd.DataFrame(preds_y_test_rf, columns=rf_clf.classes_)
df_proba['predicted_class_proba'] = df_proba.max(axis=1)
proba_preds_y_test = df_proba['predicted_class_proba'].values

y_test_dict = {
    'y_test': y_test.to_numpy(),
    'preds_y_test': preds_y_test_inverse,
    'proba_preds_y_test': proba_preds_y_test,
}

df_y_test = pd.DataFrame(y_test_dict)
df_y_test.head(30)

Unnamed: 0,y_test,preds_y_test,proba_preds_y_test
0,P06 – TightPack,P06 – TightPack,1.0
1,B02 Blister and Insert Card,B02 Blister and Insert Card,1.0
2,P02 Case,P02 Case,0.67
3,B01 - Blister and sealed blist,B02 Blister and Insert Card,0.35
4,P03 Tube,P03 Tube,0.87
5,P06 – TightPack,B01 - Blister and sealed blist,0.77
6,B02 Blister and Insert Card,B02 Blister and Insert Card,0.75
7,P03 Tube,P03 Tube,1.0
8,C01 - Folding carton,C01 - Folding carton,1.0
9,P03 Tube,P03 Tube,1.0


In [None]:
df_y_test.loc[
    df_y_test.y_test == 'H1 Wooden box'
].head(30)

Unnamed: 0,y_test,preds_y_test,proba_preds_y_test
7282,H1 Wooden box,B02 Blister and Insert Card,0.237
7350,H1 Wooden box,C06 Skincard,0.239
10582,H1 Wooden box,B02 Blister and Insert Card,0.245


# Threshold analyis

In [24]:
df_y_test_threshold = df_y_test

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold.y_test, df_y_test_threshold.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.93      0.91      0.92      1582
   B02 Blister and Insert Card       0.88      0.92      0.90      1749
          C01 - Folding carton       0.86      0.89      0.87      1644
                 C02 Trap Card       0.92      0.84      0.88       161
         C03 Trap Folding Card       0.92      0.93      0.92       438
            C04 Book packaging       0.00      0.00      0.00         2
               C05 Tray Packer       0.58      0.26      0.35        86
                  C06 Skincard       0.82      0.76      0.79       229
C07 Cardb. Sleeve w - w/o Shr.       0.73      0.70      0.72       135
C08 - Cardboard hanger w/o bag       0.94      0.91      0.92        80
        C09 - Paperboard pouch       0.89      0.91      0.90       696
  C10 - Carton cover (Lid box)       0.77      0.65      0.71       130
 C11 - Carton tube with or w/o       0.50      0.22      0.31  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
df_classification_report = pd.DataFrame(rf_clf_classification_report).transpose()

In [27]:
df_classification_report.head()

Unnamed: 0,precision,recall,f1-score,support
B01 - Blister and sealed blist,0.93173,0.905815,0.91859,1582.0
B02 Blister and Insert Card,0.876159,0.918239,0.896706,1749.0
C01 - Folding carton,0.864095,0.885645,0.874737,1644.0
C02 Trap Card,0.918919,0.84472,0.880259,161.0
C03 Trap Folding Card,0.920635,0.926941,0.923777,438.0


## threshold per class  
exclude classes that are under a specific threshold. Simulation of 4 thresholds: [80, 85, 90, 95]

In [37]:
# threshhold 80
packaging_categories_above_80 = df_classification_report.loc[
    df_classification_report['f1-score'] >= 0.80
].index

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.y_test.isin(packaging_categories_above_80)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.94      0.91      0.92      1582
   B02 Blister and Insert Card       0.90      0.92      0.91      1749
          C01 - Folding carton       0.90      0.89      0.89      1644
                 C02 Trap Card       0.92      0.84      0.88       161
         C03 Trap Folding Card       0.92      0.93      0.93       438
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       0.00      0.00      0.00         0
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.94      0.91      0.92        80
        C09 - Paperboard pouch       0.90      0.91      0.91       696
  C10 - Carton cover (Lid box)       0.00      0.00      0.00         0
 C11 - Carton tube with or w/o       0.00      0.00      0.00         0
       C12 - Corrugated carton       0.95      0.85      0.90  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# threshhold 85
packaging_categories_above_85 = df_classification_report.loc[
    df_classification_report['f1-score'] >= 0.85
].index

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.y_test.isin(packaging_categories_above_85)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.94      0.91      0.92      1582
   B02 Blister and Insert Card       0.90      0.92      0.91      1749
          C01 - Folding carton       0.90      0.89      0.89      1644
                 C02 Trap Card       0.92      0.84      0.88       161
         C03 Trap Folding Card       0.92      0.93      0.93       438
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       0.00      0.00      0.00         0
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.94      0.91      0.92        80
        C09 - Paperboard pouch       0.91      0.91      0.91       696
  C10 - Carton cover (Lid box)       0.00      0.00      0.00         0
 C11 - Carton tube with or w/o       0.00      0.00      0.00         0
       C12 - Corrugated carton       0.95      0.85      0.90  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# threshhold 90
packaging_categories_above_90 = df_classification_report.loc[
    df_classification_report['f1-score'] >= 0.90
].index

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.y_test.isin(packaging_categories_above_90)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.97      0.91      0.94      1582
   B02 Blister and Insert Card       0.00      0.00      0.00         0
          C01 - Folding carton       0.00      0.00      0.00         0
                 C02 Trap Card       0.00      0.00      0.00         0
         C03 Trap Folding Card       0.96      0.93      0.94       438
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       0.00      0.00      0.00         0
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.97      0.91      0.94        80
        C09 - Paperboard pouch       0.00      0.00      0.00         0
  C10 - Carton cover (Lid box)       0.00      0.00      0.00         0
 C11 - Carton tube with or w/o       0.00      0.00      0.00         0
       C12 - Corrugated carton       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# threshhold 95
packaging_categories_above_95 = df_classification_report.loc[
    df_classification_report['f1-score'] >= 0.95
].index

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.y_test.isin(packaging_categories_above_95)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.00      0.00      0.00         0
   B02 Blister and Insert Card       0.00      0.00      0.00         0
          C01 - Folding carton       0.00      0.00      0.00         0
                  C06 Skincard       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.00      0.00      0.00         0
 P01 - Plastic bag with header       0.00      0.00      0.00         0
                      P02 Case       0.00      0.00      0.00         0
                      P03 Tube       0.00      0.00      0.00         0
              P04 Hanger/ Clip       1.00      0.99      0.99      2709
             P05 Plastic Pouch       0.00      0.00      0.00         0
               P06 – TightPack       1.00      0.96      0.98      1659
          P08 Plastic Cassette       0.00      0.00      0.00         0
                 U0 – Unpacked       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## threshold per SKU
exclude classes that are under a specific threshold. Simulation of 4 thresholds: [80, 85, 90, 95]

In [45]:
df_y_test_threshold.head()

Unnamed: 0,y_test,preds_y_test,proba_preds_y_test
0,P06 – TightPack,P06 – TightPack,1.0
1,B02 Blister and Insert Card,B02 Blister and Insert Card,1.0
2,P02 Case,P02 Case,0.67
3,B01 - Blister and sealed blist,B02 Blister and Insert Card,0.35
4,P03 Tube,P03 Tube,0.87


In [46]:
# threshhold 80

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.proba_preds_y_test >= 0.80
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.98      0.98      0.98      1254
   B02 Blister and Insert Card       0.98      0.99      0.98      1250
          C01 - Folding carton       0.97      0.96      0.97      1078
                 C02 Trap Card       0.98      0.95      0.96       125
         C03 Trap Folding Card       0.93      0.97      0.95       336
               C05 Tray Packer       0.69      0.95      0.80        21
                  C06 Skincard       0.99      0.98      0.99       126
C07 Cardb. Sleeve w - w/o Shr.       0.93      0.91      0.92        69
C08 - Cardboard hanger w/o bag       0.96      0.96      0.96        53
        C09 - Paperboard pouch       0.96      0.97      0.96       580
  C10 - Carton cover (Lid box)       0.89      0.70      0.78        79
 C11 - Carton tube with or w/o       0.00      0.00      0.00         1
       C12 - Corrugated carton       0.96      0.97      0.97  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# threshhold 85

df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.proba_preds_y_test >= 0.85
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.98      0.98      0.98      1224
   B02 Blister and Insert Card       0.98      0.99      0.98      1225
          C01 - Folding carton       0.98      0.97      0.97      1048
                 C02 Trap Card       0.97      0.96      0.97       119
         C03 Trap Folding Card       0.93      0.98      0.95       323
               C05 Tray Packer       0.69      1.00      0.82        20
                  C06 Skincard       1.00      0.98      0.99       122
C07 Cardb. Sleeve w - w/o Shr.       0.95      0.92      0.94        65
C08 - Cardboard hanger w/o bag       0.96      0.96      0.96        53
        C09 - Paperboard pouch       0.96      0.97      0.96       557
  C10 - Carton cover (Lid box)       0.90      0.70      0.79        77
       C12 - Corrugated carton       0.97      0.97      0.97       618
                  C13 Envelope       1.00      1.00      1.00  

In [49]:
# threshhold 90
df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.proba_preds_y_test >= 0.90
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.98      0.99      0.98      1186
   B02 Blister and Insert Card       0.99      0.99      0.99      1183
          C01 - Folding carton       0.98      0.98      0.98      1015
                 C02 Trap Card       0.97      0.95      0.96       110
         C03 Trap Folding Card       0.94      0.98      0.96       309
               C05 Tray Packer       0.70      1.00      0.83        19
                  C06 Skincard       1.00      0.99      1.00       117
C07 Cardb. Sleeve w - w/o Shr.       0.95      0.96      0.96        56
C08 - Cardboard hanger w/o bag       1.00      0.98      0.99        51
        C09 - Paperboard pouch       0.97      0.98      0.97       509
  C10 - Carton cover (Lid box)       0.94      0.71      0.81        72
       C12 - Corrugated carton       0.97      0.98      0.98       605
                  C13 Envelope       1.00      1.00      1.00  

In [48]:
# threshhold 95
df_y_test_threshold_X = df_y_test_threshold.loc[
    df_y_test_threshold.proba_preds_y_test >= 0.95
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.98      0.99      0.99      1131
   B02 Blister and Insert Card       0.99      0.99      0.99      1119
          C01 - Folding carton       0.99      0.98      0.99       965
                 C02 Trap Card       0.98      0.96      0.97       103
         C03 Trap Folding Card       0.94      0.98      0.96       295
               C05 Tray Packer       0.69      1.00      0.82        18
                  C06 Skincard       1.00      0.99      1.00       116
C07 Cardb. Sleeve w - w/o Shr.       0.98      1.00      0.99        47
C08 - Cardboard hanger w/o bag       1.00      0.98      0.99        47
        C09 - Paperboard pouch       0.98      0.99      0.98       469
  C10 - Carton cover (Lid box)       0.96      0.75      0.84        60
       C12 - Corrugated carton       0.98      0.99      0.99       580
                  C13 Envelope       1.00      1.00      1.00  

## threshold per SKU for a specific subset of class (defined by Natalia)
exclude classes that are under a specific threshold. Simulation of 4 thresholds: [80, 85, 90, 95]

In [58]:
# threshhold 80
sub_classes = [
    'B02 Blister and Insert Card', 'P04 Hanger/ Clip', 'P08 Plastic Cassette', 'P03 Tube', 'B01 - Blister and sealed blist', 'P02 Case', 'P06 – TightPack',
    'P10 - Shrink film and insert o', 'P05 Plastic Pouch', 'P07 Plastic Box', 'P01 - Plastic bag with header', 'C06 Skincard', 'P11 - Plastic ring'
]

df_y_test_threshold_X = df_y_test_threshold.loc[
    (df_y_test_threshold.y_test.isin(sub_classes)) &
    (df_y_test_threshold.proba_preds_y_test >= 0.80)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.99      0.98      0.98      1254
   B02 Blister and Insert Card       0.99      0.99      0.99      1250
          C01 - Folding carton       0.00      0.00      0.00         0
                 C02 Trap Card       0.00      0.00      0.00         0
         C03 Trap Folding Card       0.00      0.00      0.00         0
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       0.99      0.98      0.99       126
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.00      0.00      0.00         0
        C09 - Paperboard pouch       0.00      0.00      0.00         0
 P01 - Plastic bag with header       0.98      0.97      0.98       264
                      P02 Case       0.97      0.91      0.94        35
                      P03 Tube       0.99      0.96      0.98  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
# threshhold 85
sub_classes = [
    'B02 Blister and Insert Card', 'P04 Hanger/ Clip', 'P08 Plastic Cassette', 'P03 Tube', 'B01 - Blister and sealed blist', 'P02 Case', 'P06 – TightPack',
    'P10 - Shrink film and insert o', 'P05 Plastic Pouch', 'P07 Plastic Box', 'P01 - Plastic bag with header', 'C06 Skincard', 'P11 - Plastic ring'
]

df_y_test_threshold_X = df_y_test_threshold.loc[
    (df_y_test_threshold.y_test.isin(sub_classes)) &
    (df_y_test_threshold.proba_preds_y_test >= 0.85)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.99      0.98      0.98      1224
   B02 Blister and Insert Card       0.99      0.99      0.99      1225
          C01 - Folding carton       0.00      0.00      0.00         0
                 C02 Trap Card       0.00      0.00      0.00         0
         C03 Trap Folding Card       0.00      0.00      0.00         0
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       1.00      0.98      0.99       122
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
C08 - Cardboard hanger w/o bag       0.00      0.00      0.00         0
        C09 - Paperboard pouch       0.00      0.00      0.00         0
 P01 - Plastic bag with header       0.99      0.98      0.98       255
                      P02 Case       1.00      0.94      0.97        32
                      P03 Tube       1.00      0.97      0.98  

  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# threshhold 90
sub_classes = [
    'B02 Blister and Insert Card', 'P04 Hanger/ Clip', 'P08 Plastic Cassette', 'P03 Tube', 'B01 - Blister and sealed blist', 'P02 Case', 'P06 – TightPack',
    'P10 - Shrink film and insert o', 'P05 Plastic Pouch', 'P07 Plastic Box', 'P01 - Plastic bag with header', 'C06 Skincard', 'P11 - Plastic ring'
]

df_y_test_threshold_X = df_y_test_threshold.loc[
    (df_y_test_threshold.y_test.isin(sub_classes)) &
    (df_y_test_threshold.proba_preds_y_test >= 0.90)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.99      0.99      0.99      1186
   B02 Blister and Insert Card       0.99      0.99      0.99      1183
          C01 - Folding carton       0.00      0.00      0.00         0
                 C02 Trap Card       0.00      0.00      0.00         0
         C03 Trap Folding Card       0.00      0.00      0.00         0
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       1.00      0.99      1.00       117
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
        C09 - Paperboard pouch       0.00      0.00      0.00         0
 P01 - Plastic bag with header       1.00      0.97      0.99       239
                      P02 Case       1.00      0.93      0.97        30
                      P03 Tube       1.00      0.98      0.99      1702
              P04 Hanger/ Clip       1.00      1.00      1.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
# threshhold 95
sub_classes = [
    'B02 Blister and Insert Card', 'P04 Hanger/ Clip', 'P08 Plastic Cassette', 'P03 Tube', 'B01 - Blister and sealed blist', 'P02 Case', 'P06 – TightPack',
    'P10 - Shrink film and insert o', 'P05 Plastic Pouch', 'P07 Plastic Box', 'P01 - Plastic bag with header', 'C06 Skincard', 'P11 - Plastic ring'
]

df_y_test_threshold_X = df_y_test_threshold.loc[
    (df_y_test_threshold.y_test.isin(sub_classes)) &
    (df_y_test_threshold.proba_preds_y_test >= 0.95)
]

def custom_class_report(y_true, y_preds):
    report = classification_report(y_true, y_preds, output_dict=True)
    print(classification_report(y_true, y_preds))
    return report

rf_clf_classification_report = custom_class_report(df_y_test_threshold_X.y_test, df_y_test_threshold_X.preds_y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                precision    recall  f1-score   support

B01 - Blister and sealed blist       0.99      0.99      0.99      1131
   B02 Blister and Insert Card       1.00      0.99      0.99      1119
          C01 - Folding carton       0.00      0.00      0.00         0
                 C02 Trap Card       0.00      0.00      0.00         0
         C03 Trap Folding Card       0.00      0.00      0.00         0
               C05 Tray Packer       0.00      0.00      0.00         0
                  C06 Skincard       1.00      0.99      1.00       116
C07 Cardb. Sleeve w - w/o Shr.       0.00      0.00      0.00         0
        C09 - Paperboard pouch       0.00      0.00      0.00         0
 P01 - Plastic bag with header       1.00      0.98      0.99       230
                      P02 Case       1.00      1.00      1.00        21
                      P03 Tube       1.00      0.98      0.99      1605
              P04 Hanger/ Clip       1.00      1.00      1.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
