**GENERAL THOUGHTS:**
- ...


**DATA PREPROCESSING:**

Imbalanced data:
- over_sampling for imbalanced data
- cost-sensitive learning for imbalanced data

Categorical data:
- Ordinal Data: The categories have an inherent order
- Nominal Data: The categories do not have an inherent order



**MULTI-CLASS CLASSIFIER:**
- Focus on "Native Multiclass Classifiers" as a starting point. Might try "Binary Transformation" or "Hierarchical Classification" later. https://www.projectpro.io/article/multi-class-classification-python-example/547
- Overview of the models to be considered:  
  - [X] Decision Trees

In [1]:
import os
import pickle
from datetime import datetime

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.dummy import DummyClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
# import lightgbm as lgbm

import optuna
# from optuna.samplers import TPESampler

import imblearn
from imblearn.over_sampling import RandomOverSampler

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Seed shared by the stochastic steps of this notebook (e.g. the oversampler).
SEED = 42

# Short classifier tag that is embedded in the file names of all saved artefacts.
clf_name = "dt_clf"

# Timestamp of this run; it is embedded in the artefact file names further down.
now = datetime.now()
# Separate the time fields with "-" rather than ":" because ":" is not a valid
# file-name character on Windows, so the pickle files could not be created there.
formatted_date_time = now.strftime("%Y-%m-%d_%H-%M-%S")
print(formatted_date_time)

2024-01-25_11:14:09


# Load data

In [5]:
# Tab-separated ML table written to the data output folder.
df = pd.read_csv(
    filepath_or_buffer='../../data/output/df_ml.csv',
    sep='\t',
)

# Production pipeline

## Training pipeline

### Pipeline  
Train the preprocessing pipeline and the model on the full dataset.

In [15]:
# Load the full dataset (assigned & unassigned SKUs)
df_raw = pd.read_csv('../../data/data_showcase.csv', sep='\t')

# Data cleaning (data formats, general transformations, "feature selection"):
# map the raw column headers to snake_case names used throughout the notebook.
df_cleaned = df_raw.rename(columns={
    'Product Area': 'product_area',
    'Core Segment': 'core_segment',
    'Brand': 'brand',
    'Material Number': 'material_number',
    'Material No Text': 'material_number_text',
    'Component': 'component',
    'Material Description': 'component_text',
    'Packaging Code': 'packaging_code',
    'Material Characteristic': 'characteristic_value',
    'Material Weight': 'material_weight',
    'Column 21': 'col_21',
    'Weight measure': 'weight_measure',
    'Packaging Category': 'packaging_category',
    'Manufactoring Location': 'manufactoring_location',
    'Column 43': 'col_43'
})

# Cast to object so that the dtype-based feature split used later in the
# notebook treats the material number as a non-numeric column.
df_cleaned['material_number'] = df_cleaned['material_number'].astype('object')
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].astype('object')

# Assign the masked column back instead of calling `.mask(..., inplace=True)` on
# `df_cleaned['packaging_category']`: the in-place call acts on the Series
# returned by the column selection, which pandas 2.x warns about and which no
# longer updates `df_cleaned` once Copy-on-Write is active.
# Missing / placeholder categories become the explicit label 'Unassigned'.
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].mask(
    df_cleaned['packaging_category'].isin(['-', np.nan]), 'Unassigned'
)
# 'No Packaging' is merged into the 'U0 – Unpacked' category.
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].mask(
    df_cleaned['packaging_category'].isin(['No Packaging']), 'U0 – Unpacked'
)

# Keep the model features plus the target; the target must stay the LAST column
# because features/target are split by position below.
df_full_sub = df_cleaned[[
    'material_number',
    'brand',
    'product_area',
    'core_segment',
    'component',
    'manufactoring_location',
    'characteristic_value',
    'material_weight',
    'packaging_code',
    'packaging_category',
]]

# TODO(optional): data quality checks (e.g. ensure features have the right format, size of input data, ...)


# Final training data
# Split data into "assigned" == X, and "unassigned" == X_inf
df_ml = df_full_sub[df_full_sub.packaging_category != 'Unassigned']
# Define features and target
X = df_ml.iloc[:, :-1]
y = df_ml.iloc[:, -1]  # the last column is the target
# NOTE: Oversampling so that each listed class has 100 samples; needed to properly apply CV and evaluation
dict_oversmapling = {
    'Metal Cassette': 100,
    'Carton tube with or w/o': 100,
    'Wooden box': 100,
    'Fabric packaging': 100,
    'Book packaging': 100
}
# Define the oversampling strategy (seeded for reproducibility)
oversampler = RandomOverSampler(sampling_strategy=dict_oversmapling, random_state=SEED)
# Fit and apply the resampling
X, y = oversampler.fit_resample(X, y)


# Production data (for inference): SKUs without a packaging category
df_no_packaging_categories = df_full_sub[df_full_sub.packaging_category == 'Unassigned']
# Define features and target
X_inf = df_no_packaging_categories.iloc[:, :-1]
y_inf = df_no_packaging_categories.iloc[:, -1]  # the last column is the target (always 'Unassigned' here)

In [16]:
# FULL TRAINING PIPELINE

# DEFINE PREPROCESSING PIPELINE
# Numerical features: median imputation followed by a power transform.
numerical_features = X.select_dtypes(include='number').columns.tolist()
numeric_feature_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    # NOTE: the step is called 'log_transform' but applies PowerTransformer with
    # its default settings, i.e. not a plain log; the name is kept unchanged so
    # that parameter paths of the pipeline stay the same.
    ('log_transform', PowerTransformer()),
])
# Categorical features: most-frequent imputation followed by ordinal encoding.
categorical_features = X.select_dtypes(exclude='number').columns.tolist()
categorical_feature_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as -1 at inference time instead of raising.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])
# Apply both pipelines on separate columns using "ColumnTransformer"
preprocess_pipeline = ColumnTransformer(transformers=[
    ('number', numeric_feature_pipeline, numerical_features),
    ('category', categorical_feature_pipeline, categorical_features)
])
X_transformed = preprocess_pipeline.fit_transform(X)


# TARGET PIPELINE
label_ecoder = LabelEncoder()
y_transformed = label_ecoder.fit_transform(y)


# DEFINE MODEL PIPELINE
# Calculate "balanced" class weights for the encoded target
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_transformed),
    y=y_transformed
)
# The weights are returned in the order of `classes`; np.unique returns the
# encoded labels sorted and LabelEncoder produces labels 0..n_classes-1, so the
# enumerate position is the encoded class label.
class_weight_dict = dict(enumerate(class_weights))
# Model hyper-parameters
best_params = {
    'max_depth': 50,
    'criterion': 'entropy'
}
dt_clf = DecisionTreeClassifier(
    **best_params,
    class_weight=class_weight_dict,
    # Fix the seed: scikit-learn permutes the features at every split, so ties
    # between equally good splits would otherwise make the saved tree differ
    # from run to run.
    random_state=SEED
)
# Training
dt_clf.fit(X_transformed, y_transformed)

### Save pipeline & model

In [17]:
# Save the fitted pre-processing pipeline and the label encoder

# Pipeline to persist
final_preprocessing_pipeline = preprocess_pipeline

# Path of the ML pipeline directory
path_ml_pipeline = "../../ml_pipelines"
# Create the directory if it does not exist yet; exist_ok avoids the separate
# existence check (and the race between checking and creating).
os.makedirs(path_ml_pipeline, exist_ok=True)

# Write each artefact inside a `with` block so the file handle is closed (and the
# pickle fully flushed to disk) before the notebook continues.
with open(f'{path_ml_pipeline}/final_preprocessing_pipeline_{clf_name}_{formatted_date_time}.pkl', 'wb') as pipeline_file:
    pickle.dump(final_preprocessing_pipeline, pipeline_file)
with open(f'{path_ml_pipeline}/final_label_ecoder_{clf_name}_{formatted_date_time}.pkl', 'wb') as encoder_file:
    pickle.dump(label_ecoder, encoder_file)

In [18]:
# Save the trained model

# Model to persist
best_model = dt_clf

# Path of the ML pipeline directory
path_ml_pipeline = "../../ml_pipelines"
# Create the directory if it does not exist yet; exist_ok avoids the separate
# existence check (and the race between checking and creating).
os.makedirs(path_ml_pipeline, exist_ok=True)

# Write inside a `with` block so the file handle is closed (and the pickle fully
# flushed to disk) before the notebook continues.
with open(f'{path_ml_pipeline}/best_model_{clf_name}_{formatted_date_time}.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

## Inference pipeline

### Inference for SKUs with unassigned packaging categories

In [19]:
# Load the full dataset (assigned & unassigned SKUs)
df_raw = pd.read_csv('../../data/data_showcase.csv', sep='\t')

# Data cleaning (data formats, general transformations, "feature selection"):
# map the raw column headers to snake_case names used throughout the notebook.
df_cleaned = df_raw.rename(columns={
    'Product Area': 'product_area',
    'Core Segment': 'core_segment',
    'Brand': 'brand',
    'Material Number': 'material_number',
    'Material No Text': 'material_number_text',
    'Component': 'component',
    'Material Description': 'component_text',
    'Packaging Code': 'packaging_code',
    'Material Characteristic': 'characteristic_value',
    'Material Weight': 'material_weight',
    'Column 21': 'col_21',
    'Weight measure': 'weight_measure',
    'Packaging Category': 'packaging_category',
    'Manufactoring Location': 'manufactoring_location',
    'Column 43': 'col_43'
})

df_cleaned['material_number'] = df_cleaned['material_number'].astype('object')
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].astype('object')

# Assign the masked column back instead of calling `.mask(..., inplace=True)` on
# `df_cleaned['packaging_category']`: the in-place call acts on the Series
# returned by the column selection, which pandas 2.x warns about and which no
# longer updates `df_cleaned` once Copy-on-Write is active. Without this the
# 'Unassigned' filter applied to this frame afterwards would select nothing.
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].mask(
    df_cleaned['packaging_category'].isin(['-', np.nan]), 'Unassigned'
)
# 'No Packaging' is merged into the 'U0 – Unpacked' category.
df_cleaned['packaging_category'] = df_cleaned['packaging_category'].mask(
    df_cleaned['packaging_category'].isin(['No Packaging']), 'U0 – Unpacked'
)

# Keep the model features plus the target; the target must stay the LAST column
# because the features are selected by position afterwards.
df_full_sub = df_cleaned[[
    'material_number',
    'brand',
    'product_area',
    'core_segment',
    'component',
    'manufactoring_location',
    'characteristic_value',
    'material_weight',
    'packaging_code',
    'packaging_category',
]]

In [20]:
# Rows whose packaging category is still 'Unassigned' are the ones to predict.
is_unassigned = df_full_sub['packaging_category'] == 'Unassigned'
df_inf = df_full_sub.loc[is_unassigned]
# Features: every column except the last one.
X_inf = df_inf.iloc[:, :-1]

In [21]:
# TODO: implement data quality check (corresponding to the pre-pipeline steps)

# NOTE: unpickling can execute arbitrary code; only load artefacts written by
# this notebook / from a trusted location.
# Each file is opened in a `with` block so the handle is closed right after loading.

# Load the pre-processing pipeline and the label encoder
with open(f'{path_ml_pipeline}/final_preprocessing_pipeline_{clf_name}_{formatted_date_time}.pkl', 'rb') as pipeline_file:
    loaded_preprocessing_pipeline = pickle.load(pipeline_file)
with open(f'{path_ml_pipeline}/final_label_ecoder_{clf_name}_{formatted_date_time}.pkl', 'rb') as encoder_file:
    loaded_lable_encoder = pickle.load(encoder_file)

# Load the model
with open(f'{path_ml_pipeline}/best_model_{clf_name}_{formatted_date_time}.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [22]:
# Run the unassigned SKUs through the loaded preprocessing pipeline.
X_inf_transformed = loaded_preprocessing_pipeline.transform(X_inf)

# Class probabilities from the loaded model.
preds_y_inf_proba = loaded_model.predict_proba(X_inf_transformed)

# Encoded class predictions, then mapped back to the original category labels.
preds_y_inf = loaded_model.predict(X_inf_transformed)
preds_y_inf_inverse = loaded_lable_encoder.inverse_transform(preds_y_inf)

In [23]:
# Result frame containing only the unassigned SKUs. Alternative: create three
# output frames/files: 1. full (assigned & unassigned), 2. only assigned,
# 3. only unassigned.
# One row per unassigned SKU: material number, predicted label and the highest
# class probability returned by the model for that row.
df_unassigned_SKUs_with_predicted_classes = pd.DataFrame({
    'material_number': X_inf.material_number,
    'predected_packaging_categories': preds_y_inf_inverse,
    'predected_packaging_categories_probabilities': preds_y_inf_proba.max(axis=1),
})

In [24]:
# Preview the first five predictions.
df_unassigned_SKUs_with_predicted_classes.head(n=5)

Unnamed: 0,material_number,predected_packaging_categories,predected_packaging_categories_probabilities
366,77095609,Cardb. Sleeve w - w/o Shr.,1.0
367,77095609,Unpacked,1.0
368,77095609,Shrink film and insert o,1.0
787,53683705,Folding carton,1.0
800,59950025,Metal Cassette,1.0


In [25]:
# Mean and number of the top-class probabilities over all unassigned SKUs.
proba_all = df_unassigned_SKUs_with_predicted_classes['predected_packaging_categories_probabilities']
print('mean: ', proba_all.mean())
print('count: ', proba_all.count())

mean:  1.0
count:  7058


In [26]:
# Same summary, restricted to predictions with a top-class probability >= 0.50.
is_ge_050 = df_unassigned_SKUs_with_predicted_classes['predected_packaging_categories_probabilities'] >= 0.50
proba_ge_050 = df_unassigned_SKUs_with_predicted_classes.loc[
    is_ge_050, 'predected_packaging_categories_probabilities'
]
print('mean: ', proba_ge_050.mean())
print('count: ', proba_ge_050.count())

mean:  1.0
count:  7058


In [27]:
# Same summary, restricted to predictions with a top-class probability >= 0.95.
is_ge_095 = df_unassigned_SKUs_with_predicted_classes['predected_packaging_categories_probabilities'] >= 0.95
proba_ge_095 = df_unassigned_SKUs_with_predicted_classes.loc[
    is_ge_095, 'predected_packaging_categories_probabilities'
]
print('mean: ', proba_ge_095.mean())
print('count: ', proba_ge_095.count())

mean:  1.0
count:  7058


In [28]:
# save predictions for unassigned SKUs as excel
# df_unassigned_SKUs_with_predicted_classes.to_excel(f'../../data/output/unassigned_SKUs_with_{clf_name}_{formatted_date_time}_predicted_classes.xlsx', sheet_name='dt_predictions_unassigned_SKUs', index=False, header=True)