# Project: DLBDSMTP01 – Project: From Model to Production

Task 2: Image classification for a refund department (spotlight: Batch processing)

## Steps planned:

- explore and cleanup the data
- prepare the data for training, validation and final testing
- train a simple model, using transfer learning
    - using MlFlow for tracking
    - make at least 3 different models to simulate a real process of model training exploration
- evaluate the models and take the best performing
- download and pack the best-performing model in an API, using Docker for simple and flexible deployment
- provide a script, which reads in the data to be processed in batches every night
- do a final test on the testing data and evaluate final statistics

## First, let's look at some data from the dataset

In [None]:
import numpy as np
import pandas as pd

# Load the labelled style metadata. A handful of rows carry 11 columns instead
# of the expected 10 and would abort parsing, so they are skipped; losing
# those few training items is acceptable here.
data = pd.read_csv("./data/fashion-dataset/styles.csv", on_bad_lines="skip")

# Quick exploration: the row count first, then the distinct values of the two
# category columns.
print(len(data))
for column in ('subCategory', 'masterCategory'):
    print(data[column].unique())

In [None]:
# Check whether PyTorch can see a CUDA-capable GPU. Being the last expression
# of the cell, the boolean result is displayed by the notebook.
import torch
torch.cuda.is_available()

## Data preparation

- check and prepare the training set into pandas dataframes
- make a training and testing split
- split some of the testing split for the final batch processing to test it later 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Set up the file paths first.
data_folder = "data/fashion-dataset/"
styles_file = os.path.join(data_folder, "styles.csv")
images_folder = os.path.join(data_folder, "images/")

# Load the dataset; rows with a wrong column count are skipped.
styles_df = pd.read_csv(styles_file, sep=',', on_bad_lines='skip')

# Column holding the labels. Change it here to prepare the splits for a
# different label column.
target_column = 'masterCategory'

# Keep only the columns that are needed (saves memory) and drop incomplete
# rows. The dataset is not expected to contain missing values, but the check
# stays in place in case the source file changes.
columns_needed = ['id', target_column]
styles_df = styles_df[columns_needed].dropna()

# Add the image path belonging to every id.
styles_df['image_path'] = styles_df['id'].astype(str) + ".jpg"
styles_df['image_path'] = styles_df['image_path'].apply(lambda x: os.path.join(images_folder, x))

# Keep only rows whose image really exists on disk. The dataset is assumed to
# be complete, this is a safety net.
styles_df = styles_df[styles_df['image_path'].apply(os.path.exists)]

# The data is split twice with stratification: 80/20 into train/test, and the
# test part again 50/50 into test/batch-test. A stratified split needs at
# least 2 members of every class, so each class must still have 2 samples in
# the 20 % test part. That is only guaranteed from 10 samples per class
# upwards; a threshold of 2 protects the first split but not the second one.
# Very small classes would lead to poor predictions anyway.
min_samples_per_class = 10
class_counts = styles_df[target_column].value_counts()
valid_classes = class_counts[class_counts >= min_samples_per_class].index
styles_df = styles_df[styles_df[target_column].isin(valid_classes)]

# Split the data into train/test sets.
train_df, test_df = train_test_split(
    styles_df,
    test_size=0.2,
    stratify=styles_df[target_column],
    random_state=42
)

# Split the test set once more: one half is kept for the per-epoch test
# evaluation, the other half is reserved for the batch-processing test.
batch_test_df, test_df = train_test_split(
    test_df,
    test_size=0.5,
    stratify=test_df[target_column],
    random_state=42
)

# Save the splits for the later steps.
train_df.to_csv(os.path.join(data_folder, "train_split.csv"), index=False)
test_df.to_csv(os.path.join(data_folder, "test_split.csv"), index=False)
batch_test_df.to_csv(os.path.join(data_folder, "batch_test_split.csv"), index=False)

# Show how many items ended up in each split.
print(f"Training set: {len(train_df)} items")
print(f"Testing set: {len(test_df)} items")
print(f"Batch testing set: {len(batch_test_df)} items")


## Training now the models

- the experiments are logged to MLflow here, so that the best one can be selected later

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
import mlflow
import mlflow.pytorch
from PIL import Image
import json

# The file locations are declared again on purpose: this cell may run in a
# different session than the data-preparation cell.
data_folder = "data/fashion-dataset/"
train_file = os.path.join(data_folder, "train_split.csv")
test_file = os.path.join(data_folder, "test_split.csv")

# Hyperparameter sets used to simulate several training experiments.
# The values are mostly arbitrary at this stage.
CONFIGURATIONS = [
    dict(model_name="resnet18", batch_size=32, learning_rate=0.001, epochs=5),
    dict(model_name="resnet34", batch_size=16, learning_rate=0.0005, epochs=5),
    dict(model_name="resnet50", batch_size=32, learning_rate=0.0001, epochs=5),
]

# Point MLflow at the tracking server and select the experiment.
mlflow_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(mlflow_uri)
mlflow.set_experiment("Fashion Image Classification")

# Dataset wrapper around a dataframe of image paths and labels, so that a
# DataLoader can batch the images.
class FashionDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs from a dataframe.

    Args:
        dataframe: Frame with an ``image_path`` column and a label column.
        transform: Optional callable applied to the loaded RGB image.
        label_column: Name of the column holding the label. Defaults to
            ``'masterCategory'``, the column used throughout this notebook.
    """

    def __init__(self, dataframe, transform=None, label_column='masterCategory'):
        self.dataframe = dataframe
        self.transform = transform
        self.label_column = label_column

    def __len__(self):
        """Return the number of rows, i.e. the number of samples."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image of row ``idx`` and return it with its raw label."""
        row = self.dataframe.iloc[idx]

        # The context manager releases the file as soon as the pixel data has
        # been converted, instead of relying on garbage collection.
        with Image.open(row['image_path']) as source_image:
            image = source_image.convert("RGB")

        if self.transform:
            image = self.transform(image)

        # The label is returned as stored in the dataframe; mapping it to a
        # class index is left to the caller.
        return image, row[self.label_column]

# Load the splits prepared in the data-preparation step.
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Drop rows without a label from BOTH splits. The prepared files should not
# contain any, but a missing label would become the string 'nan' in the
# training/evaluation loops and fail the class lookup there.
train_df = train_df.dropna(subset=['masterCategory'])
test_df = test_df.dropna(subset=['masterCategory'])

# Map every class name seen in training to an integer index.
class_names = train_df['masterCategory'].unique()
class_to_idx = {class_name: idx for idx, class_name in enumerate(class_names)}
train_df['label'] = train_df['masterCategory'].map(class_to_idx)

# A class that never occurs in the training data has no index and no output
# neuron, so such test rows cannot be scored. With the stratified splits this
# filter should not remove anything.
test_df = test_df[test_df['masterCategory'].isin(class_names)]

# Bring every image to the same size and value range so that the images can
# be batched into tensors of one fixed shape for the PyTorch model.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])


# Device used for the model and every batch. PyTorch raises an error when a
# model and its inputs live on different devices, so all of them are moved to
# this one.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Constructors of the supported torchvision backbones. To try another
# architecture, add it here and reference it from CONFIGURATIONS.
model_builders = {
    "resnet18": models.resnet18,
    "resnet34": models.resnet34,
    "resnet50": models.resnet50,
}

# Train one model per configuration, each inside its own MLflow run.
for config in CONFIGURATIONS:
    model_name = config["model_name"]
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    epochs = config["epochs"]

    # Loaders are rebuilt per configuration because the batch size differs.
    train_dataset = FashionDataset(train_df, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # The held-out test split gives an accuracy that is independent of the
    # training accuracy computed during the epoch.
    test_dataset = FashionDataset(test_df, transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Pick the pretrained backbone requested by the configuration.
    if model_name not in model_builders:
        raise ValueError(f"Unsupported model: {model_name}")
    model = model_builders[model_name](weights='IMAGENET1K_V1')

    # Replace the final layer so that it outputs one score per class.
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, len(class_names))

    # Move the model to the GPU (or the CPU when no GPU is available).
    model = model.to(device)

    # Loss and optimizer setup.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    with mlflow.start_run():
        # Log the hyperparameters of this run.
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("epochs", epochs)

        # Store the label mapping as an artifact; it is needed again to turn
        # predicted indices back into class names.
        labels_path = "class_to_idx.json"
        with open(labels_path, "w") as f:
            json.dump(class_to_idx, f)
        mlflow.log_artifact(labels_path, artifact_path="model_artifacts")

        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            correct = 0
            total = 0

            for batch_idx, (images, labels) in enumerate(train_loader):
                # The dataset yields class names; turn them into an index
                # tensor created directly on the target device.
                labels = torch.tensor(
                    [class_to_idx[str(label)] for label in labels],
                    dtype=torch.long,
                    device=device,
                )
                # Inputs must live on the same device as the model.
                images = images.to(device)

                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Weight the batch loss by the batch size so that the epoch
                # loss is a per-sample average even with a smaller last batch.
                running_loss += loss.item() * images.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                # Training runs for a long time, so report progress every
                # 100 batches.
                if batch_idx % 100 == 0:
                    print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

            epoch_loss = running_loss / len(train_loader.dataset)
            epoch_acc = correct / total

            # Log the training metrics of this epoch for later comparison.
            mlflow.log_metric("loss", epoch_loss, step=epoch)
            mlflow.log_metric("accuracy", epoch_acc, step=epoch)

            print(f"Model: {model_name}, Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}")

            # Evaluate on the held-out test split after every epoch.
            model.eval()
            test_correct = 0
            test_total = 0
            with torch.no_grad():
                for images, labels in test_loader:
                    labels = torch.tensor(
                        [class_to_idx[str(label)] for label in labels],
                        dtype=torch.long,
                        device=device,
                    )
                    images = images.to(device)

                    outputs = model(images)
                    _, predicted = outputs.max(1)
                    test_total += labels.size(0)
                    test_correct += predicted.eq(labels).sum().item()

            test_acc = test_correct / test_total
            mlflow.log_metric("test_accuracy", test_acc, step=epoch)

            print(f"Test Accuracy after Epoch {epoch+1}: {test_acc:.4f}")

        # Log the trained model once per run, after the last epoch. Logging
        # inside the epoch loop would serialise and upload the full network
        # after every epoch.
        mlflow.pytorch.log_model(model, "model")

print("Training complete. All models logged to MLflow.")


## Evaluate the best model

To do so, please go to the MlFlow UI and select the model by the best performance metric. 

For this training setup it was the ResNet-50 model, with an overwhelming accuracy. We registered the model under the name

`fashion-data-model`

## Download the model



In [6]:
import mlflow
import logging
from mlflow.artifacts import download_artifacts

# Name of the registered model to fetch and the local folder to save it to.
model_name = "fashion-data-model"
output_dir = "./model_artifacts"

# Without configuration the root logger only shows WARNING and above, which
# would hide every progress message of this cell. A named logger at INFO
# level makes them visible without turning on INFO output of other libraries.
logging.basicConfig(format="%(levelname)s: %(message)s")
logger = logging.getLogger("model_download")
logger.setLevel(logging.INFO)

# Connect to the MLflow server the model is downloaded from.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Fetch the versions of the registered model. Selecting by tag or by an
# explicit version may be preferable in bigger projects; for this project the
# most recently created version is sufficient.
client = mlflow.MlflowClient()
logger.info("Fetching latest registered version of model '%s'...", model_name)
registered_model = client.search_model_versions(f"name='{model_name}'")

if not registered_model:
    raise ValueError(f"No model versions found for '{model_name}'.")

# Pick the latest version based on its creation timestamp.
latest_version = max(registered_model, key=lambda x: x.creation_timestamp)
run_id = latest_version.run_id
logger.info("Latest model version: %s, Run ID: %s", latest_version.version, run_id)

# Download the model artifact of the run that produced this version.
logger.info("Downloading model artifact...")
model_uri = f"runs:/{run_id}/model"
model_path = download_artifacts(artifact_uri=model_uri, dst_path=output_dir)
logger.info("Model downloaded to: %s", model_path)

# Download the label mapping that was logged next to the model.
logger.info("Downloading labels artifact...")
labels_uri = f"runs:/{run_id}/model_artifacts"
labels_path = download_artifacts(artifact_uri=labels_uri, dst_path=output_dir)
logger.info("Labels downloaded to: %s", labels_path)

logger.info("All artifacts downloaded successfully.")


## Build the dockerfile

*Ports and namings can be edited to match your local system*

in shell: 
`docker build --no-cache -t model-api-gpu .`

## Run the docker container

in shell: 
`docker run --gpus all -p 5001:5001 model-api-gpu`



## Prepare the images for the final test after the container running

For the final evaluation we copy all images listed in the batch test split, which was set aside during data preparation.
They need to be copied into a separate folder here so that the batch setup can be tested.

*alternatively the batch run can be packed also in a docker file and run from there, where we copy the images too*

In [None]:
import os
import shutil
import pandas as pd

# Locations of the batch split file and the image folder.
data_folder = "data/fashion-dataset/"
batch_file = os.path.join(data_folder, "batch_test_split.csv")
images_folder = os.path.join(data_folder, "images/")

df = pd.read_csv(batch_file)

# Target folder for the images of the batch run; created when missing.
output_folder = 'batch_images'
os.makedirs(output_folder, exist_ok=True)

# Copy every image listed in the split file, reporting each result.
for image_path in df['image_path']:
    if not os.path.exists(image_path):
        print(f'Image not found: {image_path}')
        continue

    # Keep the file name and place the copy in the batch folder.
    new_path = os.path.join(output_folder, os.path.basename(image_path))
    shutil.copy(image_path, new_path)
    print(f'Copied {image_path} to {new_path}')


## Run the script and code

in shell:
`py batch_processing.py`

This processes the results locally and, at the same time, tests the API running in Docker. As mentioned, the whole script and its files could alternatively be packed into a Docker container and run there as a cron job.

## Evaluation of the final run

*If you used a different setup and did not run the processing locally once for testing, please adjust the paths and copy the files so that they match the ones used here.*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Inputs of the final evaluation. A batch run must have finished, and its
# output must be in place, before this cell is executed.

# Original labelled styles file; it provides the ground-truth labels.
GROUND_TRUTH_CSV = "data/fashion-dataset/styles.csv"
# Predictions file written by the batch run.
PREDICTIONS_CSV = "output/predictions.csv"
# Label column to evaluate; it should match the column used for data
# preparation and model training.
TARGET_COLUMN = "masterCategory"

def create_evaluation_visualizations(y_true, y_pred, output_dir="output"):
    """
    Create and save the evaluation charts for a set of predictions.

    One figure with three charts is written to
    ``<output_dir>/evaluation_visualization.png``: a confusion matrix, the
    class distribution of true vs. predicted labels, and the accuracy per
    true class.

    Args:
        y_true: Array-like of true labels (list, NumPy array or pandas Series)
        y_pred: Array-like of predicted labels, same length and order as y_true
        output_dir: Directory to save the plot; it is created when missing

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Local import: the cell defining this function does not import os itself.
    import os

    true_values = list(y_true)
    pred_values = list(y_pred)
    if not true_values:
        raise ValueError("Cannot create visualizations: no labels were provided.")
    if len(true_values) != len(pred_values):
        raise ValueError("y_true and y_pred must have the same length.")

    # Work on two Series sharing a fresh positional index. This makes the
    # element-wise comparisons below valid for plain lists as well, and
    # independent of whatever index the inputs carried.
    y_true = pd.Series(true_values)
    y_pred = pd.Series(pred_values)

    # The seaborn matplotlib style was not accepted in this environment, so
    # the default style is used.
    plt.style.use('default')

    # Create the figure with multiple subplots.
    fig = plt.figure(figsize=(15, 10))
    gs = fig.add_gridspec(2, 2)

    # Confusion matrix heatmap. The label order is the sorted union of both
    # label sets, which is also what confusion_matrix uses by default; passing
    # it explicitly allows the axes to show class names instead of positions.
    ax1 = fig.add_subplot(gs[0, 0])
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                xticklabels=labels, yticklabels=labels)
    ax1.set_title('Confusion Matrix')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('True')

    # Class distribution comparison; a class missing on one side counts as 0.
    ax2 = fig.add_subplot(gs[0, 1])
    df_comparison = pd.DataFrame({
        'True': y_true.value_counts(),
        'Predicted': y_pred.value_counts()
    }).fillna(0)
    df_comparison.plot(kind='bar', ax=ax2)
    ax2.set_title('Class Distribution: True vs Predicted', pad=15)
    ax2.set_xlabel('Classes')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=45)

    # Accuracy by class: the share of correct predictions among the samples
    # of each true class.
    ax3 = fig.add_subplot(gs[1, :])
    is_correct = y_true == y_pred
    accuracy_df = is_correct.groupby(y_true).mean().to_frame('Accuracy')
    accuracy_df.sort_values('Accuracy', ascending=True).plot(
        kind='barh', ax=ax3)
    ax3.set_title('Accuracy by Class')
    ax3.set_xlabel('Accuracy')

    # Adjust the layout and save; the target folder is created when needed.
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "evaluation_visualization.png"),
                dpi=300, bbox_inches='tight')
    plt.close()

def evaluate_predictions(ground_truth_csv, predictions_csv, target_column):
    """
    Compare the predictions of a batch run with the pre-labelled ground truth.

    Only images that occur in the predictions file are evaluated, so the
    metrics describe exactly the images that were processed. The accuracy and
    the classification report are printed, the charts are created, and the
    merged rows with a ``correct`` flag are written to
    ``output/evaluation_results.csv``.

    Args:
        ground_truth_csv: Path to the labelled styles CSV; needs an ``id``
            column and the target column
        predictions_csv: Path to the batch output CSV; needs the columns
            ``filename`` and ``prediction``
        target_column: Ground-truth column the predictions are compared with

    Returns:
        None. Problems are reported on standard output instead of being
        raised.
    """

    try:
        # Load ground truth and predictions.
        ground_truth = pd.read_csv(ground_truth_csv, sep=',', on_bad_lines='skip')
        predictions = pd.read_csv(predictions_csv)

        # The predictions file should have these columns; checked to be safe.
        if 'filename' not in predictions or 'prediction' not in predictions:
            raise ValueError("Predictions file must contain 'filename' and 'prediction' columns.")

        # Extract the IDs from the prediction filenames.
        predictions['id'] = predictions['filename'].str.replace(".jpg", "", regex=False).astype(int)

        # The inner merge keeps exactly the IDs present in both files, so no
        # separate pre-filtering of the ground truth is needed.
        merged_df = pd.merge(ground_truth, predictions, on="id", how="inner")

        # Report predictions without a ground-truth row. This should not
        # happen, but it is better to be told about it.
        unmatched_count = len(predictions) - len(merged_df)
        if unmatched_count > 0:
            print(f"Warning: {unmatched_count} predictions have no matching ground truth.")

        # Rows lacking a label or a prediction cannot be scored, and missing
        # values mixed with strings make the metric functions fail.
        scorable = merged_df[target_column].notna() & merged_df['prediction'].notna()
        if not scorable.all():
            print(f"Warning: {int((~scorable).sum())} rows without a label or prediction were skipped.")
            merged_df = merged_df[scorable].copy()

        # Without any matched row there is nothing to compute; say so clearly
        # instead of failing inside the metric functions.
        if merged_df.empty:
            print("Error: No predictions could be matched to ground truth; nothing to evaluate.")
            return

        # Extract the true labels and the predictions.
        y_true = merged_df[target_column]
        y_pred = merged_df['prediction']

        # Calculate the metrics.
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred, output_dict=False)

        # Create the visualizations.
        create_evaluation_visualizations(y_true, y_pred)

        # Print the evaluation metrics.
        print(f"Accuracy: {accuracy:.2%}")
        print("\nClassification Report:\n", report)

        # Save the detailed evaluation results. The output location is fixed
        # here; a shared folder constant at the top would be a future cleanup.
        merged_df['correct'] = merged_df[target_column] == merged_df['prediction']
        output_file = "output/evaluation_results.csv"
        merged_df.to_csv(output_file, index=False)
        print(f"Detailed evaluation results saved to: {output_file}")
        print("Visualization plots saved to: output/evaluation_visualization.png")

    except FileNotFoundError as e:
        print(f"Error: Could not find input file - {e}")
    except pd.errors.EmptyDataError:
        print("Error: One of the input files is empty")
    except Exception as e:
        # Deliberate catch-all: in the notebook a short message is preferred
        # over a traceback.
        print(f"Error occurred during evaluation: {str(e)}")

# Run the evaluation with the inputs configured at the top of the cell.
if __name__ == "__main__":
    evaluate_predictions(GROUND_TRUTH_CSV, PREDICTIONS_CSV, TARGET_COLUMN)

## Deployment of the batch

**2 ways here:**

1. copy the file on a server of choice and setup the crontab

2. create a Docker container and setup all there. Let a process copy the files into the folder for processing on the container
<br>
<br>

> Suggested crontab entry

`5 0 * * * /usr/bin/python3 /opt/scripts/batch_processing.py`

*Don't forget to grant the correct file access permissions to the user running it.*