<a href="https://colab.research.google.com/github/tomhyhan/noodles/blob/main/pasta_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import userdata
from google.colab import auth

auth.authenticate_user()
token = userdata.get('token')
username = userdata.get('username')
repo_name = "noodles"

In [3]:
import os

!git clone https://github.com/tomhyhan/{repo_name}.git

if os.getcwd() != f"/content/{repo_name}":
    %cd {repo_name}
!git pull
!pwd


fatal: destination path 'noodles' already exists and is not an empty directory.
/content/noodles
Already up to date.
/content/noodles


In [4]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import sys

GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = "noodles"

GOOGLE_DRIVE_PATH = os.path.join("..", "drive", "My Drive", GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)


In [6]:

from model.utils import test_colab
test_colab()


Hello Colab! from Noodles ha


In [7]:
!pip install torchinfo
!pip install imagehash



In [8]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold


from config.config_manager import ConfigManager
from collections import Counter
from model.data_model import PastaData, create_train_transforms, create_test_transforms
from model.train import trainer, create_model
from model.utils import reset_seed, save_model
from model.data import CLASS_ENCODER, create_csv
from model.viz import class_imbalance, draw_loss, draw_train_val_accuracy

  check_for_updates()


In [9]:
config_manager = ConfigManager("./config/config.yml")

SEED = config_manager.config.seed

In [10]:
reset_seed(SEED)

In [11]:
image_path = os.path.join(GOOGLE_DRIVE_PATH, "images")
csv_file_path = os.path.join(GOOGLE_DRIVE_PATH, "pasta_data.csv")

if not os.path.exists(csv_file_path):
    create_csv(image_path, csv_file_path)

In [12]:
data = pd.read_csv(os.path.join(GOOGLE_DRIVE_PATH, "./pasta_data.csv"))
image_paths, labels = data["img_path"], data["label"]

X, test_data, y, test_label = train_test_split(image_paths.values, labels.values, train_size=0.9, random_state=SEED, shuffle=True, stratify=labels)

In [25]:
k_fold = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
models = ['regnet', 'convnext']

# N=100
# perm_indices = np.random.permutation(N)
# X = X[perm_indices]
# y = y[perm_indices]

for model_name in models:
    current_model_accuracies = []
    for k_id, (train_i, val_i) in enumerate(k_fold.split(X,y)):
        print(len(train_i), len(val_i))
        train_set = X[train_i]
        train_label_set = y[train_i]

        val_set = X[val_i]
        val_label_set = y[val_i]

        model_config = config_manager.config[model_name]
        lr = config_manager.config.lr
        num_epochs = config_manager.config.num_epochs
        num_classes = config_manager.config.num_classes

        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        train_transform = create_train_transforms()
        test_transform = create_test_transforms()

        train_data = PastaData(train_set, train_label_set, transform_list=train_transform)
        train_data_accuracy = PastaData(train_set, train_label_set, transform_list=test_transform)
        val_data = PastaData(val_set, val_label_set, transform_list=test_transform)

        train_loader = DataLoader(train_data, batch_size=model_config.batch_size, shuffle=False, drop_last=True)
        train_loader_accuracy = DataLoader(train_data_accuracy, batch_size=model_config.batch_size, shuffle=False, drop_last=True)
        val_loader = DataLoader(val_data, batch_size=model_config.batch_size, shuffle=False, drop_last=True)

        model = create_model(model_name, num_classes)
        model.to(device)

        result_dir = os.path.join(GOOGLE_DRIVE_PATH, model_config.output_dir)
        os.makedirs(result_dir, exist_ok=True)

        out_file = os.path.join(result_dir, f"{model_name}_k_fold_{k_id}.pt")

        resume = False
        if os.path.exists(out_file):
            print(f"=== {out_file} exist! ===")
            resume = True

        #  need to make dataframe from images
        loss_history, train_accuracy_history, \
        val_accuracy_history, best_accuracy, model, optimizer, scaler, end_epoch = trainer(
            model,
            train_loader,
            train_loader_accuracy,
            val_loader,
            num_epochs=num_epochs,
            lr=lr,
            model_config=model_config,
            device=device,
            resume=resume,
            out_file=out_file,
            num_classes=num_classes
        )


        current_model_accuracies.append(best_accuracy)
        print(f"End of Training for {model_name} Model {k_id}-fold")
        print(f"best accuracy: {best_accuracy}")

        save_model(model, optimizer, scaler, end_epoch, out_file, best_accuracy)
        # torch.save(best_params, os.path.join(result_dir, f"{model_name}_k_fold_{k_id}.pt"))
        draw_loss(loss_history)
        draw_train_val_accuracy(train_accuracy_history, val_accuracy_history)

    print("current_model_accuracies", current_model_accuracies)


        # avg_accuracy = torch.mean(fold_accuracies)
        # std_accuracy = torch.std(fold_accuracies)

        # general plan
        # 1. save best accuracies
        # 2. compare with different models
        # 3. train on full dataset with best performing model
        # 4. make inference


5032 1259
=== ../drive/My Drive/noodles/./regnet/regnet_k_fold_0.pt exist! ===
Resume Training from previous check point


  checkpoint = torch.load(out_file, map_location="cpu")


Epoch 16/20:   0%|          | 0/150 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [23]:
from pathlib import Path
import glob

In [24]:
models = ['regnet', 'convnext']
for model_name in models:
    model_config = config_manager.config[model_name]
    result_dir = os.path.join(GOOGLE_DRIVE_PATH, model_config.output_dir)
    output_dir = Path(result_dir)
    checkpt_paths =  glob.glob(os.path.join(output_dir, f"{model_name}_k_fold_*.pt"))
    accuracy_list = []
    epochs = []
    for checkpt_path in checkpt_paths:
        checkpoint = torch.load(checkpt_path, map_location='cpu')
        accuracy_list.append(checkpoint["best_accuracy"])
        epochs.append(checkpoint["epoch"] + 1)
    print("Model: ", model_name)
    print("epochs: ", epochs)
    print("Accuracy list: ",  accuracy_list)
    print("Accuracy Mean: ", np.mean(accuracy_list))
    print("Accuracy std: ", np.std(accuracy_list))

  checkpoint = torch.load(checkpt_path, map_location='cpu')


Model:  regnet
epochs:  [15, 15, 15, 15, 15]
Accuracy list:  [0.8733333333333333, 0.8816666666666667, 0.875, 0.8633333333333333, 0.8708333333333333]
Accuracy Mean:  0.8728333333333333
Accuracy std:  0.00595352369983061
Model:  convnext
epochs:  [15, 15, 15, 15, 15]
Accuracy list:  [0.8621848739495799, 0.8747899159663866, 0.8831932773109243, 0.8680672268907563, 0.873109243697479]
Accuracy Mean:  0.8722689075630253
Accuracy std:  0.0070106398704585324


In [17]:
# final model training
# X, test_data, y, test_label
train_set = X
train_label_set = y

val_set = test_data
val_label_set = test_label
print(len(X), len(test_data))
model_name = "regnet"

model_config = config_manager.config[model_name]
lr = config_manager.config.lr
num_epochs = config_manager.config.num_epochs
num_classes = config_manager.config.num_classes
k_id = 7

device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_transform = create_train_transforms()
test_transform = create_test_transforms()

train_data = PastaData(train_set, train_label_set, transform_list=train_transform)
train_data_accuracy = PastaData(train_set, train_label_set, transform_list=test_transform)
val_data = PastaData(val_set, val_label_set, transform_list=test_transform)

train_loader = DataLoader(train_data, batch_size=model_config.batch_size, shuffle=False, drop_last=True)
train_loader_accuracy = DataLoader(train_data_accuracy, batch_size=model_config.batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val_data, batch_size=model_config.batch_size, shuffle=False, drop_last=True)

model = create_model(model_name, num_classes)
model.to(device)

result_dir = os.path.join(GOOGLE_DRIVE_PATH, model_config.output_dir)
os.makedirs(result_dir, exist_ok=True)

out_file = os.path.join(result_dir, f"{model_name}_k_fold_{k_id}.pt")

resume = False
if os.path.exists(out_file):
    print(f"=== {out_file} exist! ===")
    resume = True

#  need to make dataframe from images
loss_history, train_accuracy_history, \
val_accuracy_history, best_accuracy, model, optimizer, scaler, end_epoch = trainer(
    model,
    train_loader,
    train_loader_accuracy,
    val_loader,
    num_epochs=num_epochs,
    lr=lr,
    model_config=model_config,
    device=device,
    resume=resume,
    out_file=out_file,
    num_classes=num_classes,
    eval_mode=True
)


current_model_accuracies.append(best_accuracy)
print(f"End of Training for {model_name} Model {k_id}-fold")
print(f"best accuracy: {best_accuracy}")

save_model(model, optimizer, scaler, end_epoch, out_file, best_accuracy)
# torch.save(best_params, os.path.join(result_dir, f"{model_name}_k_fold_{k_id}.pt"))
draw_loss(loss_history)
draw_train_val_accuracy(train_accuracy_history, val_accuracy_history)


6291 699


Epoch 1/5:   0%|          | 0/188 [00:00<?, ?it/s]

Learning Rate: [7.630916099932272e-05]
Epoch 1 Loss: 2.3258697010101157 Train Accuracy: 0             Validation Accuracy: 0.825
Global gradient norm: 10.167697049912652


Epoch 2/5:   0%|          | 0/188 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.09 GiB. GPU 0 has a total capacity of 22.17 GiB of which 868.88 MiB is free. Process 1351265 has 21.31 GiB memory in use. Of the allocated memory 9.24 GiB is allocated by PyTorch, and 11.83 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Inference

In [16]:
!git pull
!git config --global user.email "tomhyhan@gmail.com"
!git config --global user.name username
!git add .
!git status
!git commit -m "updated trainer"
!git push https://{token}@github.com/{username}/{repo_name}.git

Already up to date.
On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   config/config.yml[m
	[32mmodified:   model/train.py[m

[main 20fb791] updated trainer
 2 files changed, 7 insertions(+), 6 deletions(-)
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 12 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 597 bytes | 597.00 KiB/s, done.
Total 6 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/tomhyhan/noodles.git
   6b22ad6..20fb791  main -> main
