# Train a diffusion model

In [1]:
from transformers import ImageGPTImageProcessor, ImageGPTForImageClassification
from PIL import Image
import requests
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Chosen Huggingface model - ImageGPT for Image Classification (pretrained weights)

Current obstacle - requires large amount of RAM

In [2]:
%%capture

image_processor = ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small")
model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small",num_labels=1)


Some weights of the model checkpoint at openai/imagegpt-small were not used when initializing ImageGPTForImageClassification: ['lm_head.weight']
- This IS expected if you are initializing ImageGPTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ImageGPTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ImageGPTForImageClassification were not initialized from the model checkpoint at openai/imagegpt-small and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## load training and eval dataset (sampled from original csv)

In [3]:
info = pd.read_csv("../input/oxml-carinoma-classification/labels.csv")
training = info.sample(frac=0.05)
eval_set = info.sample(frac=0.2)

## load into dataloaders

In [4]:
training_images = [f"../input/oxml-carinoma-classification/img_{i}.png" for i in training["id"].tolist()]
eval_images = [f"../input/oxml-carinoma-classification/img_{i}.png" for i in eval_set["id"].tolist()]

In [5]:
training_processed = [image_processor(Image.open(f), return_tensors="pt") for f in training_images]
eval_processed = [image_processor(Image.open(f), return_tensors="pt") for f in eval_images]

In [6]:
train_dataloader = DataLoader(training_processed, shuffle=False, batch_size=1)
eval_dataloader = DataLoader(eval_processed, batch_size=1)

In [7]:
training_labels = training["malignant"].tolist()
eval_labels = eval_set["malignant"].tolist()

In [8]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

## Perform fine tuning on new dataset for 10 epochs

In [9]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [10]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

ImageGPTForImageClassification(
  (transformer): ImageGPTModel(
    (wte): Embedding(513, 512)
    (wpe): Embedding(1024, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x ImageGPTBlock(
        (ln_1): ImageGPTLayerNorm()
        (attn): ImageGPTAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): ImageGPTLayerNorm()
        (mlp): ImageGPTMLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): QuickGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): ImageGPTLayerNorm()
  )
  (score): Linear(in_features=512, out_features=1, bias=False)
)

In [11]:
import torch

In [12]:
total_input_ids = torch.cat([i["input_ids"] for i in training_processed])
total_labels = torch.Tensor(training_labels).float()
total_input_dict = {'input_ids':total_input_ids}

In [13]:
from tqdm.auto import tqdm
model.train()
for epoch in range(num_epochs):
    outputs = model(**total_input_dict,labels=total_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    print(f"Epoch:{epoch} Loss:{loss}")

Epoch:0 Loss:0.9652937054634094
Epoch:1 Loss:0.9772682189941406
Epoch:2 Loss:0.89573734998703
Epoch:3 Loss:0.8637697100639343
Epoch:4 Loss:0.894367516040802
Epoch:5 Loss:0.8953459858894348
Epoch:6 Loss:0.7891213297843933
Epoch:7 Loss:0.8180252909660339
Epoch:8 Loss:0.813873291015625
Epoch:9 Loss:0.7688423991203308


# obtain updated embeddings after fine tunings, fit and predict RF classifier

In [14]:
features = []
for i in training_processed:
    features.append(model(**i,output_hidden_states=True).hidden_states[0].detach().numpy())
features = [i.flatten("C") for i in features]

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

In [16]:
rf_model.fit(features,training_labels)

In [17]:
features = []
for i in eval_processed:
    features.append(model(**i,output_hidden_states=True).hidden_states[0].detach().numpy())
features = [i.flatten("F") for i in features]

In [18]:
outs = rf_model.predict(features)

## Obtain F1 score

In [19]:
from sklearn.metrics import f1_score

In [20]:
f1_score(eval_labels,outs,average="weighted")

0.5333333333333333