In [1]:
import os
import numpy as np
import pandas as pd
import PIL
import torch
from datasets import load_dataset, Dataset, Image
from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments
import evaluate
from sklearn.utils.class_weight import compute_class_weight
from tqdm.notebook import tqdm
import gc




In [2]:
# Load the training manifest, prefix every file name with the 'train' folder,
# and drop the 'unified_class' column, which is not referenced later in this notebook.
data = (
    pd.read_csv('train.csv')
    .assign(image_name=lambda frame: frame['image_name'].apply(lambda name: os.path.join('train', name)))
    .drop(columns=['unified_class'])
)
data

Unnamed: 0,class_id,image_name
0,5,train\3cf4207b958eade893a2f1618cf062b8.JPG
1,2,train\37698901280c871f426d40afe5c373cd.JPG
2,0,train\20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,train\a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,train\54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28010,5,train\07b420b4fe265b4ed918b46435c025d7.JPG
28011,6,train\2d1c5918357bbdd729bf79085e55d35e.JPG
28012,0,train\1531efa9f8687e390adf780355acd606.JPG
28013,1,train\2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [3]:
# Keep only multi-band images (RGB and similar). A multi-band image is exactly
# one whose array form would be 3-D (height, width, bands); single-band images
# would be 2-D. The band count is read from the image header, so the pixels are
# never decoded, and the context manager closes each file as soon as it is checked.
has_channel_axis = []
for image_path in tqdm(data['image_name']):
    with PIL.Image.open(image_path) as image:
        has_channel_axis.append(len(image.getbands()) > 1)

data = data[has_channel_axis].reset_index(drop=True)
data

  0%|          | 0/28015 [00:00<?, ?it/s]

Unnamed: 0,class_id,image_name
0,5,train\3cf4207b958eade893a2f1618cf062b8.JPG
1,2,train\37698901280c871f426d40afe5c373cd.JPG
2,0,train\20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,train\a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,train\54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28009,5,train\07b420b4fe265b4ed918b46435c025d7.JPG
28010,6,train\2d1c5918357bbdd729bf79085e55d35e.JPG
28011,0,train\1531efa9f8687e390adf780355acd606.JPG
28012,1,train\2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [4]:
# Checkpoint shared by the image processor and the classifier.
# (Alternative tried previously: "google/vit-base-patch16-224".)
model_name = "microsoft/resnet-50"

# One output unit per distinct class id in the training manifest.
num_classes = len(set(data['class_id']))

processor = AutoImageProcessor.from_pretrained(model_name)

# ignore_mismatched_sizes=True lets the checkpoint load even though its
# classification head has a different number of outputs than `num_classes`;
# the mismatching head weights are freshly initialised and must be trained.
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([10, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def transforms(batch):
    """Preprocess the decoded image(s) stored under 'image_name' with `processor`.

    Args:
        batch: Mapping with an 'image_name' entry holding either one decoded
            PIL image (per-example `Dataset.map`) or a list of them.

    Returns:
        The processor output with 'pixel_values' as a torch tensor. Labels are
        not added here; any other columns are left to the caller.
    """
    images = batch['image_name']
    # Force three channels so that images stored as RGBA, CMYK, grayscale, etc.
    # are fed to the processor in the same layout as ordinary RGB files.
    if isinstance(images, (list, tuple)):
        images = [image.convert('RGB') for image in images]
    else:
        images = images.convert('RGB')

    inputs = processor(images, return_tensors='pt')
    # For a single image the processor output has a leading axis of length 1;
    # squeeze(0) removes it so each stored row is one image tensor.
    inputs['pixel_values'] = inputs['pixel_values'].squeeze(0)
    return inputs

In [6]:
# Build the train/eval datasets in one pipeline:
#   1. 'class_id' becomes a 'labels' column encoded as class labels,
#   2. 'image_name' paths are cast to decoded images,
#   3. `map` runs `transforms` eagerly and replaces the images with 'pixel_values'
#      (needs enough memory; `with_transform` is the lazy low-memory alternative),
#   4. rows are returned as torch tensors,
#   5. a stratified 50/50 split with a fixed seed.
# NOTE(review): class_encode_column stringifies the ids first; presumably the
# resulting label order matches the numeric ids only while they are single-digit
# — confirm if more classes are ever added.
dataset = (
    Dataset.from_pandas(data)
    .rename_column('class_id', 'labels')
    .class_encode_column('labels')
    .cast_column('image_name', Image())
    .map(transforms, remove_columns=['image_name'])
    .with_format('torch')
    .train_test_split(test_size=0.5, stratify_by_column='labels', seed=42)
)

# The DataFrame is no longer needed once the dataset exists.
del data
gc.collect()
dataset

Stringifying the column:   0%|          | 0/28014 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/28014 [00:00<?, ? examples/s]

Map:   0%|          | 0/28014 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 14007
    })
    test: Dataset({
        features: ['labels', 'pixel_values'],
        num_rows: 14007
    })
})

In [7]:
# Pull the two splits out of the DatasetDict, then drop the container itself.
train_dataset = dataset['train']
eval_dataset = dataset['test']
del dataset
gc.collect()

# Peek at one training example.
train_dataset[0]

{'labels': tensor(5),
 'pixel_values': tensor([[[ 0.3994,  0.3823,  0.3481,  ...,  0.2796,  0.1939, -0.1657],
          [ 0.3138,  0.2796,  0.3652,  ...,  0.2453,  0.0912, -0.0116],
          [ 0.3994,  0.4166,  0.4508,  ...,  0.3309,  0.0741,  0.0741],
          ...,
          [ 0.5707,  0.5878,  0.6392,  ...,  2.0092,  1.9407,  1.9407],
          [ 0.6563,  0.7419,  0.9132,  ...,  2.0948,  2.0434,  1.8893],
          [ 0.7077,  0.7762,  0.8104,  ...,  2.1119,  2.0605,  1.9407]],
 
         [[ 0.5378,  0.5203,  0.4853,  ...,  0.4153,  0.3277, -0.0399],
          [ 0.4503,  0.4153,  0.5028,  ...,  0.3803,  0.2227,  0.1176],
          [ 0.5378,  0.5553,  0.5903,  ...,  0.4678,  0.2052,  0.2052],
          ...,
          [ 0.7129,  0.7304,  0.7829,  ...,  2.1835,  2.1134,  2.1134],
          [ 0.8004,  0.8880,  1.0630,  ...,  2.2710,  2.2185,  2.0609],
          [ 0.8529,  0.9230,  0.9580,  ...,  2.2885,  2.2360,  2.1134]],
 
         [[ 0.7576,  0.7402,  0.7054,  ...,  0.6356,  0.5485, 

In [8]:
# Training configuration: evaluate, log and checkpoint once per epoch.
# Optional settings left at their defaults: weight_decay (e.g. 0.01) and
# fp16=True for float16 mixed precision.
training_args = TrainingArguments(
    output_dir='cache',
    learning_rate=3e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the macro F1 reported by compute_metrics.
    # Without this the Trainer ranks checkpoints by evaluation loss, which can
    # select a different epoch than the one with the highest F1.
    metric_for_best_model='f1',
    greater_is_better=True,
    save_total_limit=3,
)

In [9]:
# Load the F1 metric from the `evaluate` library; compute_metrics calls it during evaluation.
metric = evaluate.load('f1')

In [10]:
def compute_metrics(output):
    """Score a batch of evaluation results with macro-averaged F1.

    Args:
        output: Two-item iterable that unpacks as (logits, labels).

    Returns:
        The result of `metric.compute` for the arg-max class of each row.
    """
    scores, references = output
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average='macro',
    )

# Balanced per-class weights for the loss, derived from the training labels only.
train_labels = train_dataset['labels']
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels.numpy(),
)
# The tensor is placed on whatever device the model is on at this point.
weight = torch.tensor(balanced_weights, dtype=torch.float, device=model.device)
loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
class CustomTrainer(Trainer):
    """Trainer whose training/eval loss is the class-weighted `loss_fct`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of model inputs; must contain "labels".
            return_outputs: When True, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or a (loss, outputs) tuple if `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # Forward pass. The labels stay in `inputs`, so the model also computes
        # its own loss, which is not used here.
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # `loss_fct` is built at module level with its weights on the device the
        # model had at that moment. A Trainer may move the model afterwards, so
        # align the loss module with the logits before use (no-op if already there).
        loss_fct.to(logits.device)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
# Trainer variant that optimises the class-weighted loss.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Show the per-class loss weights.
weight

In [11]:
# This cell rebinds `trainer`, so it decides which trainer actually runs.
# Make that choice explicit instead of silently replacing the class-weighted
# trainer: False keeps the stock Trainer (the model's own loss), True uses
# CustomTrainer (class-weighted loss).
USE_CLASS_WEIGHTED_LOSS = False

trainer_class = CustomTrainer if USE_CLASS_WEIGHTED_LOSS else Trainer
trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
# Fine-tune the model with the settings in `training_args`.
trainer.train()
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

Epoch,Training Loss,Validation Loss,F1
1,0.9267,0.352678,0.789173
2,0.2396,0.190422,0.898799
3,0.115,0.150967,0.923571
4,0.0631,0.136635,0.93244
5,0.0376,0.137192,0.934048


In [13]:
# Load the submission template and point every file name at the 'test' folder.
data = pd.read_csv('sample_submission.csv')
data['image_name'] = data['image_name'].apply(lambda x: os.path.join('test', x))

# Same multi-band check as for the training images. The band count is read from
# the image header (no pixel decoding) and each file is closed right after.
has_channel_axis = []
for image_path in tqdm(data['image_name']):
    with PIL.Image.open(image_path) as image:
        has_channel_axis.append(len(image.getbands()) > 1)

# Rows removed here never receive a prediction and will be missing from
# submission.csv, so report them instead of dropping them silently.
dropped_count = len(has_channel_axis) - sum(has_channel_axis)
if dropped_count:
    print(f'WARNING: dropping {dropped_count} single-band test image(s); '
          'submission.csv will not contain rows for them')

data = data[has_channel_axis].reset_index(drop=True)
data

  0%|          | 0/12958 [00:00<?, ?it/s]

Unnamed: 0,image_name,predicted_class
0,test\cc27b9b56583a615fb8501e352402eb9.JPG,0
1,test\87872711fe672676fd34a97e997f9c47.JPG,0
2,test\424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,test\c5537eaa60525efd7bad4a5560607e83.JPG,0
4,test\e9f15b67ca49453e281b2b4f245eac13.JPG,0
...,...,...
12953,test\028668e733cd17ec9b9f1c7e2c657b36.JPG,0
12954,test\eb1f1152941fdfdd50ff9954010e622a.JPG,0
12955,test\bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,0
12956,test\2eaf9c794958a93bb9984441fd5d7f61.JPG,0


In [14]:
# Same preprocessing pipeline as for training, minus labels and the split:
# paths -> decoded images -> eager `transforms` via map -> torch output format.
test_dataset = (
    Dataset.from_pandas(data[['image_name']])
    .cast_column('image_name', Image())
    .map(transforms, remove_columns=['image_name'])
    .with_format('torch')
)
gc.collect()
test_dataset

Map:   0%|          | 0/12958 [00:00<?, ? examples/s]

Dataset({
    features: ['pixel_values'],
    num_rows: 12958
})

In [15]:
# Run inference on the test set and take the highest-scoring class per image.
test_logits = trainer.predict(test_dataset).predictions
predictions = test_logits.argmax(-1)
predictions

array([4, 5, 0, ..., 5, 6, 0], dtype=int64)

In [16]:
# Restore bare file names (strip the 'test' folder), attach the predicted class
# ids, and write the submission file without the DataFrame index.
data = data.assign(
    image_name=data['image_name'].apply(os.path.basename),
    predicted_class=predictions,
)
data.to_csv('submission.csv', index=False)
data

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,4
1,87872711fe672676fd34a97e997f9c47.JPG,5
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,1
4,e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,5
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,4
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,6
