In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from fastai.imports import *

import torch
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from tqdm.notebook import tqdm

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
BASE_PATH = 'COMP5329S1A2Dataset'

In [3]:
def read_csv(path, n_columns=2):
    data = []
    with open(path, 'r') as f:
        for line in f.readlines():
            if not re.match('^\d+\.jpg', line):
                continue
            ImageID = line.split(',')[0]     
            if n_columns ==2:
                Labels = line.split(',')[1] 
                Caption = ','.join(line.split(',')[2:])
                data.append({'ImageID':ImageID, 'Labels':Labels, 'Caption': Caption})
            else:
                Caption = ','.join(line.split(',')[1:])
                data.append({'ImageID':ImageID, 'Labels': '' , 'Caption': Caption})
                
    return pd.DataFrame(data)

In [4]:
df = read_csv(f'{BASE_PATH}/train.csv').iloc[:,[-1, 1]]
df

Unnamed: 0,Caption,Labels
0,"""Woman in swim suit holding parasol on sunny day.""\n",1
1,"""A couple of men riding horses on top of a green field.""\n",1 19
2,"""They are brave for riding in the jungle on those elephants.""\n",1
3,"""a black and silver clock tower at an intersection near a tree""\n",8 3 13
4,"""A train coming to a stop on the tracks out side.""\n",8 3 7
...,...,...
29995,"""A picture of a truck that is in the middle of a road.""\n",8 1 2
29996,"""A plate topped with a pizza being cut with a spoon.""\n",1
29997,"""A man riding a snowboard on top of snow.""\n",1
29998,"""This photo shows people skiing in the mountains.""\n",1


In [5]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
def append_dummies(df):
	labels_df = df['Labels'].str.get_dummies(sep=' ')
	labels_df.columns = ['' + str(col) for col in labels_df.columns]

	return pd.concat([df.iloc[:,:-1], labels_df], axis=1)

In [7]:
train_df = append_dummies(train_df); display(train_df.head(3))
valid_df = append_dummies(valid_df)

Unnamed: 0,Caption,1,10,11,13,14,15,16,17,18,19,2,3,4,5,6,7,8,9
21753,"""A woman and a girl standing in the street talking to another woman through the window.""\n",1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
251,"""The group of people are playing video games together.""\n",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22941,"""A woman talking on a cell phone walking down a street.""\n",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
labels = [label for label in train_df if label not in ['Caption']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print(*labels)

1 10 11 13 14 15 16 17 18 19 2 3 4 5 6 7 8 9


In [9]:
lmodel = "bert-base-cased"

In [10]:
train_ds = Dataset.from_pandas(train_df).remove_columns('__index_level_0__')
valid_ds = Dataset.from_pandas(valid_df).remove_columns('__index_level_0__')
#eval_ds = Dataset.from_pandas(eval_df)#.remove_columns('__index_level_0__')

In [11]:
dds = DatasetDict({"train":train_ds, "test": valid_ds})
dds

DatasetDict({
    train: Dataset({
        features: ['Caption', '1', '10', '11', '13', '14', '15', '16', '17', '18', '19', '2', '3', '4', '5', '6', '7', '8', '9'],
        num_rows: 24000
    })
    test: Dataset({
        features: ['Caption', '1', '10', '11', '13', '14', '15', '16', '17', '18', '19', '2', '3', '4', '5', '6', '7', '8', '9'],
        num_rows: 6000
    })
})

In [12]:
tokenizer = AutoTokenizer.from_pretrained(lmodel)

def preprocess_data(examples):
  text = examples["Caption"]
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  labels_matrix = np.zeros((len(text), len(labels)))
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [13]:
encoded_dataset = dds.map(preprocess_data, batched=True, remove_columns=train_ds.column_names)

Map: 100%|██████████| 24000/24000 [00:04<00:00, 5357.87 examples/s]
Map: 100%|██████████| 6000/6000 [00:01<00:00, 5885.93 examples/s]


In [14]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [15]:
tokenizer.decode(example['input_ids'])

'[CLS] " A woman and a girl standing in the street talking to another woman through the window. " [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [16]:
encoded_dataset.set_format("torch")

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    lmodel, 
    problem_type="multi_label_classification", 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
batch_size = 128
metric_name = "f1"

In [19]:
args = TrainingArguments(
    f"{lmodel}",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

In [20]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [21]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7012, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.2679,  0.0922,  0.5501, -0.2036, -0.0258, -0.2577,  0.0274, -0.2879,
         -0.4027,  0.5006,  0.6700, -0.0777, -0.1441, -0.5612, -0.1269,  0.6982,
          0.1578, -0.1067]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [23]:
trainer.train()



KeyboardInterrupt: 

In [None]:
trainer.evaluate()

In [None]:
eval_df = read_csv(f'{BASE_PATH}/test.csv', 1).iloc[:,[0,-1]]

In [None]:
eval_df.head(4)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

class EvalDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row.Caption
        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        return encoding

eval_dataset = EvalDataset(eval_df, tokenizer)


batch_size = 264
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

progress_bar = tqdm(total=len(eval_dataloader), desc="Inference Progress")

all_probs = []
model.to(device)
model.eval()

with torch.no_grad():
    for batch in eval_dataloader:
        inputs = {key: value.squeeze().to(device) for key, value in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits.squeeze()  
        if len(logits.shape) == 0:
            batch_probs = [logits.item()]
        else:
            batch_probs = torch.sigmoid(logits).tolist()

        all_probs.extend(batch_probs)
        progress_bar.update(1)

        del inputs, outputs, logits

In [None]:
probs_df = pd.DataFrame(all_probs, columns = labels)
probs_df.head(3)

In [None]:
def create_labels_df(df, threshold=0.5):
    df = df.copy()
    labels = []
    for i in range(len(df)):
        label_list = [col for col in df.columns[1:] if df.iloc[i][col] > threshold]
        labels.append(" ".join(label_list))
    df["Labels"] = labels
    return df[["ImageID", "Labels"]]

In [None]:
final_preds = pd.concat([eval_df.iloc[:,:-1], probs_df], axis=1)

In [None]:
final_preds.head(5)