In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
path =  "/content/drive/My Drive/"

In [8]:
import pandas as pd
import numpy as np

In [9]:
data = pd.read_csv(path + 'train_data.csv')
data_test = pd.read_csv(path + 'test_data.csv')

In [10]:
data.head()

Unnamed: 0,ID,UTTERANCES,IOB SLOT TAGS
0,0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie
1,1,show credits for the godfather,O O O B_movie I_movie
2,2,who was the main actor in the exorcist,O O O O O O B_movie I_movie
3,3,find the female actress from the movie she's t...,O O O O O O O B_movie I_movie I_movie I_movie
4,4,who played dory on finding nemo,O O B_char O B_movie I_movie


In [11]:
data.columns = ['ID', 'input', 'labels']
data_test.columns = ['ID', 'input']

In [12]:
data.iloc[0]

ID                                                   0
input             who plays luke on star wars new hope
labels    O O B_char O B_movie I_movie I_movie I_movie
Name: 0, dtype: object

In [13]:
import re
import string

#preprocessing
def clean_text(df):
    output_list = []
    for i in range(df.shape[0]):
      text_list = df['input'][i].split(' ')
      for j in range(len(text_list)):
        word = text_list[j]
        if word in string.punctuation and len(word)==1:
          text_list[j] = '[PAD]'

        else:
          word = re.sub('[%s]' % re.escape(string.punctuation), '', word)
          text_list[j] = word
      output_list.append(" ".join(text_list))
    return output_list

In [14]:
data['input'] = clean_text(data)
data_test['input'] = clean_text(data_test)

In [15]:
data.head()
data_test.tail()

Unnamed: 0,ID,input
976,976,trailer for star wars a new hope
977,977,show resident evil movies with trailers
978,978,can i see previews for upcoming warner brother...
979,979,how many woody allen movies are set in new yor...
980,980,how many scorsese films were filmed in france


In [16]:
data['labels'] = data['labels'].str.replace('O¬¥O', 'O') #idk why this got here

labels = [x.split() for x in data['labels'].values.tolist()]
unique_labels = set([item for sublist in labels for item in sublist])

print(unique_labels)
print(len(unique_labels))


{'B_subject', 'I_director', 'I_producer', 'B_mpaa_rating', 'B_language', 'B_director', 'I-movie', 'I_language', 'I_person', 'B_location', 'B_movie', 'I_movie', 'B_char', 'B_release_year', 'I_subject', 'B_genre', 'I_mpaa_rating', 'B_person', 'I_genre', 'B_producer', 'I_release_year', 'I_char', 'B_cast', 'I_cast', 'I_country', 'O', 'B_country'}
27


In [17]:
for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
label_to_index = {k: v for v, k in enumerate(unique_labels)}
index_to_label = {v: k for v, k in enumerate(unique_labels)}

data_train, data_val, data_test = np.split(data.sample(frac=1, random_state=69), [int(.8 * len(data)), int(.9 * len(data))])

# data_train = data   # to train on entire dataset

In [18]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 6.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 62.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [19]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
import torch

# no predictions for broken word pieces from bert tokenization
def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(label_to_index[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(-100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['input'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [21]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [24]:
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.optim import SGD
from tqdm import tqdm

BATCH_SIZE = 2
LEARNING_RATE = 10e-3
EPOCHS = 10


def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
        

In [25]:
model = BertModel()
train_loop(model, data_train, data_val)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  0.560 | Accuracy:  0.838 | Val_Loss:  0.238 | Accuracy:  0.914


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 2 | Loss:  0.193 | Accuracy:  0.943 | Val_Loss:  0.144 | Accuracy:  0.951


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 3 | Loss:  0.104 | Accuracy:  0.969 | Val_Loss:  0.136 | Accuracy:  0.950


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 4 | Loss:  0.066 | Accuracy:  0.981 | Val_Loss:  0.118 | Accuracy:  0.961


100%|██████████| 925/925 [02:57<00:00,  5.22it/s]


Epochs: 5 | Loss:  0.042 | Accuracy:  0.989 | Val_Loss:  0.119 | Accuracy:  0.968


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 6 | Loss:  0.031 | Accuracy:  0.991 | Val_Loss:  0.151 | Accuracy:  0.959


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 7 | Loss:  0.027 | Accuracy:  0.993 | Val_Loss:  0.126 | Accuracy:  0.956


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 8 | Loss:  0.040 | Accuracy:  0.989 | Val_Loss:  0.124 | Accuracy:  0.966


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 9 | Loss:  0.026 | Accuracy:  0.992 | Val_Loss:  0.144 | Accuracy:  0.957


100%|██████████| 925/925 [02:57<00:00,  5.21it/s]


Epochs: 10 | Loss:  0.028 | Accuracy:  0.993 | Val_Loss:  0.146 | Accuracy:  0.965


In [27]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=f592d2c570b6ba95bd4f747f8bd01e470f7996808900551a23dbce124071a05a
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [30]:
from seqeval.metrics import classification_report

# evaluate model 
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0
    all_predictions = []
    all_labels = []
    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              all_predictions.append(predictions)
              all_labels.append(label_clean)
              acc = (predictions == label_clean).float().mean()
              total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')
    # print(classification_report(all_labels, all_predictions))
    return all_predictions, all_labels

In [31]:
val_predictions, val_labels = evaluate(model, data_test)

Test Accuracy:  0.972


In [32]:
word_ids_output = []

def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()
    word_ids_output = word_ids

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(-100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [index_to_label[i] for i in predictions]
    # print(sentence)
    # print(prediction_label)
    return prediction_label

In [37]:
test_text = data_test['input']

In [38]:
def get_prediction_labels(model, all_text):
    predictions = []
    for text in all_text:
      pred = evaluate_one_text(model, text)
      predictions.append(pred)
    return predictions

In [39]:
test_preds = get_prediction_labels(model, test_text)

In [40]:
sub_bert = pd.DataFrame({'ID': data_test['ID'], 'IOB Slot tags': test_preds})
sub_bert['IOB Slot tags'] = sub_bert['IOB Slot tags'].apply(lambda x: " ".join(x))

In [41]:
sub_bert.to_csv(path + "submission_bert7.csv", index=False)