# Setup

In [1]:
# mount Google Drive at /content/gdrive so files stored there can be reached
# through the regular filesystem (the google.colab module is used for this)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# switch the working directory to the project folder on the mounted drive;
# later cells open the pickle files and ./models via relative paths
cd "/content/gdrive/My Drive/Github/SubjectIndexing"

/content/gdrive/My Drive/Github/SubjectIndexing


In [3]:
# import libraries
import time
import pickle
import numpy as np
import pandas as pd

!pip install transformers
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 8.7 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 8.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 63.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 62.1 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
  

In [4]:
# set parameters
# BATCH_SIZE is the batch_size handed to every DataLoader built further down
BATCH_SIZE = 2

In [5]:
# save pickle files
def save_pickle(stuff, fileName):
    """Pickle `stuff` into the file at `fileName`.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialized with `pickle.HIGHEST_PROTOCOL`.
    """
    with open(fileName, mode='wb') as sink:
        pickle.dump(stuff, sink, protocol=pickle.HIGHEST_PROTOCOL)

# load pickle files
def load_pickle(fileName):
    """Read the file at `fileName` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(fileName, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [6]:
#-----------------------------------------------------------------
#  Class GutenbergDataset
#-----------------------------------------------------------------

class GutenbergDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per sample.

    `encodings` is a mapping from field name to an indexable collection of
    per-sample values; `labels` is an indexable collection whose length
    defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # the dataset size is driven by the labels, not by the encodings
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample `idx`, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # the label is always converted to a 64-bit integer tensor
        sample['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.int64)
        return sample

#-----------------------------------------------------------------
#  End of Class GutenbergDataset
#-----------------------------------------------------------------

In [7]:
#-----------------------------------------------------------------
#  Class LongformerClassification
#-----------------------------------------------------------------

class LongformerClassification:
    """Fine-tunes and evaluates a Longformer sequence-classification model.

    Attributes:
        device: CUDA device when `torch.cuda.is_available()`, otherwise CPU.
        tokenizer: tokenizer loaded via `LongformerTokenizer.from_pretrained`.
        model: `LongformerForSequenceClassification` instance moved to `device`.
    """

    def __init__(self, tokenizer='allenai/longformer-base-4096', model='allenai/longformer-base-4096', num_labels=19):
        """Load the tokenizer and the classification model.

        Args:
            tokenizer: identifier passed to `LongformerTokenizer.from_pretrained`.
            model: identifier passed to
                `LongformerForSequenceClassification.from_pretrained`.
            num_labels: forwarded to `from_pretrained` as `num_labels`.
        """
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.tokenizer = LongformerTokenizer.from_pretrained(tokenizer)
        self.model = LongformerForSequenceClassification.from_pretrained(model, num_labels=num_labels, output_hidden_states=False).to(self.device)

#-----------------------------------------------------------------

    def train(self, train_loader, val_loader, max_epoch=3):
        """Fine-tune the model, saving a checkpoint and validating after each epoch.

        Args:
            train_loader: iterable of batches; each batch is a dict holding
                'input_ids', 'attention_mask' and 'labels' tensors.
            val_loader: iterable of batches with the same layout, evaluated
                without gradient tracking at the end of every epoch.
            max_epoch: number of passes over `train_loader`.

        Prints per-epoch loss (mean over batches), accuracy (fraction of
        correctly classified samples) and runtime.
        """
        optim = AdamW(self.model.parameters(), lr=5e-5)
        start = time.time()
        mid_prev = start
        for epoch in range(max_epoch):

            print("--------------------")
            print("epoch " + str(epoch))
            train_loss = 0.0
            train_correct = 0
            train_batches = 0
            train_samples = 0

            # train set
            self.model.train()
            for batch in train_loader:
                optim.zero_grad()
                outputs, labels = self._forward(batch)
                loss = outputs[0]
                loss.backward()
                optim.step()

                # record train loss and acc; accuracy is tracked per sample so
                # that it stays in [0, 1] regardless of the batch size
                train_loss += loss.item()
                train_correct += (outputs.logits.argmax(-1) == labels).sum().item()
                train_batches += 1
                train_samples += labels.numel()

            print(
                "Train loss:", round(self._ratio(train_loss, train_batches), 4), "   ",
                "Train acc:", round(self._ratio(train_correct, train_samples), 4)
            )

            # save model; the timestamp is taken once so the printed id always
            # matches the directory that was written
            stamp = str(round(time.time()))
            modelName = "./models/longformer-class-2048-" + stamp
            self.model.save_pretrained(modelName)
            print("Model " + stamp + " Saved!")

            # validation set
            val_loss, val_acc = self._evaluate(val_loader)

            # record epoch runtime
            mid_curr = time.time()
            print(
                "Val loss:", round(val_loss, 4), "   ",
                "Val acc:", round(val_acc, 4)
            )
            self._print_time("Runtime:", mid_prev, mid_curr)
            mid_prev = mid_curr

        print("--------------------")
        end = time.time()
        self._print_time("Total Runtime", start, end)

#-----------------------------------------------------------------

    def predict(self, test_loader):
        """Evaluate the model on `test_loader` and print loss and accuracy.

        Args:
            test_loader: iterable of batches; each batch is a dict holding
                'input_ids', 'attention_mask' and 'labels' tensors.

        Returns:
            Tuple `(loss, acc)` of Python floats: the loss averaged over
            batches and the fraction of correctly classified samples.
            Both are 0.0 when the loader yields no batches.
        """
        test_loss, test_acc = self._evaluate(test_loader)

        print(
            "Test loss:", round(test_loss, 4), "|",
            "Test acc:", round(test_acc, 4),
        )
        return test_loss, test_acc

#-----------------------------------------------------------------

    def _forward(self, batch):
        """Move one batch to `self.device`, run the model, return (outputs, labels)."""
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        labels = batch['labels'].to(self.device)
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs, labels

#-----------------------------------------------------------------

    def _evaluate(self, loader):
        """Score `loader` in eval mode without gradients; return (mean batch loss, accuracy)."""
        total_loss = 0.0
        correct = 0
        n_batches = 0
        n_samples = 0

        self.model.eval()
        with torch.no_grad():
            for batch in loader:
                outputs, labels = self._forward(batch)
                # .item() keeps the running sum a plain float instead of a tensor
                total_loss += outputs[0].item()
                correct += (outputs.logits.argmax(-1) == labels).sum().item()
                n_batches += 1
                n_samples += labels.numel()

        return self._ratio(total_loss, n_batches), self._ratio(correct, n_samples)

#-----------------------------------------------------------------

    @staticmethod
    def _ratio(total, count):
        """Return total / count as a float, or 0.0 when count is zero (empty loader)."""
        return float(total) / count if count else 0.0

#-----------------------------------------------------------------

    def _print_time(self, tag, start, end):
        """Print the span between two `time.time()` stamps as hours, minutes and seconds."""
        print(tag, round((end-start)//3600), "hr", round(((end-start)%3600)//60), "min",  round((end-start)%60), "sec")     

#-----------------------------------------------------------------
#  End of Class LongformerClassification
#-----------------------------------------------------------------

In [8]:
# import data
#train_set = pd.read_json('./data/train_set.json')
#val_set = pd.read_json('./data/val_set.json')
#test_set = pd.read_json('./data/test_set.json')

# transform text to encodings
#train_encodings = tokenizer(list(train_set.X), max_length=2048, truncation=True, padding=True)
#val_encodings = tokenizer(list(val_set.X), max_length=2048, truncation=True, padding=True)
#test_encodings = tokenizer(list(test_set.X), max_length=2048, truncation=True, padding=True)

# save encodings and labels
#save_pickle(train_encodings, 'train_encodings.pkl')
#save_pickle(val_encodings, 'val_encodings.pkl')
#save_pickle(test_encodings, 'test_encodings.pkl')
#save_pickle(train_set.y_class, 'train_labels.pkl')
#save_pickle(val_set.y_class, 'val_labels.pkl')
#save_pickle(test_set.y_class, 'test_labels.pkl')

In [9]:
# load the tokenized inputs (X) for each split
train_encodings = load_pickle('train_encodings.pkl')
val_encodings = load_pickle('val_encodings.pkl')
test_encodings = load_pickle('test_encodings.pkl')

# load the class labels (y) for each split
train_labels = load_pickle('train_labels.pkl')
val_labels = load_pickle('val_labels.pkl')
test_labels = load_pickle('test_labels.pkl')

# number the distinct training classes in sorted order, then invert the
# mapping so both directions (index -> class, class -> index) are available
label2class = dict(enumerate(sorted(set(train_labels))))
class2label = {cls: idx for idx, cls in label2class.items()}

In [10]:
# build custom datasets
# class names are translated to the integer ids defined by class2label
train_dataset = GutenbergDataset(train_encodings, [class2label[cls] for cls in train_labels])
val_dataset = GutenbergDataset(val_encodings, [class2label[cls] for cls in val_labels])
test_dataset = GutenbergDataset(test_encodings, [class2label[cls] for cls in test_labels])

# build data loaders
# only the training loader is shuffled; validation and test batches keep the
# dataset order so evaluation runs are repeatable
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# execute main
# the number of output classes is taken from the label index built from the
# training labels rather than repeated here as a hard-coded count
clf = LongformerClassification(tokenizer='allenai/longformer-base-4096', model='allenai/longformer-base-4096', num_labels=len(class2label))
clf.train(train_loader, val_loader, max_epoch=3)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weigh

--------------------
epoch 0


KeyboardInterrupt: ignored

In [None]:
# report loss and accuracy of the classifier on the test loader
clf.predict(test_loader)