In [None]:
# Mount Google Drive at /content/gdrive; the dataset CSVs are read from that path below.
# force_remount=True asks Colab to mount again even if a mount is already present.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [1]:
!pip install -qq transformers

[K     |████████████████████████████████| 4.0 MB 5.6 MB/s 
[K     |████████████████████████████████| 77 kB 3.4 MB/s 
[K     |████████████████████████████████| 596 kB 40.2 MB/s 
[K     |████████████████████████████████| 895 kB 42.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 34.7 MB/s 
[?25h

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import optim, nn
import numpy as np
import transformers
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

In [None]:
# imports the torch_xla package
import torch_xla
import torch_xla.core.xla_model as xm

In [None]:
# Use the XLA device exposed by torch_xla for all tensors and the model.
# CUDA/CPU alternative, kept for reference:
#   torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = xm.xla_device()
# Bare expression so the notebook displays which device was selected.
device

In [None]:
def change_target(target):
    """Map a raw label to a class index: -1 becomes 0, every other value becomes 1."""
    return 0 if target == -1 else 1

In [None]:
# Read the train / validation / test CSVs from the mounted shared drive, in that order.
train, val, test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        '/content/gdrive/Shareddrives/CS 5804: Artificial Intelligence/train.csv',
        '/content/gdrive/Shareddrives/CS 5804: Artificial Intelligence/val.csv',
        '/content/gdrive/Shareddrives/CS 5804: Artificial Intelligence/test.csv',
    )
)

# Recode every split's `target` column through change_target (-1 -> 0, otherwise 1).
train['target'], val['target'], test['target'] = (
    frame['target'].apply(change_target) for frame in (train, val, test)
)

In [None]:
from sklearn.model_selection import train_test_split
# Take the first 250,000 rows of the training frame and hold out 20% of them.
# NOTE(review): this rebinds `train`, `val` and `test`, so the frames loaded from
# val.csv / test.csv in the previous cell are discarded — confirm that is intended.
train, test = train_test_split(train.head(250000), test_size=0.2, random_state=24)
# Split the held-out portion in half: one half for validation, one half for testing.
val, test = train_test_split(test, test_size=0.5, random_state=24)

## Model Selection


In [None]:
# Keep the checkpoint name in its own constant so `model` always refers to a network
# and never to a string.
MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# Pretrained encoder; the Transformer classifier defined later wraps this object.
model = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Dataset Preprocessing

In [None]:
class Sentiment140(Dataset):
  """Map-style dataset that tokenizes one text per index.

  Args:
    texts: indexable sequence of raw texts.
    targets: indexable sequence of integer class labels aligned with ``texts``.
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_length: token length every encoded sequence is padded/truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_length):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_length = max_length

  # __getitem__ and __len__ let the dataset be indexed and measured like a list.
  def __getitem__(self, curr_item):
    """Return the raw text, its encoded tensors and its label for one index.

    Returns:
      dict with keys ``'tweets'`` (str), ``'input_ids'`` and ``'attention_mask'``
      (flattened tensors from the tokenizer) and ``'targets'`` (long tensor).
    """
    text = str(self.texts[curr_item])
    target = self.targets[curr_item]
    encode = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_length,
      padding='max_length',  # replaces the deprecated pad_to_max_length=True
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # return_tensors='pt' yields a leading batch axis; flatten drops it.
    return {
      'tweets': text,
      'input_ids': encode['input_ids'].flatten(),
      'attention_mask': encode['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long),
    }

  def __len__(self):
    """Number of examples in the dataset."""
    return len(self.texts)

In [None]:
def load_data(df, batch_sz, max_length, tokenizer, shuffle=False, num_workers=2):
    """Wrap a DataFrame's ``text`` / ``target`` columns in a batched DataLoader.

    Args:
        df: DataFrame with ``text`` and ``target`` columns.
        batch_sz: number of examples per batch.
        max_length: token length passed through to Sentiment140.
        tokenizer: tokenizer passed through to Sentiment140.
        shuffle: reshuffle the examples every epoch (useful for the training
            split). Defaults to False, which keeps the previous behaviour.
        num_workers: number of DataLoader worker processes. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        A DataLoader over a Sentiment140 dataset built from ``df``.
    """
    sent_data = Sentiment140(
        texts=df.text.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_length=max_length,
    )
    return DataLoader(sent_data, batch_size=batch_sz, shuffle=shuffle, num_workers=num_workers)

In [None]:
# Batch size and fixed token length shared by all three splits.
batch_sz, max_length = 32, 80

# One DataLoader per split, built in train / val / test order.
train_dataloader, val_dataloader, test_dataloader = (
    load_data(split_df, batch_sz, max_length, tokenizer)
    for split_df in (train, val, test)
)

# Pull one training batch; it is printed here and reused by a later shape check.
iter_train = next(iter(train_dataloader))
print(iter_train)

In [None]:
# dropout for regularization and fully-connected layer for output
class Transformer(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        num_classes: number of output logits.
        backbone: encoder module exposing ``config.hidden_size`` and returning an
            object with a ``pooler_output`` attribute. When omitted, the
            notebook-level ``model`` is used, exactly as before.
    """

    def __init__(self, num_classes, backbone=None):
        super(Transformer, self).__init__()
        # Accepting the encoder explicitly avoids relying on the global `model`,
        # which the notebook later rebinds to a Transformer instance (so re-running
        # the construction cell would otherwise wrap the wrong object).
        self.model = model if backbone is None else backbone
        self.dropout = nn.Dropout(p=0.3)

        # project the encoder's hidden size down to num_classes logits
        self.output = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's ``pooler_output``."""
        results = self.model(input_ids=input_ids, attention_mask=attention_mask)
        result = self.dropout(results.pooler_output)
        return self.output(result)


In [None]:
# Build the classifier and move it to the target device in one step.
model = Transformer(num_classes=2).to(device)

# Move the sample batch's tensors onto the same device.
input_ids = iter_train['input_ids'].to(device)
attention_mask = iter_train['attention_mask'].to(device)

# Both tensors must have batch size as their first axis and max length as their second.
assert attention_mask.shape[0] == batch_sz and attention_mask.shape[1] == max_length
assert input_ids.shape[0] == batch_sz and input_ids.shape[1] == max_length
print(len(train_dataloader))

In [None]:
# params to try to replicate BERT paper
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# set to the defaults transformers.AdamW used, so the configuration stays the same.
# NOTE: the name `optim` is kept because other cells refer to it, even though it shadows
# the `optim` module imported from torch at the top of the notebook.
optim = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 10
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
num_steps = len(train_dataloader) * epochs

loss_function = nn.CrossEntropyLoss().to(device)

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps=0, num_training_steps=num_steps)

## Model Training


In [None]:
def training(model, data, optimizer, scheduler, loss_fn=None, run_device=None):
    """Run one training pass over ``data`` and report mean loss and accuracy.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data: iterable of batches; each batch is a dict with ``'targets'``,
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped after every batch.
        loss_fn: loss callable; defaults to the notebook-level ``loss_function``.
        run_device: device the batch tensors are moved to; defaults to the
            notebook-level ``device``.

    Returns:
        ``(mean_batch_loss, accuracy)`` where accuracy is the fraction of
        correctly classified samples (between 0 and 1).
    """
    # Fall back to the notebook-level objects when nothing is passed explicitly.
    loss_fn = loss_function if loss_fn is None else loss_fn
    run_device = device if run_device is None else run_device

    model = model.train()
    correct = 0
    seen = 0
    total_loss = []
    for count, curr in enumerate(data):

        labels = curr['targets'].to(run_device)
        input_ids = curr['input_ids'].to(run_device)
        attention_mask = curr['attention_mask'].to(run_device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)

        # predicted class = index of the largest logit
        _, arg_pred = torch.max(output, dim=1)
        loss = loss_fn(output, labels)
        correct += torch.sum(arg_pred == labels).item()
        # Count samples, not batches: the final batch may be smaller than the rest.
        seen += labels.size(0)
        total_loss.append(loss.item())
        loss.backward()

        # employ gradient clipping to avoid exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Step the optimizer that was passed in rather than a notebook global.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Light progress logging: one line every 1000 batches.
        if count % 1000 == 0:
            print(count)

    mean_loss = np.mean(total_loss) if total_loss else float('nan')
    return mean_loss, correct / max(seen, 1)

In [None]:
def validation(model, data, optimizer, scheduler, loss_fn=None, run_device=None):
    """Evaluate ``model`` on ``data`` without updating any weights.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data: iterable of batches; each batch is a dict with ``'targets'``,
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        optimizer: unused; kept so the signature matches ``training``.
        scheduler: unused; kept so the signature matches ``training``.
        loss_fn: loss callable; defaults to the notebook-level ``loss_function``.
        run_device: device the batch tensors are moved to; defaults to the
            notebook-level ``device``.

    Returns:
        ``(mean_batch_loss, accuracy)`` where accuracy is the fraction of
        correctly classified samples (between 0 and 1).
    """
    # Fall back to the notebook-level objects when nothing is passed explicitly.
    loss_fn = loss_function if loss_fn is None else loss_fn
    run_device = device if run_device is None else run_device

    model = model.eval()
    correct = 0
    seen = 0
    total_loss = []

    with torch.no_grad():

        for curr in data:
            attention_mask = curr['attention_mask'].to(run_device)
            labels = curr['targets'].to(run_device)
            input_ids = curr['input_ids'].to(run_device)
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            # predicted class = index of the largest logit
            _, arg_pred = torch.max(output, dim=1)
            correct += torch.sum(arg_pred == labels).item()
            # Count samples, not batches: the final batch may be smaller than the rest.
            seen += labels.size(0)
            loss = loss_fn(output, labels)
            total_loss.append(loss.item())

    mean_loss = np.mean(total_loss) if total_loss else float('nan')
    return mean_loss, correct / max(seen, 1)

In [None]:
from tqdm import trange

In [None]:
# Best validation accuracy seen so far; decides when a checkpoint is written.
max_acc = 0.0

for epoch in trange(epochs):
    print('Epoch: {},'.format(epoch+1))

    # One pass over the training batches (steps the optimizer and the scheduler).
    training_loss, training_acc = training(model, train_dataloader, optim, scheduler)

    print('Training Loss: {}, Training Accuracy: {}'.format(training_loss, training_acc))

    # Evaluate on the validation batches; no weights are updated here.
    val_loss, val_acc = validation(model, val_dataloader, optim, scheduler)

    print('Validation Loss: {}, Validation Accuracy: {}'.format(val_loss, val_acc), '\n')

    # Save the weights only when validation accuracy beats the best so far.
    # NOTE(review): `model` was moved to an XLA device earlier; torch_xla provides
    # xm.save for checkpointing from such devices — confirm whether it should be used.
    if val_acc > max_acc:
        torch.save(model.state_dict(), 'SentimentAnalysis.bin')
        max_acc = val_acc

In [5]:
# NOTE(review): this cell runs `autograder.py`, and its recorded output grades
# perceptron / regression / digit-classification questions. It looks unrelated to the
# sentiment model in the rest of this notebook — confirm and remove if it is a leftover.
import matplotlib
%matplotlib
!python autograder.py

Using matplotlib backend: agg

Question q1
*** q1) check_perceptron
Sanity checking perceptron...
  expected_prediction = np.asscalar(np.where(np.dot(point, p.get_weights().data.T) >= 0, 1, -1))
Sanity checking perceptron weight updates...
Sanity checking complete. Now training perceptron
<Figure size 640x480 with 1 Axes>
*** PASS: check_perceptron

### Question q1: 8/8 ###

Question q2
*** q2) check_regression
<Figure size 640x480 with 1 Axes>
Your final loss is: 0.001795
*** PASS: check_regression

### Question q2: 8/8 ###

Question q3
*** q3) check_digit_classification
<Figure size 640x480 with 10 Axes>


Caught KeyboardInterrupt: aborting autograder

Finished at 18:01:29

Provisional grades
Question q1: 8/8
Question q2: 8/8
Question q3: 3/9
------------------
Total: 19/25

Your grades are NOT yet registered.  To register your grades, make sure
to follow your instructor's guidelines to receive credit on your project.


[autograder was interrupted before finishing]
