<a href="https://colab.research.google.com/github/ttb-folio/Pitchfork-Reviews/blob/main/CurrentPF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

!pip install efficientnet_pytorch > /dev/null
!pip install albumentations > /dev/null
!pip install transformers

!git clone https://github.com/ttb-folio/Pitchfork-Reviews.git

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5116  100  5116    0     0   111k      0 --:--:-- --:--:-- --:--:--  111k
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200515 ...
Uninstalling torch-1.6.0a0+bf2bbd9:
  Successfully uninstalled torch-1.6.0a0+bf2bbd9
Uninstalling torchvision-0.7.0a0+a6073f0:
  Successfully uninstalled torchvision-0.7.0a0+a6073f0
Copying gs://tpu-pytorch/wheels/torch-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
| [1 files][ 91.0 MiB/ 91.0 MiB]                                                
Operation completed over 1 objects/91.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
\ [1 files][119.5 MiB/119.5 MiB]                                      

In [11]:
import numpy as np
import pandas as pd
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
from transformers import AutoTokenizer, set_seed, DistilBertForSequenceClassification, AdamW
from tqdm import tqdm
import os
import warnings

warnings.filterwarnings("ignore")
assert os.environ['COLAB_TPU_ADDR']

In [12]:
content = pd.read_csv('Pitchfork-Reviews/Data/content.csv')
reviews = pd.read_csv('Pitchfork-Reviews/Data/reviews.csv')

In [13]:
#Sort by publish date so train, val, and test can be split in a way that maintains chronological order.
reviewIdSorted = reviews.sort_values(by = 'pub_date').reset_index()['reviewid']

#70/15/15 split for train/val/test
numberIds = len(reviewIdSorted)
valCutOff = numberIds//10*7
testCutOff = valCutOff + numberIds//20*3
trainIds = reviewIdSorted[0:valCutOff]
valIds = reviewIdSorted[valCutOff:testCutOff]
testIds = reviewIdSorted[testCutOff:]

In [14]:
#Join content and reviews dataframe on review id, keeping content (raw text of review) and score 
contentScorePairs = pd.merge(
    content[['reviewid', 'content']],
    reviews[['reviewid', 'score']],
    on = 'reviewid')

#Remove items with no written review
contentScorePairs = contentScorePairs[~contentScorePairs['content'].isna()]

trainContentScorePairs = contentScorePairs[contentScorePairs['reviewid'].isin(trainIds)]
valContentScorePairs = contentScorePairs[contentScorePairs['reviewid'].isin(valIds)]
testContentScorePairs = contentScorePairs[contentScorePairs['reviewid'].isin(testIds)]

In [15]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize(contentScorePair):
    text = list(contentScorePair['content'])
    tokenizedText = tokenizer(text, padding = True, truncation = True)
    return tokenizedText

xTrain = tokenize(trainContentScorePairs)
xVal = tokenize(valContentScorePairs)
xTest = tokenize(testContentScorePairs)

In [16]:
#Convert review score (with range 0.0 - 10.0) into classes.
#I choose to set the thresholds to be defined by the quantiles in the training set instead of even spacing.
#Fewer than 25% of scores are less than 6.3, so it would not be very useful to distguish between below 2.5 and between 2.5 and 5. 
#Distinguishing between 6.3 to 7.2 and 7.2 to 7.9 is much more interesting.
#I only use the training set because we don't know the true distribution of scores to be predicted.

# vLowScoreLimit = trainContentScorePairs.score.quantile(.25) #6.3
# lowScoreLimit = trainContentScorePairs.score.quantile(.5) #7.2
# highScoreLimit = trainContentScorePairs.score.quantile(.75) #7.9
# vHighScoreLimit = trainContentScorePairs.score.quantile(1) #10.0

# def convertScore(score):
#     if score <= vLowScoreLimit:
#         y = 0
#     elif score <= lowScoreLimit:
#         y = 1
#     elif score <= highScoreLimit:
#         y = 2
#     elif score <= vHighScoreLimit:
#         y = 3
#     else:
#         y = 100000
#     return y

def convertScore(score):
    if score <= 6: #trainContentScorePairs.score.quantile(.5):
      y = 0
    else:
      y = 1
    return y

trainContentScorePairs['target'] = trainContentScorePairs['score'].transform(lambda x: convertScore(x))
valContentScorePairs['target'] = valContentScorePairs['score'].transform(lambda x: convertScore(x))
testContentScorePairs['target'] = testContentScorePairs['score'].transform(lambda x: convertScore(x))

tTrain = list(trainContentScorePairs['target'].values)
tVal = list(valContentScorePairs['target'].values)
tTest = list(testContentScorePairs['target'].values)

In [17]:
#Combine x and t values to make Datasets, for easy input into Torch DataLoader

class reviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

trainDataset = reviewDataset(xTrain, tTrain)
valDataset = reviewDataset(xVal, tVal)
testDataset = reviewDataset(xTest, tTest)

In [28]:
from transformers import get_linear_schedule_with_warmup
def fit(model, trainDataset, valDataset, epochs = 1, batchSize = 32, lr = 1e-3, scheduler=None):
  device = xm.xla_device()
  model = model.to(device)
  
  trainSampler = torch.utils.data.distributed.DistributedSampler(
    trainDataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=True)
  trainLoader = DataLoader(trainDataset, 
                           batch_size=batchSize, 
                           sampler=trainSampler, 
                           num_workers=0, 
                           drop_last = True)
  valLoader = DataLoader(valDataset, 
                         batch_size=batchSize, 
                         shuffle = False, 
                         num_workers=0,
                         drop_last=True)
  num_train_steps = int(len(trainDataset) / batchSize /xm.xrt_world_size()*epochs)
  optimizer = AdamW(model.parameters(), lr = lr)
  scheduler = get_linear_schedule_with_warmup(
      optimizer, num_warmup_steps = 0, num_training_steps = num_train_steps
  )
  def train_loop(trainLoader):
    optimizer.zero_grad()
  
    for batch in trainLoader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs[0]
      loss.backward()
      xm.optimizer_step(optimizer, barrier = True)
      if scheduler is not None:
        scheduler.step()
    print(f'Final Training Loss: {loss}')
  def eval_loop(valLoader):
    with torch.no_grad():
      for batch in valLoader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
      print(f'Final Val Loss: {loss}')

  for epoch in range(epochs):
    print(lr)
    model.train()
    para_loader = pl.ParallelLoader(trainLoader,[device])
    train_loop(para_loader.per_device_loader(device))
    del para_loader
    
    model.eval()
    para_loader = pl.ParallelLoader(valLoader, [device])
    eval_loop(para_loader.per_device_loader(device))
    del para_loader
    xm.save(model.state_dict(), "model.pt")


In [29]:
lr = 5e-3
def fit_multiprocessing(rank, flags):
  fit(model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint,num_labels=2), 
      trainDataset = trainDataset, valDataset = valDataset, epochs = 5,  batchSize = 32, lr = lr)

FLAGS = {}
xmp.spawn(fit_multiprocessing, args =(FLAGS,), nprocs=8, start_method='fork')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

0.005
0.005
0.005
0.005
0.005
0.005
0.005
0.005
Final Training Loss: 4.011714458465576
Final Training Loss: 2.386448860168457
Final Training Loss: 5.173232078552246
Final Training Loss: 4.90132474899292
Final Training Loss: 4.715847969055176
Final Training Loss: 4.835206031799316
Final Training Loss: 6.707940578460693
Final Training Loss: 5.3618669509887695
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
Final Val Loss: 1.1402983665466309
0.005
0.005
0.005
0.005
0.005
0.005
0.005
0.005
Final Training Loss: 6.347614288330078
Final Training Loss: 9.70417594909668
Final Training Loss: 8.661746978759766
Final Training Loss: 7.475641250610352
Final Training Loss: 6.312149524688721
Final Training Loss: 6.382071495056152
Final Training Loss: 4.2785234451293945
Final Training Loss: 3.2049314975738525

In [23]:
from transformers import DistilBertConfig

_model = DistilBertForSequenceClassification(
    config = DistilBertConfig(num_labels = 2))

checkpoint = torch.load('model.pt')
_model.load_state_dict(checkpoint)

_model.eval()

valLoader = DataLoader(valDataset, 
                        batch_size=32, 
                        shuffle = False, 
                        num_workers=0,
                        drop_last=True)
with torch.no_grad():
  for batch in valLoader:
    input_ids = batch['input_ids']

    outputs = _model(input_ids)
    break

  print(outputs[0])

tensor([[-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256],
        [-0.7256,  0.7256]])
