<a href="https://colab.research.google.com/github/ttb-folio/Pitchfork-Reviews/blob/main/SentimentAnalysisClassifierGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install transformers
# !git clone https://github.com/ttb-folio/Pitchfork-Reviews.git

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments, Trainer, AdamW

In [2]:

content_df = pd.read_csv('Data/content.csv', nrows = 10)
review_df = pd.read_csv('Data/reviews.csv', nrows = 10)
reviews = content_df.content.fillna(' ').values
labels = (review_df.score <= 6).astype(int).values

reviews_train, reviews_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=1000)

In [3]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize(x_text):
    text = list(x_text)
    tokenizedText = tokenizer(text, padding = True, truncation = True)
    return tokenizedText

x_train = tokenize(reviews_train)
x_test = tokenize(reviews_test)

In [8]:
x_train2 = tokenizer(list(reviews_train), padding = True, truncation = True)

In [9]:
x_train2 == x_train

True

In [4]:
x_train

{'input_ids': [[101, 1999, 2047, 2259, 1521, 1055, 6388, 3496, 1010, 1996, 3739, 1997, 8032, 18904, 8322, 4845, 3286, 5638, 2006, 1037, 4164, 3021, 2038, 2042, 1037, 8335, 17245, 1997, 3737, 1012, 1999, 2286, 1010, 2016, 7622, 1999, 1037, 2444, 3311, 2007, 1996, 8987, 4543, 1998, 17727, 12298, 17288, 14163, 8865, 2957, 23063, 1012, 2012, 1996, 2297, 9809, 20313, 1010, 2016, 2001, 2112, 1997, 1037, 22446, 7241, 2008, 2435, 1996, 6765, 1997, 2028, 1997, 1996, 2345, 14281, 2011, 1996, 2397, 28036, 2728, 9321, 1012, 2127, 5716, 1996, 2177, 6422, 3924, 1010, 2295, 1010, 4845, 3286, 5638, 2018, 2664, 2000, 3443, 1037, 4316, 2005, 2014, 2219, 9265, 1012, 1996, 8530, 2008, 3248, 2006, 2014, 2316, 19738, 4667, 2834, 2950, 2070, 5220, 11725, 1012, 7101, 4098, 14855, 16020, 1521, 1055, 14855, 23200, 2373, 2038, 2042, 2657, 1999, 1996, 2152, 1011, 11619, 3769, 1997, 1996, 2316, 5841, 1012, 10430, 19977, 4717, 5912, 1521, 1055, 2147, 2038, 4928, 4187, 2000, 5633, 2011, 8694, 1011, 4210, 1998, 2645,

In [4]:
class customDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self,idx):
    if torch.is_tensor(idx):
      idx.tolist()
    item = {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

data_train = customDataset(x_train, y_train)
data_test = customDataset(x_test, y_test)

In [5]:
batch_size = 32
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

args = TrainingArguments(
    "./train_models",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.05,
    logging_dir = './logs',
    logging_steps = batch_size
)

trainer = Trainer(
    model,
    args,
    train_dataset = data_train,
    eval_dataset = data_test,
    tokenizer = tokenizer
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [6]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.3962,0.381325,30.76,119.473
2,0.297,0.374561,30.7606,119.471
3,0.2515,0.389152,30.7653,119.453
4,0.1962,0.448878,30.7604,119.472


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.3962,0.381325,30.76,119.473
2,0.297,0.374561,30.7606,119.471
3,0.2515,0.389152,30.7653,119.453
4,0.1962,0.448878,30.7604,119.472
5,0.1552,0.505378,30.7542,119.496


TrainOutput(global_step=2300, training_loss=0.27035701109015425, metrics={'train_runtime': 2221.6749, 'train_samples_per_second': 1.035, 'total_flos': 1.51148203310592e+16, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1875173376, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 111755264, 'train_mem_gpu_alloc_delta': 816905728, 'train_mem_cpu_peaked_delta': 1867776, 'train_mem_gpu_peaked_delta': 13072364544})

In [7]:
predictions = trainer.predict(data_test)

In [8]:
from sklearn.metrics import confusion_matrix, f1_score

confusion_matrix(predictions[1],y_test)

array([[2979,    0],
       [   0,  696]])

In [None]:
Ketchupredictions[2]

{'test_loss': 0.35213690996170044,
 'test_mem_cpu_alloc_delta': 0,
 'test_mem_cpu_peaked_delta': 0,
 'test_mem_gpu_alloc_delta': 0,
 'test_mem_gpu_peaked_delta': 79995904,
 'test_runtime': 0.2568,
 'test_samples_per_second': 778.836}

In [None]:
predictions[1][20:30]

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0])

In [None]:
trainer.evaluate()

{'epoch': 10.0,
 'eval_loss': 0.35213690996170044,
 'eval_mem_cpu_alloc_delta': 4096,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 79992832,
 'eval_runtime': 0.1574,
 'eval_samples_per_second': 1270.961}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('train_models').to('cpu')
trainer = Trainer(
  model,
  TrainingArguments("poop"),
  train_dataset = data_train,
  eval_dataset = data_test,
  tokenizer = tokenizer
  )
outputs = model(torch.tensor(x_test['input_ids']))

'Baseline Sentiment Analysis.ipynb'   PitchforkReviewAnalysis.ipynb
 CurrentPF.ipynb		      Pitchfork-Reviews
 Data				      README.md
'Data Loading.ipynb'		      Reports
 EDA.ipynb			      runs
 logs				      train_models


In [None]:
from transformers import get_linear_schedule_with_warmup
def fit(model, trainDataset, valDataset, epochs = 1, batchSize = 32, lr = 1e-3, scheduler=None):
  device = xm.xla_device()
  model = model.to(device)
  
  trainSampler = torch.utils.data.distributed.DistributedSampler(
    trainDataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=True)
  valSampler = torch.utils.data.distributed.DistributedSampler(
    valDataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=False)
  trainLoader = DataLoader(trainDataset, 
                           batch_size=batchSize, 
                           sampler=trainSampler, 
                           num_workers=0, 
                           drop_last = True)
  valLoader = DataLoader(valDataset, 
                         batch_size=batchSize, 
                         shuffle = False, 
                         sampler = valSampler,
                         num_workers=0,
                         drop_last=False)
  num_train_steps = int(len(trainDataset) / batchSize /xm.xrt_world_size()*epochs)
  optimizer = AdamW(model.parameters(), lr = lr)
  # scheduler = get_linear_schedule_with_warmup(
  #     optimizer, num_warmup_steps = 0, num_training_steps = num_train_steps
  # )
  # scheduler = torch.optim.lr_scheduler.StepLR(
  #     optimizer, step_size = 2, gamma = 0.1
  # )
  # scheduler = torch.optim.lr_scheduler.CyclicLR(
  #     optimizer,
  #     base_lr = 5e-5, max_lr = lr,
  #     mode = 'triangular2',
  #     step_size_up = int(len(trainDataset)/32/xm.xrt_world_size())*3, #3 being the number of epochs to go up.
  #     cycle_momentum = False
  # )
  def train_loop(trainLoader):
    optimizer.zero_grad()
  
    for batch in trainLoader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs[0]
      loss.backward()
      xm.optimizer_step(optimizer, barrier = True)
      if scheduler is not None:
        scheduler.step()
    print(f'Final Training Loss: {loss}')
  def eval_loop(valLoader):
    with torch.no_grad():
      for batch in valLoader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
      print(f'Final Val Loss: {loss}')

  for epoch in range(epochs):

    model.train()
    para_loader = pl.ParallelLoader(trainLoader,[device])
    train_loop(para_loader.per_device_loader(device))
    # _lr = scheduler.get_last_lr()[0]
    # print(_lr)
    print(f'epoch number: {epoch}')
    del para_loader
    
    model.eval()
    para_loader = pl.ParallelLoader(valLoader, [device])
    eval_loop(para_loader.per_device_loader(device))
    del para_loader
  xm.save(model.state_dict(), "model.pt")


In [None]:
lr = 5e-4
def fit_multiprocessing(rank, flags):
  fit(model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint,num_labels=2), 
      trainDataset = trainDataset, valDataset = valDataset, epochs = 20,  batchSize = 32, lr = lr)

FLAGS = {}
xmp.spawn(fit_multiprocessing, args =(FLAGS,), nprocs=8, start_method='fork')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Final Training Loss: 1.3492283821105957
epoch number: 0
Final Training Loss: 1.2990268468856812
Final Training Loss: 1.7547035217285156
Final Training Loss: 1.068077802658081
Final Training Loss: 1.7174874544143677
epoch number: 0
epoch number: 0
Final Training Loss: 1.5977767705917358
epoch number: 0
Final Training Loss: 1.4827221632003784
Final Training Loss: 1.4326668977737427
epoch number: 0
epoch number: 0
epoch number: 0
epoch number: 0
Final Val Loss: 1.555977463722229
Final Val Loss: 1.5560131072998047
Final Val Loss: 1.440492868423462
Final Val Loss: 1.2094310522079468
Final Val Loss: 2.0180861949920654
Final Val Loss: 1.440464735031128
Final Val Loss: 1.9026119709014893
Final Val Loss: 2.133604049682617
Final Training Loss: 0.7239044308662415
Final Training Loss: 0.7833437919616699
epoch number: 1
Final Training Loss: 0.6569998860359192
Final Training Loss: 0.7507330179214478
epoch number: 1
Final Training Loss: 0.6701981425285339
Final Training Loss: 0.7880423069000244
Final

In [None]:
from transformers import DistilBertConfig

_model = DistilBertForSequenceClassification(
    config = DistilBertConfig(num_labels = 2))

checkpoint = torch.load('model.pt')
_model.load_state_dict(checkpoint)

_model.eval()

valLoader = DataLoader(valDataset, 
                        batch_size=32, 
                        shuffle = False, 
                        num_workers=0,
                        drop_last=True)
with torch.no_grad():
  for batch in valLoader:
    input_ids = batch['input_ids']

    outputs = _model(input_ids)
    break

  print(outputs[0])

tensor([[-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900],
        [-0.3900,  0.3900]])
