In [1]:
!pip install gensim==3.8.3 --quiet
!pip install pydot --quiet
!pip install sentencepiece --quiet
!pip install transformers --quiet

[K     |████████████████████████████████| 24.2 MB 1.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.0 MB/s 
[K     |████████████████████████████████| 4.4 MB 14.7 MB/s 
[K     |████████████████████████████████| 596 kB 76.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 85.9 MB/s 
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
[?25h

In [15]:
from google.cloud import storage
import google.oauth2.credentials
import json
import seaborn as sns

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
import torch

import sklearn as sk
import nltk
from nltk.corpus import reuters
from nltk.data import find

import matplotlib.pyplot as plt

import re

#This continues to work with gensim 3.8.3.  It doesn't yet work with 4.x.  
#Make sure your pip install command specifies gensim==3.8.3
import gensim

from transformers import BertTokenizer, TFBertModel, XLNetTokenizer, TFXLNetForSequenceClassification, TFBertForSequenceClassification, TFRobertaForSequenceClassification, RobertaTokenizer


from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split



## Data loading

In [3]:
# Mount Google Drive so the data files can be read directly from it.
# If you prefer to store the data somewhere else, change this cell and the
# paths used by the loading cell below.
# You can download the data here: <TODO: add the download link; none was given>
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Folder on the mounted Drive that holds the three Yelp CSV splits.
# Change it here (once) if the data lives somewhere else.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/W266 Project/data"

# Load the train / test / validation splits; the next cell reads their
# `text` and `label` columns.
train = pd.read_csv(f"{DATA_DIR}/yelp_train.csv")
test = pd.read_csv(f"{DATA_DIR}/yelp_test.csv")
valid = pd.read_csv(f"{DATA_DIR}/yelp_valid.csv")

In [7]:
def _texts_and_labels(frame):
    """Return (`text` column as a Python list, `label` column as a NumPy array)."""
    return list(frame["text"]), np.asarray(frame["label"])


# Same extraction for each split: texts feed the tokenizer, labels feed fit/evaluate.
x_train, y_train = _texts_and_labels(train)
x_test, y_test = _texts_and_labels(test)
x_valid, y_valid = _texts_and_labels(valid)

## Roberta

In [19]:
# Load the pretrained tokenizer for the "roberta-base" checkpoint; its files are
# downloaded the first time this runs (see the progress output below).
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [24]:
def create_roberta_model(num_labels=2, learning_rate=2e-5, epsilon=1e-08):
    """Build and compile a roBERTa sequence classifier with a frozen encoder.

    Loads the pretrained ``roberta-base`` checkpoint into
    ``TFRobertaForSequenceClassification`` and freezes the encoder, so only the
    newly initialised classification head is updated during training.

    roBERTa paper: https://arxiv.org/pdf/1907.11692.pdf
    NOTE(review): the default learning rate / epsilon were originally described
    as "specified in the roBERTa paper" - confirm against the paper's
    fine-tuning setup before citing them as such.

    Args:
        num_labels: number of classes produced by the classification head.
        learning_rate: learning rate passed to the Adam optimizer.
        epsilon: epsilon passed to the Adam optimizer.

    Returns:
        The compiled ``TFRobertaForSequenceClassification`` model.
    """
    roberta_model = TFRobertaForSequenceClassification.from_pretrained(
        'roberta-base', num_labels=num_labels
    )

    # Freeze everything except the classification head (the last four weights
    # of the model). This goes through the public Keras `trainable` switch on
    # the encoder sub-layer instead of writing the private `_trainable`
    # attribute of each variable, so Keras' trainable/non-trainable bookkeeping
    # stays consistent and the optimizer only tracks the head's weights.
    roberta_model.roberta.trainable = False

    # Compile the model. The loss takes raw logits and integer class labels.
    roberta_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")],
    )

    return roberta_model

In [77]:
def tokenize(length, data, tokenizer):
  """Encode `data` with `tokenizer`, padding/truncating every example to `length`.

  Args:
    length: value forwarded as `max_length` (also the padded width, since
      `padding='max_length'` is requested).
    data: the text(s) to encode; handed to the tokenizer unchanged.
    tokenizer: tokenizer callable, invoked as `tokenizer(data, **options)`.

  Returns:
    Whatever the tokenizer returns for these options; TensorFlow tensors are
    requested through `return_tensors='tf'`.
  """
  options = dict(
    max_length=length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
  )
  return tokenizer(data, **options)

def run_roberta(length, tokenizer, batch_size=32, epochs=4,
                save_dir="/content/drive/My Drive/models/Project W266"):
  """Tokenize, train, evaluate and save one roBERTa model for a given max_length.

  Reads the notebook-level splits `x_train`/`y_train`, `x_valid`/`y_valid` and
  `x_test`/`y_test`, and builds a fresh model with `create_roberta_model()`.

  Args:
    length: max_length used when tokenizing all three splits.
    tokenizer: tokenizer passed through to `tokenize`.
    batch_size: batch size used by `model.fit`.
    epochs: number of training epochs.
    save_dir: directory under which the model is saved as
      `roberta_model_<length>`.

  Returns:
    A dict with the keys `max_length`, `history` (the per-epoch metrics from
    `model.fit`), `test_loss` and `test_accuracy`, so runs with different
    max_lengths can be compared without parsing the printed output.
  """

  print(f'Running roBERTa for encoding max_length: {length}')
  print('Tokenizing data...')
  train_encodings_roberta = tokenize(length, x_train, tokenizer)
  valid_encodings_roberta = tokenize(length, x_valid, tokenizer)
  test_encodings_roberta = tokenize(length, x_test, tokenizer)

  print(f'Created encoding for training data with shape {train_encodings_roberta.input_ids.shape}')
  print(f'Created encoding for validation data with shape {valid_encodings_roberta.input_ids.shape}')
  print(f'Created encoding for test data with shape {test_encodings_roberta.input_ids.shape}')

  # The model is fed [input_ids, attention_mask]; build each input list once.
  train_inputs = [train_encodings_roberta.input_ids, train_encodings_roberta.attention_mask]
  valid_inputs = [valid_encodings_roberta.input_ids, valid_encodings_roberta.attention_mask]
  test_inputs = [test_encodings_roberta.input_ids, test_encodings_roberta.attention_mask]

  model = create_roberta_model()
  print('Training model...')
  history = model.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_valid),
    batch_size=batch_size,
    epochs=epochs
  )

  print('Evaluating model...')
  score = model.evaluate(test_inputs, y_test)

  print("Test loss:", score[0])
  print("Test accuracy:", score[1])

  # The first element of the prediction output holds the logits; argmax over
  # axis 1 turns them into predicted class ids.
  predictions = model.predict(test_inputs)
  preds = predictions.to_tuple()[0].argmax(1)
  print('\n Classification Report:\n')
  print(classification_report(y_test, preds))

  # Arguments that merely restated the defaults of `model.save` were dropped.
  model.save(
    f"{save_dir}/roberta_model_{length}",
    overwrite=True,
    include_optimizer=True
  )

  return {
    "max_length": length,
    "history": history.history,
    "test_loss": score[0],
    "test_accuracy": score[1],
  }


In [78]:
# Sweep over tokenizer max_length values: one roBERTa model is trained,
# evaluated and saved per setting.
max_lengths = [64, 128, 256, 320, 384, 448, 512]
for length in max_lengths:
  run_roberta(length=length, tokenizer=roberta_tokenizer)

Running roBERTa for encoding max_length: 64
Tokenizing data...
Created encoding for training data with shape (47146, 64)
Created encoding for validation data with shape (5893, 64)
Created encoding for test data with shape (5894, 64)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.579146146774292
Test accuracy: 0.7127587199211121

 Classification Report:

              precision    recall  f1-score   support

           0       0.73      0.70      0.71      2992
           1       0.70      0.73      0.71      2902

    accuracy                           0.71      5894
   macro avg       0.71      0.71      0.71      5894
weighted avg       0.71      0.71      0.71      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_64/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_64/assets


Running roBERTa for encoding max_length: 128
Tokenizing data...
Created encoding for training data with shape (47146, 128)
Created encoding for validation data with shape (5893, 128)
Created encoding for test data with shape (5894, 128)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.5463767051696777
Test accuracy: 0.7465218901634216

 Classification Report:

              precision    recall  f1-score   support

           0       0.73      0.80      0.76      2992
           1       0.77      0.70      0.73      2902

    accuracy                           0.75      5894
   macro avg       0.75      0.75      0.75      5894
weighted avg       0.75      0.75      0.75      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_128/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_128/assets


Running roBERTa for encoding max_length: 256
Tokenizing data...
Created encoding for training data with shape (47146, 256)
Created encoding for validation data with shape (5893, 256)
Created encoding for test data with shape (5894, 256)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.5286672711372375
Test accuracy: 0.7677298784255981

 Classification Report:

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      2992
           1       0.76      0.78      0.77      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_256/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_256/assets


Running roBERTa for encoding max_length: 320
Tokenizing data...
Created encoding for training data with shape (47146, 320)
Created encoding for validation data with shape (5893, 320)
Created encoding for test data with shape (5894, 320)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.5211576223373413
Test accuracy: 0.7702748775482178

 Classification Report:

              precision    recall  f1-score   support

           0       0.76      0.79      0.78      2992
           1       0.78      0.75      0.76      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_320/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_320/assets


Running roBERTa for encoding max_length: 384
Tokenizing data...
Created encoding for training data with shape (47146, 384)
Created encoding for validation data with shape (5893, 384)
Created encoding for test data with shape (5894, 384)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.521388590335846
Test accuracy: 0.7714625000953674

 Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.78      0.78      2992
           1       0.77      0.76      0.77      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_384/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_384/assets


Running roBERTa for encoding max_length: 448
Tokenizing data...
Created encoding for training data with shape (47146, 448)
Created encoding for validation data with shape (5893, 448)
Created encoding for test data with shape (5894, 448)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.5192633867263794
Test accuracy: 0.7706142067909241

 Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.79      0.78      2992
           1       0.77      0.75      0.76      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_448/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_448/assets


Running roBERTa for encoding max_length: 512
Tokenizing data...
Created encoding for training data with shape (47146, 512)
Created encoding for validation data with shape (5893, 512)
Created encoding for test data with shape (5894, 512)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...
Epoch 1/4








Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluating model...
Test loss: 0.5182607769966125
Test accuracy: 0.7709535360336304

 Classification Report:

              precision    recall  f1-score   support

           0       0.76      0.81      0.78      2992
           1       0.79      0.73      0.76      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894





INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_512/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/models/Project W266/roberta_model_512/assets
