In [2]:
!pip install gensim==3.8.3 --quiet
!pip install pydot --quiet
!pip install sentencepiece --quiet
!pip install transformers --quiet

[K     |████████████████████████████████| 24.2 MB 683 kB/s 
[K     |████████████████████████████████| 1.2 MB 12.4 MB/s 
[K     |████████████████████████████████| 4.4 MB 13.2 MB/s 
[K     |████████████████████████████████| 101 kB 10.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 60.5 MB/s 
[K     |████████████████████████████████| 596 kB 62.5 MB/s 
[?25h

In [3]:
from google.cloud import storage
import google.oauth2.credentials
import json
import seaborn as sns

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences
import torch

import sklearn as sk
import nltk
from nltk.corpus import reuters
from nltk.data import find

import matplotlib.pyplot as plt

import re

#This continues to work with gensim 3.8.3.  It doesn't yet work with 4.x.  
#Make sure your pip install command specifies gensim==3.8.3
import gensim

from transformers import BertTokenizer, TFBertModel, XLNetTokenizer, TFXLNetForSequenceClassification, TFBertForSequenceClassification, TFRobertaForSequenceClassification, RobertaTokenizer


from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split



## Data loading

In [4]:
# Added this so I can just put the data files in my google drive and access them from there, if you have a preferred way of storing the data feel free to change
# You can download the data here: 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import pandas as pd
train_bilal = pd.read_csv("/content/drive/My Drive/Colab Notebooks/W266 Project/data/Paper_Data/train.csv", encoding='latin-1')
test_bilal = pd.read_csv("/content/drive/My Drive/Colab Notebooks/W266 Project/data/Paper_Data/test.csv", encoding='latin-1')

In [6]:
x_train_bilal = list(train_bilal[['sentence']].sentence)
y_train_bilal = np.asarray(train_bilal[['label']].label)

x_test_bilal = list(test_bilal[['sentence']].sentence)
y_test_bilal = np.asarray(test_bilal[['label']].label)

In [7]:
x_train_bilal, x_valid_bilal, y_train_bilal, y_valid_bilal = train_test_split(x_train_bilal, y_train_bilal, train_size=0.9)

## BERT

In [5]:
model_checkpoint = 'bert-base-uncased'
bert_uctokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertForSequenceClassification.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
max_length = 320
train_encodings_bilal = bert_uctokenizer(x_train_bilal, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

valid_encodings_bilal = bert_uctokenizer(x_valid_bilal, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

test_encodings_bilal = bert_uctokenizer(x_test_bilal, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

## Baseline model

In [6]:
def bilal_bert_model():
    """Create a BERT model using the model and parameters specified in the Bilal paper:
    https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2

        - model: TFBertForSequenceClassification
        - learning rate: 2e-5
        - epsilon: 1e-8
    """
    # Using the TFBertForSequenceClassification as specified in the paper:
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Freeze all layers except the last 4 which are the pooled classification layers:
    untrainable = [w.name for w in bert_model.weights[:-4]]
    trainable = [w.name for w in bert_model.weights[-4:]]

    for w in bert_model.weights:
        if w.name in untrainable:
            w._trainable = False
        elif w.name in trainable:
            w._trainable = True

    # Compile the model:
    bert_model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08),
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
    )

    return bert_model

In [11]:
bilal_bert_model = bilal_bert_model()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
bilal_bert_model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the model using the specifications from the paper: https://link.springer.com/article/10.1007/s10660-022-09560-w/tables/2
# -- epochs = 4
# -- batch_size = 32

history = bilal_bert_model.fit(
    [train_encodings_bilal.input_ids, train_encodings_bilal.token_type_ids, train_encodings_bilal.attention_mask], 
    y_train_bilal,
    validation_data=(
        [valid_encodings_bilal.input_ids, valid_encodings_bilal.token_type_ids, valid_encodings_bilal.attention_mask], 
        y_valid_bilal
        ),
    batch_size=32, 
    epochs=4
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [14]:
score = bilal_bert_model.evaluate([test_encodings_bilal.input_ids, test_encodings_bilal.token_type_ids, test_encodings_bilal.attention_mask], y_test_bilal)

print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 0.5953600406646729
Test accuracy: 0.6980000138282776


In [15]:
predictions = bilal_bert_model.predict([test_encodings_bilal.input_ids, test_encodings_bilal.token_type_ids, test_encodings_bilal.attention_mask])
preds = predictions.to_tuple()[0].argmax(1)
print(classification_report(y_test_bilal, preds))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1000
           1       0.70      0.69      0.69      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000



## Control baseline with larger Yelp data set 

In [7]:
train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_train.csv")
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_test.csv")
valid = pd.read_csv("/content/drive/My Drive/Colab Notebooks/W266 Project/data/yelp_valid.csv")

In [8]:
x_train = list(train[['text']].text)
y_train = np.asarray(train[['label']].label)

x_test = list(test[['text']].text)
y_test = np.asarray(test[['label']].label)

x_valid = list(valid[['text']].text)
y_valid = np.asarray(valid[['label']].label)

In [9]:
max_length = 320
train_encodings = bert_uctokenizer(x_train, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

valid_encodings = bert_uctokenizer(x_valid, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

test_encodings = bert_uctokenizer(x_test, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

In [10]:
bilal_bert_model_expanded = bilal_bert_model()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
history = bilal_bert_model_expanded.fit(
    [train_encodings.input_ids, train_encodings.token_type_ids, train_encodings.attention_mask], 
    y_train,
    validation_data=(
        [valid_encodings.input_ids, valid_encodings.token_type_ids, valid_encodings.attention_mask], 
        y_valid
        ),
    batch_size=16, 
    epochs=4
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [15]:
score = bilal_bert_model_expanded.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask], y_test)

print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 0.4932304918766022
Test accuracy: 0.7673905491828918


In [16]:
predictions = bilal_bert_model_expanded.predict([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask])
preds = predictions.to_tuple()[0].argmax(1)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.78      0.75      0.77      2992
           1       0.75      0.78      0.77      2902

    accuracy                           0.77      5894
   macro avg       0.77      0.77      0.77      5894
weighted avg       0.77      0.77      0.77      5894

