<a href="https://colab.research.google.com/github/wolffg777/ait_deep_learning/blob/main/SentimentProject_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Stage 0: Installing necessary packages

!pip install bert-for-tf2
!pip install sentencepiece
!pip install transformers
!pip install datasets
!pip install np_utils
!pip install keras



In [11]:
# Stage 1: Importing relevant libraries

# General purpose libraries
import math
import random
import numpy as np
import pandas as pd
import re
from datasets import load_dataset

# Pre-processing libraries
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Tensorflow libraries
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, activations, optimizers, losses
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
# from keras.utils import np_utils
# from keras.utils.np_utils import to_categorical

# BERT libraries
import bert
from transformers import TFBertModel, BertTokenizer, TFBertForSequenceClassification

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Stage 2: Loading the sst5 dataset
  # See https://huggingface.co/datasets/SetFit/sst5 for documentation on the dataset

dataset = load_dataset("SetFit/sst5")

# Viewing the structure of the dataset and practicing indexing into it
# print(dataset)
print(dataset['train'][0]['text'])

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films


In [13]:
# Stage 3: Preparing the train, test, and validation datasets for tokenization

# Separating individual sentences from movie review snippets
# all_sentences = []
train_sentences = []
test_sentences = []

# <= train set fetch =>
for i in range(8543):
        cur_sentence = dataset['train'][i]['text']
        train_sentences.append(cur_sentence)

# <= test set fetch =>
for i in range(2209):
        cur_sentence = dataset['test'][i]['text']
        test_sentences.append(cur_sentence)

# <= validation set fetch =>
# for i in range(1100):
#         cur_sentence = dataset['validation'][i]['text']
#         all_sentences.append(cur_sentence)
# print(len(all_sentences))
# all_sentences[:5]

# Compiling sentiments for each sentence
# all_sentiments = []
train_sentiments = []
test_sentiments = []

# <= train set fetch =>
for i in range(8543):
        cur_sentiment = dataset['train'][i]['label']
        train_sentiments.append(cur_sentiment)

# <= test set fetch =>
for i in range(2209):
        cur_sentiment = dataset['test'][i]['label']
        test_sentiments.append(cur_sentiment)

# <= validation set fetch =>
# for i in range(1100):
#         cur_sentiment = dataset['validation'][i]['label']
#         all_sentiments.append(cur_sentiment)
# print(len(all_sentiments))

print("train set: ", train_sentences[:5])
print("test set: ", test_sentences[:5])

print("test sentiments: ", test_sentiments[:5])
print("train sentiments: ", train_sentiments[:5])

train set:  ['a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films', 'apparently reassembled from the cutting-room floor of any given daytime soap .', "they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science-fiction elements of bug-eyed monsters and futuristic women in skimpy clothes .", 'the entire movie is filled with deja vu moments .', 'this is a visually stunning rumination on love , memory , history and the war between art and commerce .']
test set:  ['no movement , no yuks , not much of anything .', "a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch it up like rancid crème brûlée .", "` how many more voyages can this limping but dearly-loved franchise survive ? '", 'so relentlessly wholesome it made me want to swipe something .', 'gangs of new york is an unapologetic mess , whose only sav

In [14]:
# Stage 4: Removing stop words from the dataset and performing lemmatization

# List of common stop words, curated based on likelihood of contributing to sentiment prediction
english_stop_words_changed = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
 'most', 'other', 'some', 'such', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'now',]

# Algorithm to remove stop words from input sentences
def remove_stop_words(sentence):
    removed_stop_words = []
    for review in sentence:
        removed_stop_words.append(
            ' '.join([word for word in review.split()
                      if word not in english_stop_words_changed])
        )

    return removed_stop_words

# Applying stop word removal
train_sentences_nosw = remove_stop_words(train_sentences)
test_sentences_nosw = remove_stop_words(test_sentences)

print(train_sentences_nosw[:5])

['stirring , funny finally transporting re-imagining beauty beast 1930s horror films', 'apparently reassembled cutting-room floor given daytime soap .', "presume audience wo n't sit still sociology lesson , however entertainingly presented , trot conventional science-fiction elements bug-eyed monsters futuristic women skimpy clothes .", 'entire movie filled deja vu moments .', 'visually stunning rumination love , memory , history war art commerce .']


In [15]:
# Stage 5: Consolidating the inputs and outputs for the two datasets

X_train, Y_train = train_sentences_nosw, train_sentiments
X_test,  Y_test  = test_sentences_nosw, test_sentiments

In [16]:
 # Stage 6: Applying BERT-based tokenization
  # The input to our tokenization function are the pre-processed sentences
  # The output from our tokenization are the input ID's and attention mask used by the BERT model

# Importing the model
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 20 # this should be updated to reflect the dispersion of the dataset

# Setting an arbitrary training sample size
sample = X_train[150]

# Initializing the imported BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# inputs = tokenizer(sample, max_length=MAX_LEN, truncation=True, padding=True)
# print(f'sentence: \'{sample}\'')
# print(f'input ids: {inputs["input_ids"]}')
# print(f'attention mask: {inputs["attention_mask"]}')

# Defining a method to apply tokenization to each sentence in the train and test dataset
def construct_encodings(x, tokenizer, max_len, trucation=True, padding=True):
    return tokenizer(x, max_length=max_len, truncation=trucation, padding=padding)

# Executing the encodings method on the two datasets
encodings_train = construct_encodings(X_train, tokenizer, max_len=MAX_LEN)
encodings_test  = construct_encodings(X_test, tokenizer, max_len=MAX_LEN)

# Previewing the structure of the encodings
# print("train encodings: ", encodings_train["input_ids"])
# print("test encodings: ", encodings_test)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
# Stage 6: Constructing a Tensorflow dataset from the encodings

def construct_tfdataset(encodings, y=None):
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        return tf.data.Dataset.from_tensor_slices(dict(encodings))

# Y_train = np_utils.to_categorical(Y_train, 5)
# Y_test = np_utils.to_categorical(Y_test, 5)

tfdataset_train = construct_tfdataset(encodings_train, Y_train)
tfdataset_test  = construct_tfdataset(encodings_test, Y_test)

In [18]:
# Stage 7: Performing shuffling on the train and test datasets

# Defining the batch size
BATCH_SIZE = 256

tfdataset_train = tfdataset_train.shuffle(len(X_train))

tfdataset_train = tfdataset_train.take(len(X_train))
tfdataset_test = tfdataset_test.take(len(X_test))

tfdataset_train = tfdataset_train.batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

In [19]:
# Stage 8: Formally defining the BERT model and fitting it to the dataset

# Initializing the number of epochs
N_EPOCHS = 2

# Loading the pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)

# https://stackoverflow.com/questions/74173869/bert-transformer-model-gives-an-error-for-multiclass-classification

# Defining the optimizer and loss function
optimizer = optimizers.Adam(learning_rate=3e-5)
loss = losses.SparseCategoricalCrossentropy(from_logits=True)

# Compiling and summarizing the model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Fitting the model
model.fit(tfdataset_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7fa9b0724ca0>

In [20]:
# Stage 9: Evaluating the model performance

results = model.evaluate(tfdataset_train)
print('Model loss:', results[0])
print('Model accuracy:', results[1])

Model loss: 1.1216247081756592
Model accuracy: 0.5259276628494263
