<a href="https://colab.research.google.com/github/wolffg7/Sentiment_Prediction/blob/main/SentimentProject_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Stage 0: Installing necessary packages 

!pip install bert-for-tf2
!pip install sentencepiece
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30531 sha256=e1b19abd3994c44bce65d066e19f0aea56f66009187e9538fb0d5eb2a7f4c5df
  Stored in directory: /root/.cache/pip/wheels/6f/c7/91/f2b2c2b3cec30578c5de7c27ac996

In [2]:
# Stage 1: Importing relevant libraries

# General purpose libraries
import math 
import random 
import numpy as np
import pandas as pd 
import re
from datasets import load_dataset

# Pre-processing libraries
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Tensorflow libraries
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

# BERT libraries
import bert
from transformers import TFBertModel, BertTokenizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Stage 2: Loading the sst5 dataset 
  # See https://huggingface.co/datasets/SetFit/sst5 for documentation on the dataset

dataset = load_dataset("SetFit/sst5") 

# Viewing the structure of the dataset and practicing indexing into it
print(dataset) 
print(dataset['train'][0]['text']) 

Downloading readme:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading and preparing dataset json/SetFit--sst5 to /root/.cache/huggingface/datasets/SetFit___json/SetFit--sst5-4c07b9d5881ae209/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/SetFit___json/SetFit--sst5-4c07b9d5881ae209/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 8544
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2210
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 1101
    })
})
a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films


In [4]:
# Stage 3: Preparing the train, test, and validation datasets for tokenization

# Separating individual sentences from movie review snippets
all_sentences = [] 

# <= train set fetch =>
for i in range(8543): 
        cur_sentence = dataset['train'][i]['text'] 
        all_sentences.append(cur_sentence) 

# <= test set fetch =>
for i in range(2209): 
        cur_sentence = dataset['test'][i]['text'] 
        all_sentences.append(cur_sentence) 

# <= validation set fetch =>
for i in range(1100): 
        cur_sentence = dataset['validation'][i]['text'] 
        all_sentences.append(cur_sentence) 
print(len(all_sentences))
all_sentences[:5]

# Compiling sentiments for each sentence
all_sentiments = [] 

# <= train set fetch => 
for i in range(8543): 
        cur_sentiment = dataset['train'][i]['label'] 
        all_sentiments.append(cur_sentiment) 

# <= test set fetch =>
for i in range(2209): 
        cur_sentiment = dataset['test'][i]['label'] 
        all_sentiments.append(cur_sentiment) 

# <= validation set fetch =>
for i in range(1100): 
        cur_sentiment = dataset['validation'][i]['label'] 
        all_sentiments.append(cur_sentiment) 
print(len(all_sentiments))
all_sentiments[:5]

11852
11852


[4, 1, 1, 2, 3]

In [6]:
# Stage 4: Removing stop words from the dataset and performing lemmatization

# List of common stop words, curated based on likelihood of contributing to sentiment prediction
english_stop_words_changed = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 
 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 
 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 
 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 
 'most', 'other', 'some', 'such', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'now',]

# Algorithm to remove stop words from input sentences
def remove_stop_words(sentence):
    removed_stop_words = []
    for review in sentence:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words_changed])
        )

    return removed_stop_words

# Algorithm to lemmatize words from input sentences
def apply_lemmatization(input_array):
  lemmatizer = WordNetLemmatizer()
  lemmatized_sentences = []
  for sentence in input_array:
      tokenized_sentence = nltk.word_tokenize(sentence)
      lemmatized_sentence = ' '.join([lemmatizer.lemmatize(word) for word in tokenized_sentence])
      lemmatized_sentences.append(lemmatized_sentence)
  
  return lemmatized_sentences

# Applying stop word removal and lemmatization
sentences_nosw = remove_stop_words(all_sentences) 
sentences_nosw = apply_lemmatization(sentences_nosw)

# Printing modified sentences
# print(sentences_nosw)


In [7]:
 # Stage 5: Applying BERT-based tokenization
  # The input to our tokenization function are the pre-processed sentences stored in the variable 'sentences_nosw'
  # The output from our tokenization are the input ID's and attention mask used by the BERT model

# Initializing the BERT tokenizer
imported_tokenizer = bert.bert_tokenization.FullTokenizer
pre_trained_bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
vocabulary = pre_trained_bert.resolved_object.vocab_file.asset_path.numpy()
lower_case = pre_trained_bert.resolved_object.do_lower_case.numpy()
modified_tokenizer = BertTokenizer(vocabulary, lower_case)

# Applying the tokenizer function to each sentence
def tokenize_sentences(input_sentences):
    return modified_tokenizer.convert_tokens_to_ids(modified_tokenizer.tokenize(input_sentences))

tokenized_sentences = [tokenize_sentences(sentence) for sentence in sentences_nosw]

print(tokenized_sentences[:2])

# Binding sentiments to the tokenized sequences
all_sentiments = np.array(all_sentiments) 

sentences_with_sentiment = [[review, all_sentiments[i]]
                 for i, review in enumerate(tokenized_sentences)] 

print(sentences_with_sentiment[:5]) 

[[18385, 1010, 6057, 2633, 18276, 2128, 1011, 16603, 5053, 6841, 5687, 5469, 2143], [4593, 2128, 27241, 23931, 6276, 1011, 2282, 2723, 2445, 12217, 7815, 1012]]
[[[18385, 1010, 6057, 2633, 18276, 2128, 1011, 16603, 5053, 6841, 5687, 5469, 2143], 4], [[4593, 2128, 27241, 23931, 6276, 1011, 2282, 2723, 2445, 12217, 7815, 1012], 1], [[3653, 23545, 4378, 24185, 1050, 1005, 1056, 4133, 2145, 11507, 10800, 1010, 2174, 14036, 2135, 3591, 1010, 19817, 4140, 7511, 2671, 1011, 4349, 5783, 11829, 1011, 7168, 6071, 28971, 2450, 8301, 8737, 2100, 4253, 1012], 1], [[2972, 3185, 3561, 2139, 3900, 24728, 2617, 1012], 2], [[17453, 14726, 19379, 12758, 2293, 1010, 3638, 1010, 2381, 2162, 2396, 6236, 1012], 3]]


In [8]:
 # Stage 6: Distributing padding to the array of sentences with sentiment labels

# Calculating the longest sentence found in the sst5 database (after pre-processing)
max_sentence_length = max([len(x[0]) for x in sentences_with_sentiment]) 
print(max_sentence_length)  

# Initializing data structures for the padded sequences and attention mask
sentences_with_padding = [] 
attention_masks = [] 

# Applying an appropriate amount of padding respective to each sentence
for sentence in sentences_with_sentiment:
  sentence_with_padding = pad_sequences([sentence[0]], maxlen=max_sentence_length, padding='post')[0] # It may be wortwhile to modify the padding type
  sentiment = sentence[1]
  sentences_with_padding.append([(sentence_with_padding).astype('float32'), sentiment.astype('float32')])  

# Building an attention mask to help the BERT model distinguish between padding and non-padding
for sentence in sentences_with_sentiment:
  attention_mask = ([1] * len(sentence[0])) + ([0] *  (max_sentence_length - len(sentence[0])))  
  attention_masks.append(np.array(attention_mask).astype('float32'))

# Transforming the data type of the padded sentences and attention mask 
sentences_with_padding = np.array(sentences_with_padding)
attention_masks = np.array(attention_mask)

# Viewing the structure of the padded sentences and attention masks
print(sentences_with_padding[:1]) 
print(attention_masks[:1])

62
[[array([18385.,  1010.,  6057.,  2633., 18276.,  2128.,  1011., 16603.,
          5053.,  6841.,  5687.,  5469.,  2143.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
             0.,     0.,     0.,     0.,     0.,     0.], dtype=float32)
  4.0]]
[1]


  sentences_with_padding = np.array(sentences_with_padding)


In [None]:
# Stage 7: Formally defining the BERT model and fitting it to the dataset

# Loading the pre-trained BERT model
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Passing the attention mask and input ID's into the model
sentences_with_padding = Input(shape=(max_sentence_length,), dtype=tf.int32, name="input_ids")
attention_masks = Input(shape=(max_sentence_length,), dtype=tf.int32, name="attention_masks")
output_from_model = bert({'input_ids': sentences_with_padding, 'attention_mask': attention_masks})[1]

# Defining architecture of layers
dense = Dense(64, activation='relu')(output_from_model)
dropout = Dropout(0.2)(dense)
output = Dense(1, activation='sigmoid')(dropout)

# Compiling and summarizing the model
model = Model(inputs=[sentences_with_padding, attention_masks], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Splitting the input into test, train, and validation subsets
train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_sentiments, test_sentiments = train_test_split(
    sentences_with_padding, attention_masks, all_sentiments, test_size=0.15, random_state=45
) 

# Converting the data types of the data subsets into Tensors for input into the model
train_input_ids = tf.convert_to_tensor(train_input_ids)
train_attention_masks = tf.convert_to_tensor(train_attention_masks)
train_sentiments = tf.convert_to_tensor(train_sentiments)
test_input_ids = tf.convert_to_tensor(test_input_ids)
test_attention_masks = tf.convert_to_tensor(test_attention_masks)
test_sentiments = tf.convert_to_tensor(test_sentiments)

# Consolidating train dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, train_attention_masks, train_sentiments))
train_dataset = train_dataset.batch(batch_size=32)

# Fitting the model
model.fit(
    train_input_ids, train_attention_masks, train_sentiments,
    validation_data=(test_input_ids, test_attention_masks, test_sentiments),
    epochs=12, verbose = 1 
)

In [None]:
# Stage 8: Evaluating the model performance

results = model.evaluate(train_dataset)
print('Model loss:', results[0])
print('Model accuracy:', results[1])