# Import Required Packages

In [1]:
# file management
import os
import shutil
import tarfile

# Pre-Processing packages
import re
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS

# Plotting packages
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# SKlearn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Tensorflow and Transformers
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Download the Stanford sentiment analysis dataset

In [2]:
# Get the current working directory
current_folder = os.getcwd()

dataset = tf.keras.utils.get_file(
    fname ="aclImdb.tar.gz",
    origin ="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    cache_dir=  current_folder,
    extract = True)

dataset_path = os.path.dirname(dataset)
print("Dataset downloaded at: ", dataset_path)
print()
# Check the dataset
os.listdir(dataset_path)
dataset_dir = os.path.join(dataset_path, 'aclImdb')
train_dir = os.path.join(dataset_dir,'train')
test_dir = os.path.join(dataset_dir,'test')

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Dataset downloaded at:  /content/datasets



# Load the dataset

In [3]:
def load_dataset(directory):
    data = {"sentence": [], "sentiment": []}
    for file_name in os.listdir(directory):
        print(file_name)
        if file_name == 'pos':
            positive_dir = os.path.join(directory, file_name)
            for text_file in os.listdir(positive_dir):
                text = os.path.join(positive_dir, text_file)
                with open(text, "r", encoding="utf-8") as f:
                    data["sentence"].append(f.read())
                    data["sentiment"].append(1)
        elif file_name == 'neg':
            negative_dir = os.path.join(directory, file_name)
            for text_file in os.listdir(negative_dir):
                text = os.path.join(negative_dir, text_file)
                with open(text, "r", encoding="utf-8") as f:
                    data["sentence"].append(f.read())
                    data["sentiment"].append(0)

    return pd.DataFrame.from_dict(data)

# load the data
train_df = load_dataset(train_dir)
test_df = load_dataset(test_dir)

pos
unsup
neg
urls_pos.txt
urls_neg.txt
urls_unsup.txt
unsupBow.feat
labeledBow.feat
pos
neg
urls_pos.txt
urls_neg.txt
labeledBow.feat


In [4]:
# visualize the data distribution

sentiment_counts = train_df['sentiment'].value_counts()

fig =px.bar(x= {0:'Negative',1:'Positive'},
            y= sentiment_counts.values,
            color=sentiment_counts.index,
            color_discrete_sequence =  px.colors.qualitative.Dark24,
            title='<b>Sentiments Counts')

fig.update_layout(title='Sentiments Counts',
                  xaxis_title='Sentiment',
                  yaxis_title='Counts',
                  template='plotly_dark')

# Show the bar chart
#fig.show()
#pyo.plot(fig, filename = 'Sentiments Counts.html', auto_open = True)


# Data Pre-Processing

## Step-1 Text Cleanup

In [5]:
def text_cleaning(text):
    soup = BeautifulSoup(text, "html.parser")
    text = re.sub(r'\[[^]]*\]', '', soup.get_text())
    pattern = r"[^a-zA-Z0-9\s,']"
    text = re.sub(pattern, '', text)
    return text

# Function to generate word cloud
def generate_wordcloud(text,Title):
	all_text = " ".join(text)
	wordcloud = WordCloud(width=800,
						height=400,
						stopwords=set(STOPWORDS),
						background_color='black').generate(all_text)
	plt.figure(figsize=(10, 5))
	plt.imshow(wordcloud, interpolation='bilinear')
	plt.axis("off")
	plt.title(Title)
	plt.show()


# Apply Cleaning
train_df['Cleaned_sentence'] = train_df['sentence'].apply(text_cleaning).tolist()
# Test dataset
test_df['Cleaned_sentence'] = test_df['sentence'].apply(text_cleaning)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [6]:
# Positive Words
positive = train_df[train_df['sentiment']==1]['Cleaned_sentence'].tolist()
#generate_wordcloud(positive,'Positive Review')

In [7]:
# Negative Words
negative = train_df[train_df['sentiment']==0]['Cleaned_sentence'].tolist()
#generate_wordcloud(negative,'Negative Review')

In [8]:
# Test, Train, Validation Data

test_reviews = test_df['Cleaned_sentence']
test_targets = test_df['sentiment']

train_reviews = train_df['Cleaned_sentence']
train_target = train_df['sentiment']

x_val, x_test, y_val, y_test = train_test_split(test_reviews, test_targets, test_size=0.5, stratify = test_targets)

## Step-2: BERT Tokenizer and Encoding

In [9]:
#Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
max_len= 128
# Tokenize and encode the sentences
X_train_encoded = tokenizer.batch_encode_plus(test_reviews.tolist(), padding=True, truncation=True, max_length = max_len, return_tensors='tf')
X_val_encoded = tokenizer.batch_encode_plus(x_val.tolist(), padding=True, truncation=True, max_length = max_len, return_tensors='tf')
X_test_encoded = tokenizer.batch_encode_plus(x_test.tolist(), padding=True, truncation=True, max_length = max_len, return_tensors='tf')

## Step-3 Check the data

In [11]:
# Raw training data
print('Training Comments -->>',train_reviews[0])

Training Comments -->> This movie is actually FUNNY If you'd like to rest your brain for an hour so then go ahead and watch it It's called blonde and blonder, so don't expect profound and meaningful jokes What this movie and enjoy all the stereotypes we have about two blondes It's just a funny movie to watch on a date or with a company of friends especially if you're not too sober Lol  Pamela and Denise are still pretty hot chicks It's a mistake to judge this movie as a piece of art C'mon, this movie is about BLONDES It's supposed to be light, funny and superficial One more thing, I do not think that girls will appreciate and like this movie but guy definitely will


In [12]:
# Encoded Training data
print('\nInput Ids -->>\n',X_train_encoded['input_ids'][0])


Input Ids -->>
 tf.Tensor(
[  101  1045  2064  1005  1056 21090  2007  1037  3025  7615  2008  4439
  8220  2003  2062  2084  1037  2210  1056 28394  1010  2021  2028  2158
  1005  1055 24265  2003  2178  1005  1055 20380  1010  1045  6814  1999
  2026  2338  2023  2143 21645  2006  5760 11084  1010  2053  2235  8658
  1999  2993  2072  2064  1005  1056  2393  2021  4299  1996  2466  2001
  1037  2210  2625  7511  2445  1996  3815  1997  3772  5848  1999  2009
  1010  2021  2011  1996  2203  1996  5436  3849  2062  2066  1037 18876
  2005  1996  2839 10266  4312  7628 19097  1005  8214 26400  2003  1037
 14013  2135  2058  1996  2327  1998  2058  1996  2940  3883  2295 26400
  8440  1005  1056  2018  6706  2147  1999  2086  1010  2009  1005  1055
 10599  1996  2197  2051  2016  4716  4507   102], shape=(128,), dtype=int32)


In [13]:
# Decoded Data
print('\nDecoded Ids -->>\n',tokenizer.decode(X_train_encoded['input_ids'][0]))


Decoded Ids -->>
 [CLS] i can't disagree with a previous comment that driving lessons is more than a little twee, but one man's indictment is another's endorsement, i suppose in my book this film succeeds on pure charm, no small feat in itselfi can't help but wish the story was a little less conventional given the amount of acting talent in it, but by the end the plot seems more like a backdrop for the character interactions anyway julie walters'dame evie is a gloriously over the top and over the hill actress though evie hasn't had steady work in years, it's unclear the last time she visited reality [SEP]


In [14]:
# Attention Mask
print('\nAttention Mask -->>\n',X_train_encoded['attention_mask'][0])


Attention Mask -->>
 tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], shape=(128,), dtype=int32)


In [15]:
# Labels
print('\nLabels -->>',train_target[0])


Labels -->> 1


# Loading and training the BERT base model

In [16]:
# Intialize the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Compile the model with an appropriate optimizer, loss function, and metrics
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Train the model with validation
history = model.fit(
	[X_train_encoded['input_ids'], X_train_encoded['token_type_ids'], X_train_encoded['attention_mask']],
	train_target, validation_data=(
	[X_val_encoded['input_ids'], X_val_encoded['token_type_ids'], X_val_encoded['attention_mask']],y_val),
	batch_size=32,
	epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Model Evaluation

In [18]:
#Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate([X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']], y_test )
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

Test loss: 0.024041440337896347, Test accuracy: 0.9948800206184387


In [19]:
pred = model.predict(
	[X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']])

# pred is of type TFSequenceClassifierOutput
logits = pred.logits

# Use argmax along the appropriate axis to get the predicted labels
pred_labels = tf.argmax(logits, axis=1)

# Convert the predicted labels to a NumPy array
pred_labels = pred_labels.numpy()

label = {
	1: 'positive',
	0: 'Negative'
}

# Map the predicted labels to their corresponding strings using the label dictionary
pred_labels = [label[i] for i in pred_labels]
Actual = [label[i] for i in y_test]

print('Predicted Label :', pred_labels[:10])
print('Actual Label :', Actual[:10])

Predicted Label : ['Negative', 'Negative', 'positive', 'Negative', 'Negative', 'Negative', 'Negative', 'positive', 'positive', 'Negative']
Actual Label : ['Negative', 'Negative', 'positive', 'Negative', 'Negative', 'Negative', 'Negative', 'positive', 'positive', 'Negative']


In [20]:
print("Classification Report: \n", classification_report(Actual, pred_labels))

Classification Report: 
               precision    recall  f1-score   support

    Negative       1.00      0.99      0.99      6250
    positive       0.99      1.00      0.99      6250

    accuracy                           0.99     12500
   macro avg       0.99      0.99      0.99     12500
weighted avg       0.99      0.99      0.99     12500



# Save Model

In [28]:
path = current_folder

# Save tokenizer
tokenizer.save_pretrained(os.path.join(dataset_dir,'Tokenizer'))

# Save model
model.save_pretrained(os.path.join(dataset_dir,'Model'))

# Deploy the Model

In [23]:
# Load tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(os.path.join(dataset_dir,'/Tokenizer'))

# Load model
bert_model = TFBertForSequenceClassification.from_pretrained(os.path.join(dataset_dir,'/Model'))

def Get_sentiment(Review, Tokenizer=bert_tokenizer, Model=bert_model):
	# Convert Review to a list if it's not already a list
	if not isinstance(Review, list):
		Review = [Review]

	Input_ids, Token_type_ids, Attention_mask = Tokenizer.batch_encode_plus(Review,
																			padding=True,
																			truncation=True,
																			max_length=128,
																			return_tensors='tf').values()
	prediction = Model.predict([Input_ids, Token_type_ids, Attention_mask])

	# Use argmax along the appropriate axis to get the predicted labels
	pred_labels = tf.argmax(prediction.logits, axis=1)

	# Convert the TensorFlow tensor to a NumPy array and then to a list to get the predicted sentiment labels
	pred_labels = [label[i] for i in pred_labels.numpy().tolist()]
	return pred_labels

Some layers from the model checkpoint at /Model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /Model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


# Sample Prediction

In [26]:
Review ='''I am happy I have tried out the bert model'''
Get_sentiment(Review)




['positive']

In [29]:
dataset_dir


'/content/datasets/aclImdb'