In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer,TFBertMainLayer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('dataset/train.tsv', sep = '\t')[:50000]
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    50000 non-null  int64 
 1   SentenceId  50000 non-null  int64 
 2   Phrase      50000 non-null  object
 3   Sentiment   50000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [4]:
df['Phrase'].str.len()

0        188
1         77
2          8
3          1
4          6
        ... 
49995     10
49996     57
49997      7
49998     49
49999     47
Name: Phrase, Length: 50000, dtype: int64

In [5]:
df['Phrase'].str.len().max()

279

In [6]:
df['Phrase'].str.len().mean()

37.036

In [7]:
df['Sentiment'].value_counts()

Sentiment
2    26581
3    10423
1     8214
4     2746
0     2036
Name: count, dtype: int64

The sentiment lables are:<hr> <p>0 - Negative</p>  <p>1 - Somewhat neagtive</p>  <p>2 - neutral</p> <p>3 - somewhat positive</p>  <p>4 - positive</p>

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt: 100%|██████████████████████████████████████████| 213k/213k [00:00<00:00, 8.54MB/s]
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████| 29.0/29.0 [00:00<00:00, 1.86kB/s]
Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████| 570/570 [00:00<00:00, 36.5kB/s]


In [12]:
df['Phrase'][0]

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [13]:
#Max Length
maxLength = 128

In [14]:
token = tokenizer.encode_plus(
    df['Phrase'].iloc[0], 
    max_length=maxLength, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [15]:
type(token)

transformers.tokenization_utils_base.BatchEncoding

In [16]:
token.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [17]:
token['input_ids']

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[  101,   138,  1326,  1104, 13936, 25265, 16913, 15107,  1103,
         8050,  2553,  1115,  1184,  1110,  1363,  1111,  1103, 20398,
         1110,  1145,  1363,  1111,  1103,   176,  9900,   117,  1199,
         1104,  1134,  5411,  1821, 14225,  1133,  3839,  1104,  1134,
         7919,  1106,  1277,  1104,   170,  1642,   119,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [18]:
token['token_type_ids']

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [19]:
token['attention_mask']

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [20]:
X_input_ids = np.zeros((len(df), maxLength))
X_attn_masks = np.zeros((len(df), maxLength))

In [21]:
X_input_ids.shape

(50000, 128)

In [22]:
X_attn_masks.shape

(50000, 128)

In [23]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['Phrase'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [24]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

50000it [00:24, 2064.30it/s]


In [25]:
labels = np.zeros((len(df), 5 ))

In [26]:
labels.shape

(50000, 5)

In [27]:
labels[np.arange(len(df)), df['Sentiment'].values] = 1 # one-hot encoded target tensor

In [28]:
labels

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]])

In [29]:
print(np.arange(len(df)),df['Sentiment'].values)

[    0     1     2 ... 49997 49998 49999] [1 2 2 ... 1 4 4]


In [30]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [31]:
dataset.take(1) # one sample data

<TakeDataset element_spec=(TensorSpec(shape=(128,), dtype=tf.float64, name=None), TensorSpec(shape=(128,), dtype=tf.float64, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [32]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [33]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [34]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(128,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(128,), dtype=tf.float64, name=None)}, TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [35]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [36]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 128), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 128), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

In [37]:
p = 0.8
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

In [38]:
train_size

2500

In [39]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

<h2>Model</h2>

In [40]:
from transformers import TFBertModel

In [41]:
model = TFBertModel.from_pretrained('bert-base-cased')

Downloading model.safetensors: 100%|████████████████████████████████████████████████| 436M/436M [00:05<00:00, 75.7MB/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your t

In [42]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(maxLength,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(maxLength,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(256, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [43]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [44]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [45]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs = 10
)

Epoch 1/10
 159/2500 [>.............................] - ETA: 20:46 - loss: 1.1611 - accuracy: 0.5326


KeyboardInterrupt



In [46]:
sentiment_model.save('sentiment_model')



INFO:tensorflow:Assets written to: sentiment_model\assets


INFO:tensorflow:Assets written to: sentiment_model\assets


In [47]:
#saving model in h5 format
sentiment_model.save('sentiment_model.h5')

<h2> Full Code with Tensor Pipeline</h2>

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertMainLayer, TFBertModel

# Reading in a TSV file containing training data and selecting only the first 50,000 rows
df = pd.read_csv('dataset/train.tsv', sep='\t')[:100000]

# Creating a tokenizer object from the BERT-base-cased model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Setting the maximum sequence length for the tokenized input
maxLength = 128

# Defining a function to generate the training data
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['Phrase'])):
        # Tokenizing the input text using the BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Storing the tokenized input and attention masks in numpy arrays
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids = np.zeros((len(df), maxLength), dtype=np.int32)
X_attn_masks = np.zeros((len(df), maxLength), dtype=np.int32)

# Generating the training data using the generate_training_data function
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

# Convert the labels to one-hot encoded vectors
labels = tf.keras.utils.to_categorical(df['Sentiment'], num_classes=5)

# Creating a TensorFlow dataset object from the input and target data
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

# Defining a function to map the input and target data to the required format for the TensorFlow dataset
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

# Mapping the dataset to the required format using the SentimentDatasetMapFunction
dataset = dataset.map(SentimentDatasetMapFunction)

# Shuffling and batching the dataset for training the model
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)

# Splitting the dataset into training and validation sets
p = 0.8
train_size = int((len(df) // 16) * p)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Creating a BERT model object and defining the input layers
model = TFBertModel.from_pretrained('bert-base-cased')
input_ids = tf.keras.layers.Input(shape=(maxLength,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(maxLength,), name='attention_mask', dtype='int32')

# Feeding the input layers to the BERT model and adding a dense layer and output layer
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
intermediate_layer = tf.keras.layers.Dense(256, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(intermediate_layer)

# Creating a TensorFlow model object with the input and output layers
sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)

optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs = 10
)

  from .autonotebook import tqdm as notebook_tqdm
100000it [00:42, 2329.50it/s]
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10
  27/5000 [..............................] - ETA: 42:50 - loss: 1.3676 - accuracy: 0.4815

<h2>Prediction</h2>

In [46]:
# sentiment_model = tf.keras.models.load_model('sentiment_model')
sentiment_model = tf.keras.models.load_model('sentiment_model.h5', custom_objects={"TFBertMainLayer":TFBertMainLayer})

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=maxLength, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['Negative', 'A bit negative', 'Neutral', 'A bit positive', 'Positive']):
    probs = model.predict(processed_data)[0]
    threshold = 0.50
    if np.max(probs) < threshold:
        prediction = 'Fallback Class'
    else:
        prediction = classes[np.argmax(probs)]
    return prediction

In [47]:
maxLength = 128
input_text = input('Enter movie review here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data=processed_data)
print(f"Predicted Sentiment: {result}")

Enter movie review here:  its a good movie


Predicted Sentiment: A bit positive


In [5]:
#Predicting without function
maxLength = 128
input_text = "very very good movie"
token = tokenizer.encode_plus(input_text, max_length=maxLength,truncation=True,padding='max_length', add_special_tokens=True,return_tensors='tf')
processed_data= {'input_ids': tf.cast(token.input_ids, tf.float64),'attention_mask': tf.cast(token.attention_mask, tf.float64)}
classes=['Negative', 'A bit negative', 'Neutral', 'A bit positive', 'Positive']
probs = sentiment_model.predict(processed_data)[0]
# Set a threshold below which all predictions will be considered as the fallback class
threshold = 0.55
if np.max(probs) < threshold:
    prediction = 'Fallback Class'
else:
    prediction = classes[np.argmax(probs)]
print(probs)
print(prediction)

[3.3538698e-04 4.3500261e-04 4.0685800e-03 9.5507726e-02 8.9965338e-01]
Positive
