In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files stored there.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip '/content/drive/MyDrive/archive.zip'

Archive:  /content/drive/MyDrive/archive.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
import pandas as pd
import numpy as np
import re
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Lambda, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def clean_text(text):
    """Normalise a raw tweet into lowercase letters and whitespace only.

    The steps run in order: strip http/https URLs, strip digit runs, strip
    every remaining character that is neither an ASCII letter nor whitespace,
    then lowercase the result.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; whitespace left behind by removed tokens is kept.
    """
    without_urls = re.sub(r'https?://\S+', '', text)
    without_digits = re.sub(r'\d+', '', without_urls)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_digits)
    return letters_only.lower()


# Load the SMILE annotations. Column names are supplied explicitly, and the
# tweet text is normalised with clean_text.
df_smile = pd.read_csv(
    '/content/drive/MyDrive/smile-annotations-final.csv',
    names=['id', 'text', 'category'],
)
df_smile['text'] = df_smile['text'].apply(clean_text)

# Collapse the fine-grained emotion labels into three sentiment classes.
# Any label that is not a key of this mapping becomes NaN under Series.map.
sentiment_map = {
    **dict.fromkeys(
        ['angry', 'disgust', 'disgust|angry', 'sad', 'sad|angry',
         'sad|disgust', 'sad|disgust|angry'],
        'Negative',
    ),
    **dict.fromkeys(['happy', 'happy|sad'], 'Positive'),
    **dict.fromkeys(['nocode', 'not-relevant', 'surprise'], 'Neutral'),
}
df_smile['category'] = df_smile['category'].map(sentiment_map)


# Load Sentiment140. Only the label and tweet text are used downstream, so the
# other four columns are skipped at parse time via usecols.
df_s140 = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    usecols=['target', 'text'],
)
# Take an explicit copy of the reordered columns so the assignments below write
# to an independent frame rather than a slice of another one; this removes the
# SettingWithCopyWarning the slice-then-assign pattern produced.
df_s140 = df_s140[['text', 'target']].copy()
df_s140['text'] = df_s140['text'].apply(clean_text)
# Map the numeric polarity codes to the same label strings used for SMILE.
df_s140['target'] = df_s140['target'].map({0: 'Negative', 2: 'Neutral', 4: 'Positive'})


# Stack both corpora under a shared (text, category) schema.
df_combined = pd.concat(
    [
        df_smile[['text', 'category']],
        df_s140.rename(columns={'target': 'category'}),
    ],
    ignore_index=True,
)
print("Previous number of rows in the DataFrame:", len(df_combined))

# Discard rows whose label could not be mapped, then renumber the index so it
# is contiguous again.
df_combined = df_combined.dropna(subset=['category']).reset_index(drop=True)

print("Missing values in 'category' after removal:", df_combined['category'].isna().sum())
print("Updated number of rows in the DataFrame:", len(df_combined))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s140['target'] = df_s140['target'].map({0: 'Negative', 2: 'Neutral', 4: 'Positive'})


Previous number of rows in the DataFrame: 1603085
Missing values in 'category' after removal: 0
Updated number of rows in the DataFrame: 1603074


In [6]:
# Preview the first rows of the combined (text, category) frame.
df_combined.head()

Unnamed: 0,text,category
0,aandraous britishmuseum andrewsantonio merci p...,Neutral
1,dorian gray with rainbow scarf lovewins from b...,Positive
2,selectshowcase tatestives replace with your w...,Positive
3,sofabsports thank you for following me back gr...,Positive
4,britishmuseum tudorhistory what a beautiful je...,Positive


In [7]:

# Turn the string sentiment classes into integer ids and one-hot encode them.
label_encoder = LabelEncoder()
labels = to_categorical(label_encoder.fit_transform(df_combined['category']))

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [8]:
def encode_texts(texts, max_length=64):
    """Tokenize a batch of strings into padded TensorFlow tensors.

    Uses the module-level ``tokenizer``.

    Args:
        texts: List of strings to encode.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated.

    Returns:
        The tokenizer's batch encoding; callers read ``'input_ids'`` and
        ``'attention_mask'`` from it.
    """
    # Calling the tokenizer directly is the supported batch-encoding entry
    # point (batch_encode_plus is deprecated) and matches how
    # predict_sentiment tokenizes its inputs.
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )

# Tokenize the whole corpus once and keep both tensors as NumPy arrays.
encoded_texts = encode_texts(df_combined['text'].tolist())
input_ids, attention_mask = (
    encoded_texts[key].numpy() for key in ('input_ids', 'attention_mask')
)

In [15]:
# Sanity check: run BERT on one dummy sequence of 128 tokens to confirm the
# shape of its first output before wiring it into the classifier.
import tensorflow as tf

dummy_input_ids = tf.zeros((1, 128), dtype=tf.int32)
dummy_attention_mask = tf.ones((1, 128), dtype=tf.int32)

bert_outputs = bert_model([dummy_input_ids, dummy_attention_mask])
print("Output shape from BERT:", bert_outputs[0].shape)


Output shape from BERT: (1, 128, 768)


In [10]:

def build_model():
    """Assemble the sentiment classifier on top of the pretrained BERT encoder.

    Two variable-length int32 inputs (token ids and attention mask) are passed
    to the module-level ``bert_model`` inside a Lambda layer. Position 0 along
    the sequence axis of its first output is kept, passed through dropout and
    fed to a softmax layer with one unit per class in ``label_encoder``.

    Returns:
        An uncompiled Keras ``Model`` taking ``[input_ids, attention_mask]``
        and producing per-class probabilities.
    """
    def first_token_embedding(bert_inputs):
        # bert_model(...)[0] is (batch, seq, 768); keep sequence position 0.
        return bert_model(bert_inputs)[0][:, 0]

    token_ids = Input(shape=(None,), dtype=tf.int32, name='input_ids')
    mask = Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    # NOTE: Keras warns that the BERT variables used inside this Lambda are not
    # present in the layer's tracked objects.
    pooled = Lambda(first_token_embedding, output_shape=(768,))([token_ids, mask])
    regularised = Dropout(0.3)(pooled)
    class_probs = Dense(len(label_encoder.classes_), activation='softmax')(regularised)
    return Model(inputs=[token_ids, mask], outputs=class_probs)
# Instantiate the classifier; later cells load weights into it and use it for prediction.
model = build_model()



The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'tf_bert_model/bert/embeddings/word_embeddings/weight:0' shape=(30522, 768) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/embeddings/token_type_embeddings/embeddings:0' shape=(2, 768) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/embeddings/position_embeddings/embeddings:0' shape=(512, 768) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/embeddings/LayerNorm/gamma:0' shape=(768,) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/embeddings/LayerNorm/beta:0' shape=(768,) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/encoder/layer_._0/attention/self/query/kernel:0' shape=(768, 768) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/encoder/layer_._0/attention/self/query/bias:0' shape=(768,) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/encoder/layer_._0/attention/self/key/kernel:0' shape=(768, 768) dtype=float32>
  <tf.Variable 'tf_bert_model/bert/

In [12]:
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
import re
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Lambda, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load weights from the checkpoint file on Drive into the model built above.
model.load_weights('/content/drive/MyDrive/BertModel.weights.h5')


In [13]:
def predict_sentiment(texts, tokenizer, model, max_length=64,
                      class_names=('Negative', 'Neutral', 'Positive')):
    """Predict a sentiment label for each input text.

    Args:
        texts: Iterable of raw strings. A single string is treated as a
            one-element batch instead of being iterated character by character.
        tokenizer: Callable tokenizer whose result exposes ``'input_ids'`` and
            ``'attention_mask'``.
        model: Model whose ``predict`` returns one row of class scores per text.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated.
        class_names: Labels indexed by the model's output position. The default
            must match the class order the model was trained with.

    Returns:
        A list with one label per input text, in input order. An empty input
        yields an empty list without calling the tokenizer or the model.
    """
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_text(text) for text in texts]
    if not cleaned:
        return []

    encoded_inputs = tokenizer(cleaned, padding=True, truncation=True,
                               max_length=max_length, return_tensors="tf")
    scores = model.predict([encoded_inputs['input_ids'], encoded_inputs['attention_mask']])

    # Take argmax on the raw scores. The classifier built in this notebook
    # already ends in a softmax layer, and softmax is monotonic, so applying it
    # again would not change the winning class.
    predicted_class_indices = tf.argmax(scores, axis=1).numpy()
    return [class_names[idx] for idx in predicted_class_indices]

In [14]:
# Sample inputs for a quick smoke test of the classifier; extend this list as needed.
test_texts = [
    "I love this product! Absolutely wonderful!",
    "This is the worst experience I ever had.",
    "I think this is okay, not too bad but could be better.",
    "The two works in our free #DuccioCaro exhibition play with space, movement, and architect... http://t.co/wdV2lzmgLu via @NationalGallery",
    "@aandraous @britishmuseum @AndrewsAntonio Merci pour le partage! @openwinemap",
    "@britishmuseum say wot, mate?"
]

# Classify every sample in a single batch, then report each text with its label.
predicted_sentiments = predict_sentiment(test_texts, tokenizer, model)
for text, sentiment in zip(test_texts, predicted_sentiments):
    message = f'Text: {text}\nPredicted Sentiment: {sentiment}\n'
    print(message)

Text: I love this product! Absolutely wonderful!
Predicted Sentiment: Positive

Text: This is the worst experience I ever had.
Predicted Sentiment: Negative

Text: I think this is okay, not too bad but could be better.
Predicted Sentiment: Negative

Text: The two works in our free #DuccioCaro exhibition play with space, movement, and architect... http://t.co/wdV2lzmgLu via @NationalGallery
Predicted Sentiment: Positive

Text: @aandraous @britishmuseum @AndrewsAntonio Merci pour le partage! @openwinemap
Predicted Sentiment: Neutral

Text: @britishmuseum say wot, mate?
Predicted Sentiment: Neutral

