In [26]:
!pip install bert-tensorflow
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm

import re 

import nltk
from nltk.corpus import stopwords

import random
from sklearn import metrics, model_selection, preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from transformers import BertTokenizerFast, TFBertModel, BertForSequenceClassification, TrainingArguments, Trainer
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

import tensorflow as tf

In [28]:
nltk.download("punkt")
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import csv

df = pd.read_csv("TweetsCOV19.csv")
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213713 entries, 0 to 213712
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    213713 non-null  int64 
 1   TweetId       213713 non-null  int64 
 2   Username      213713 non-null  object
 3   Timestamp     213713 non-null  object
 4   NoFollowers   213713 non-null  int64 
 5   NoFriends     213713 non-null  int64 
 6   NoRetweets    213713 non-null  int64 
 7   NoFavorites   213713 non-null  int64 
 8   Entities      213713 non-null  object
 9   Sentiment     213713 non-null  object
 10  Mentions      213204 non-null  object
 11  Hashtags      213647 non-null  object
 12  URLs          213713 non-null  object
 13  TweetText     213713 non-null  object
 14  UserLocation  162119 non-null  object
dtypes: int64(6), object(9)
memory usage: 24.5+ MB


In [30]:
df = df.sample(frac = 0.025)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5343 entries, 133425 to 12084
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    5343 non-null   int64 
 1   TweetId       5343 non-null   int64 
 2   Username      5343 non-null   object
 3   Timestamp     5343 non-null   object
 4   NoFollowers   5343 non-null   int64 
 5   NoFriends     5343 non-null   int64 
 6   NoRetweets    5343 non-null   int64 
 7   NoFavorites   5343 non-null   int64 
 8   Entities      5343 non-null   object
 9   Sentiment     5343 non-null   object
 10  Mentions      5329 non-null   object
 11  Hashtags      5340 non-null   object
 12  URLs          5343 non-null   object
 13  TweetText     5343 non-null   object
 14  UserLocation  4099 non-null   object
dtypes: int64(6), object(9)
memory usage: 667.9+ KB


In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,TweetId,Username,Timestamp,NoFollowers,NoFriends,NoRetweets,NoFavorites,Entities,Sentiment,Mentions,Hashtags,URLs,TweetText,UserLocation
133425,133425,1213777806649217024,d9353c92a3ee5ac328546c2544df4721,2020-01-05 11:02:34+00:00,779,818,0,0,null;,1 -1,null;,null;,https://www.twittascope.com/?sign=5:-:,You can rely on a flow between thought and act...,"Fort Lauderdale, FL"
117717,117717,1204013299127316480,3ecec78ecd2512ebf73361e8051ee642,2019-12-09 12:21:54+00:00,176721,34,18,38,null;,2 -1,null;,null;,null;,"hi, everything is pointless and only looks to ...",forest
17209,17209,1184863097057619969,3fed8d6f459a458ed76e8c4fbe26ceca,2019-10-17 16:05:50+00:00,16603,0,93,1386,null;,3 -1,null;,bbnaija,null;,Remember the Munch It tasks that happened in t...,Nigeria
11644,11644,1185178500203868160,9e12d5e2bcfd5d63f5ea9a567873b58b,2019-10-18 12:59:08+00:00,2449,3253,0,0,null;,3 -1,laurachina00,null;,null;,@laurachina00 Hope so😭😭🙏🙏,12/17/22
49231,49231,1196376641783513088,19f2d5d95e10b6a4f808633a3608ac5f,2019-11-18 10:36:33+00:00,3465,3766,0,0,null;,1 -1,null;,null;,https://www.fizzicseducation.com.au/category/s...,Teachers - planning 2020 scope &amp; sequences...,"NSW, QLD, ACT & Victoria"


In [33]:
# Check for missing values

print(df.isnull().sum())

Unnamed: 0         0
TweetId            0
Username           0
Timestamp          0
NoFollowers        0
NoFriends          0
NoRetweets         0
NoFavorites        0
Entities           0
Sentiment          0
Mentions          14
Hashtags           3
URLs               0
TweetText          0
UserLocation    1244
dtype: int64


In [34]:
df.dropna(subset=['UserLocation'], inplace=True)

In [35]:
df.head()

Unnamed: 0.1,Unnamed: 0,TweetId,Username,Timestamp,NoFollowers,NoFriends,NoRetweets,NoFavorites,Entities,Sentiment,Mentions,Hashtags,URLs,TweetText,UserLocation
133425,133425,1213777806649217024,d9353c92a3ee5ac328546c2544df4721,2020-01-05 11:02:34+00:00,779,818,0,0,null;,1 -1,null;,null;,https://www.twittascope.com/?sign=5:-:,You can rely on a flow between thought and act...,"Fort Lauderdale, FL"
117717,117717,1204013299127316480,3ecec78ecd2512ebf73361e8051ee642,2019-12-09 12:21:54+00:00,176721,34,18,38,null;,2 -1,null;,null;,null;,"hi, everything is pointless and only looks to ...",forest
17209,17209,1184863097057619969,3fed8d6f459a458ed76e8c4fbe26ceca,2019-10-17 16:05:50+00:00,16603,0,93,1386,null;,3 -1,null;,bbnaija,null;,Remember the Munch It tasks that happened in t...,Nigeria
11644,11644,1185178500203868160,9e12d5e2bcfd5d63f5ea9a567873b58b,2019-10-18 12:59:08+00:00,2449,3253,0,0,null;,3 -1,laurachina00,null;,null;,@laurachina00 Hope so😭😭🙏🙏,12/17/22
49231,49231,1196376641783513088,19f2d5d95e10b6a4f808633a3608ac5f,2019-11-18 10:36:33+00:00,3465,3766,0,0,null;,1 -1,null;,null;,https://www.fizzicseducation.com.au/category/s...,Teachers - planning 2020 scope &amp; sequences...,"NSW, QLD, ACT & Victoria"


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4099 entries, 133425 to 12084
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    4099 non-null   int64 
 1   TweetId       4099 non-null   int64 
 2   Username      4099 non-null   object
 3   Timestamp     4099 non-null   object
 4   NoFollowers   4099 non-null   int64 
 5   NoFriends     4099 non-null   int64 
 6   NoRetweets    4099 non-null   int64 
 7   NoFavorites   4099 non-null   int64 
 8   Entities      4099 non-null   object
 9   Sentiment     4099 non-null   object
 10  Mentions      4086 non-null   object
 11  Hashtags      4097 non-null   object
 12  URLs          4099 non-null   object
 13  TweetText     4099 non-null   object
 14  UserLocation  4099 non-null   object
dtypes: int64(6), object(9)
memory usage: 512.4+ KB


In [37]:
# We drop all columns other than Sentiment and TweetText since they are irrelevant to this task

df = df.drop(["Unnamed: 0", "TweetId", "Username", "Timestamp", "NoFollowers", "NoFriends", "NoRetweets", "NoFavorites", "Entities", "Mentions", "Hashtags", "URLs"], axis = 1)

In [38]:
(df["Sentiment"].iloc[0]).split(" ")

['1', '-1']

In [39]:
def calc_sentm(col):
  sent_list = (col).split(" ")
  # print(sent_list)
  sum = np.sum([int(sent_list[0]), int(sent_list[1])])

  if sum > 0:
    temp = "positive"
  elif sum < 0:
    temp = "negative"
  else:
    temp = "neutral"
  
  return temp

In [40]:
df["Sentiment"] = df["Sentiment"].map(calc_sentm)

In [41]:
df.head()

Unnamed: 0,Sentiment,TweetText,UserLocation
133425,neutral,You can rely on a flow between thought and act...,"Fort Lauderdale, FL"
117717,positive,"hi, everything is pointless and only looks to ...",forest
17209,positive,Remember the Munch It tasks that happened in t...,Nigeria
11644,positive,@laurachina00 Hope so😭😭🙏🙏,12/17/22
49231,neutral,Teachers - planning 2020 scope &amp; sequences...,"NSW, QLD, ACT & Victoria"


In [42]:
def preprocess_tweets(tweet):
  # Convert all words to lower case
  tweet = tweet.lower()

  # Remove all punctuations
  tweet = tweet.translate(str.maketrans('', '', string.punctuation))

  # Remove all stop words
  # Tokenize the sentence
  tokens = nltk.word_tokenize(tweet)
  # Remove stop words
  clean_tokens = [token for token in tokens if not token.lower() in stop_words]
  # Join the tokens back into a sentence
  tweet = ' '.join(clean_tokens)

  # Removing HTML Tags, URLs and Website Links
  # Removing Website Links
  temp_text = tweet.split()

  for i in temp_text:
    if i[-4:] == ".com" or i[:4] == "www.":
      temp_text.remove(i)

  tweet = ' '.join(temp_text)

  # Remove HTML tags
  clean_text = re.sub('<[^<]+?>', '', tweet)

  # Remove URLs
  clean_text = re.sub(r'http\S+', '', clean_text)

  tweet = clean_text

  # Remove numbers
  tweet = re.sub(r'\d+', '', tweet)
  
  # Remove mentions
  tweet = re.sub(r'@\w+', '', tweet)

  return tweet  


In [43]:
df["TweetText"] = df["TweetText"].map(preprocess_tweets)

In [44]:
# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows 
# how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
df['Sentiment']= label_encoder.fit_transform(df['Sentiment'])
  
df['Sentiment'].unique()

array([1, 2, 0])

In [45]:
df.head()

Unnamed: 0,Sentiment,TweetText,UserLocation
133425,1,rely flow thought action appears leo,"Fort Lauderdale, FL"
117717,2,hi everything pointless looks leave impression...,forest
17209,2,remember munch tasks happened bbnaija house im...,Nigeria
11644,2,laurachina hope so😭😭🙏🙏,12/17/22
49231,1,teachers planning scope amp sequences use cur...,"NSW, QLD, ACT & Victoria"


In [46]:
# Split the data into train and validation sets

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [47]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [48]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 3)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# Prepare the data
# train_data and validation_data are pandas dataframes with columns 'text' and 'label'

# Convert data into InputExample format
train_examples = train.apply(lambda x: InputExample(guid=None, text_a = x['TweetText'], label = x['Sentiment']), axis = 1)
val_examples = val.apply(lambda x: InputExample(guid=None, text_a = x['TweetText'], label = x['Sentiment']), axis = 1)

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(gen, ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

# Convert examples into tensorflow dataset
train_dataset = convert_examples_to_tf_dataset(list(train_examples), tokenizer).shuffle(100).batch(32).repeat(2)
validation_dataset = convert_examples_to_tf_dataset(list(val_examples), tokenizer).batch(64)



In [50]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Train the model
model.fit(train_dataset, epochs=5, validation_data=validation_dataset)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f30e9c22e00>

In [51]:
test_examples = test.apply(lambda x: InputExample(guid=None, text_a = x['TweetText'], label = x['Sentiment']), axis = 1)
test_dataset = convert_examples_to_tf_dataset(list(test_examples), tokenizer).batch(64)



In [54]:
loss, accuracy = model.evaluate(test_dataset)



In [53]:
predictions = model.predict(test_dataset)



In [60]:
predictions

TFSequenceClassifierOutput(loss=None, logits=array([[-3.0706558,  3.5679653, -0.1300736],
       [-3.2201633,  1.2292706,  1.994171 ],
       [-3.1255264, -1.9361776,  5.1832542],
       ...,
       [-2.2754524,  5.033509 , -2.2983212],
       [-1.5996678,  3.5430825, -1.3133626],
       [-2.9857552, -2.1015275,  5.2899265]], dtype=float32), hidden_states=None, attentions=None)

In [62]:
# The logits are the first item in this object
logits = predictions[0]

# Convert logits to probabilities via softmax
probabilities = tf.nn.softmax(logits, axis=-1)

# Get the class with the highest probability
integer_predictions = tf.argmax(probabilities, axis=-1)

# Convert tensor to numpy array if needed
integer_predictions = integer_predictions.numpy()

In [75]:
integer_predictions[:200]

array([1, 2, 2, 2, 1, 1, 0, 2, 0, 0, 1, 2, 2, 1, 0, 1, 1, 1, 0, 2, 0, 1,
       2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 1, 2, 1, 2, 2, 1, 2,
       2, 1, 2, 2, 0, 1, 1, 1, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 2, 1, 0,
       0, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 0, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 1, 2, 2, 2, 2, 1, 0, 1, 2, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1,
       1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2, 1,
       1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2,
       2, 1, 0, 1, 1, 2, 2, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 2, 2,
       1, 1, 2, 1, 2, 0, 1, 0, 1, 0, 0, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2,
       2, 1])

In [64]:
df.head()

Unnamed: 0,Sentiment,TweetText,UserLocation
133425,1,rely flow thought action appears leo,"Fort Lauderdale, FL"
117717,2,hi everything pointless looks leave impression...,forest
17209,2,remember munch tasks happened bbnaija house im...,Nigeria
11644,2,laurachina hope so😭😭🙏🙏,12/17/22
49231,1,teachers planning scope amp sequences use cur...,"NSW, QLD, ACT & Victoria"


In [66]:
test["Predicted Sentiments"] = integer_predictions

In [73]:
test

Unnamed: 0,Sentiment,TweetText,UserLocation,Predicted Sentiments
8967,0,know antivirus software wont protect biggest t...,"Winona, MN 55987",1
180774,2,public believe happened national capital impul...,New Delhi,2
175608,2,anf act like nothing happen,Since 2018. Rules:,2
124857,2,txs size mb stripped mb time reward btc ...,Moon,2
130885,0,nayeon annoyed chaeyoung penguin hand puppet 😂,"New York, USA",1
...,...,...,...,...
39253,2,omg smile turned like smirk ’ whipped man call...,181227,2
172959,2,primark sells harry potter cushions £ shoppers...,UK,2
188666,2,authors franwilde molder marthawells curtiscch...,NYC,1
91609,1,tune secs countdown happening experience mass...,James' ❤ (@tellemjaye),1
