In [None]:
# One-time environment setup (run in a shell, not as Python): pip install tensorflow_text
# NOTE(review): presumably needed for the `import tensorflow_text as text` below — confirm and pin a version.

In [None]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split

from utilize import gen_dataframe

In [None]:
# Display the installed TensorFlow version so the run environment is on record.
tf.__version__

'2.8.0'

In [None]:
# List the GPUs TensorFlow can see and print them for the run log.
physical_devices = tf.config.list_physical_devices("GPU")
print(physical_devices)

# Enable memory growth on every visible GPU. Looping (instead of indexing
# physical_devices[0]) means a CPU-only machine, where the list is empty,
# no longer raises IndexError, and multi-GPU hosts get a consistent setting.
try:
    for gpu_device in physical_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
except RuntimeError as err:
    # Memory growth can only be set before the GPUs are initialised; when the
    # cell is re-run in a live kernel, report the error instead of aborting.
    print(err)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# Build the working DataFrame from the files under ./data/ using the
# project-local helper `gen_dataframe` (imported from `utilize`; its
# implementation is not visible in this notebook).
df = gen_dataframe('./data/')
# Preview the first rows.
df.head()

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-03-07 14:40:41,Have you missed the origins of covid debate &a...,missed origin covid debate entirely medicine a...
1,2022-03-07 13:40:12,This is another clue that this virus is not na...,another clue virus natural man made lab leak n...
2,2022-03-07 13:00:06,Shhh! This is an example of how reality can da...,shhh example reality damage approved narrative...
3,2022-03-07 06:59:43,“Hard earned American taxpayer dollars should ...,hard earned american taxpayer dollar goingto l...
4,2022-03-07 06:52:10,"@Shoshin41734407 @OpIndia_com ""two biological ...",two biological warfare lab kiev odessa


In [None]:
# Count the non-null entries in each column.
# NOTE(review): the saved output of this cell already lists a `label` column,
# which is only created later by assign_labels — the cell appears to have been
# executed out of order; re-run the notebook top to bottom to confirm.
df.count()

date             24976
tweet            24976
cleaned_tweet    24976
label            24976
dtype: int64

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# NLTK resources fetched for this notebook.
# NOTE(review): the code in view only uses SentimentIntensityAnalyzer, which
# presumably needs just "vader_lexicon" — confirm whether the other resources
# are used elsewhere before trimming this list.
NLTK_RESOURCES = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]

# Fetch everything in a single call; its return value is the cell's output.
nltk.download(NLTK_RESOURCES)



[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def assign_labels(df, text_column='tweet', sia=None):
    """Add a binary sentiment ``label`` column based on polarity scores.

    Every text in ``df[text_column]`` is scored with ``sia.polarity_scores``.
    A row is labelled 1 when its ``neg`` score is greater than its ``pos``
    score, or when ``neg`` equals 1; otherwise it is labelled 0.

    Args:
        df: DataFrame holding the texts. It is modified in place: a ``label``
            column is added, or overwritten if it already exists.
        text_column: Name of the column to score. Defaults to ``'tweet'``.
        sia: Object exposing ``polarity_scores(text)`` that returns a mapping
            with ``'pos'`` and ``'neg'`` entries. When ``None``, a new
            ``SentimentIntensityAnalyzer`` is created.

    Returns:
        The same ``df`` object, now carrying the ``label`` column.
    """
    if sia is None:
        sia = SentimentIntensityAnalyzer()

    def is_negative(text):
        scores = sia.polarity_scores(text)
        return scores['pos'] < scores['neg'] or scores['neg'] == 1

    # Walk the text column directly: iterrows() would materialise a full
    # Series for every row only to read a single field from it.
    df['label'] = [int(is_negative(text)) for text in df[text_column]]
    return df

# Label every row, then print a preview followed by the per-label row counts.
df = assign_labels(df)

labelled_preview = df.head()
rows_per_label = df.groupby(['label']).count()
print(labelled_preview)
print(rows_per_label)

                  date                                              tweet  \
0  2022-03-07 14:40:41  Have you missed the origins of covid debate &a...   
1  2022-03-07 13:40:12  This is another clue that this virus is not na...   
2  2022-03-07 13:00:06  Shhh! This is an example of how reality can da...   
3  2022-03-07 06:59:43  “Hard earned American taxpayer dollars should ...   
4  2022-03-07 06:52:10  @Shoshin41734407 @OpIndia_com "two biological ...   

                                       cleaned_tweet  label  
0  missed origin covid debate entirely medicine a...      1  
1  another clue virus natural man made lab leak n...      1  
2  shhh example reality damage approved narrative...      1  
3  hard earned american taxpayer dollar goingto l...      0  
4             two biological warfare lab kiev odessa      1  
        date  tweet  cleaned_tweet
label                             
0      15071  15071          15071
1       9905   9905           9905


In [None]:
# split dataset to train test
SEED = 42  # fixed seed so the split, and everything trained on it, is reproducible

X = df['cleaned_tweet']
y = df['label']
# Stratify on the label: the classes are imbalanced (15071 vs 9905 rows), so
# both partitions should keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=SEED, stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16733,), (8243,), (16733,), (8243,))

In [None]:
# TF Hub handles: the uncased English BERT preprocessing model and the
# matching encoder (L-12, H-768, A-12).
BERT_PREPROCESS_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
# NOTE(review): hub.KerasLayer is documented as non-trainable unless
# trainable=True is passed, so the encoder looks frozen here while the
# optimizer uses a fine-tuning-sized learning rate (1e-5) — confirm whether
# fine-tuning the encoder was intended.
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
# build functional model
# Input is one scalar string per example; preprocessing and encoding both run
# inside the model.
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_inputs)
encoded_text = bert_encoder(preprocessed_text)

# Classification head: dropout over the encoder's pooled output, followed by a
# single sigmoid unit.
pooled_output = encoded_text['pooled_output']
dropout_layer = tf.keras.layers.Dropout(.1, name='dropout')
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')
layer = output_layer(dropout_layer(pooled_output))

model = tf.keras.Model(inputs=[text_inputs], outputs=[layer])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [None]:
log_dir = 'model/log'
checkpoint_filepath = 'model/checkpoint'

# Save weights only, and only when the monitored validation loss reaches a
# new minimum.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
# Write training logs under log_dir for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_callback, tensorboard_callback]

In [None]:
# Training configuration: binary cross-entropy on probabilities
# (from_logits=False), Adam with a 1e-5 learning rate, accuracy as the metric.
metrics = ['accuracy']
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Train for 5 epochs in batches of 32, passing the configured callbacks.
# NOTE(review): the test split doubles as validation data, so any selection
# driven by val_loss is made on the test set — consider a separate validation
# split for an unbiased final evaluation.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
