In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split

from utilize import gen_dataframe

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# List the GPUs TensorFlow can see; the list is empty on a CPU-only host.
physical_devices = tf.config.list_physical_devices("GPU")
print(physical_devices)

# Turn on on-demand memory growth for every visible GPU rather than only
# index 0: indexing [0] raised IndexError when no GPU was present, and
# TensorFlow expects the memory-growth setting to agree across all GPUs.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Build the tweet DataFrame with the project helper `gen_dataframe` imported
# from `utilize`. NOTE(review): the helper is not shown here; presumably it
# reads the files under 'data/' — confirm against its implementation.
data_dir = 'data/'
df = gen_dataframe(data_dir)

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-03-07 16:48:31,@VincentZahler @Mike_Grieco @GovRonDeSantis @R...,gave one example happening another parent's bi...
1,2022-03-07 16:47:21,@VincentZahler @Mike_Grieco @GovRonDeSantis @R...,conflating tweet retort bill weaponizes civil ...
2,2022-03-07 16:44:46,@DontSayGayBill is just legalizing lawfare for...,legalizing lawfare freak place like pasco coun...
3,2022-03-07 16:44:35,@SmoreNewsletter you know what isn’t cool? All...,know isnt cool allowing antiteacher group host...
4,2022-03-07 16:38:22,@RyanSarai1 #5G and #QAnon: how #conspiracy th...,theorist steered canada antivaccine trucker pr...


In [4]:
# TEMPORARY: placeholder labels until real annotations are available.
# Assign a random binary label (0 or 1) to every row of df. The generator is
# seeded so that Restart Kernel -> Run All reproduces the same labels, and
# therefore the same class counts and downstream training run.
LABEL_SEED = 42
label_rng = np.random.default_rng(LABEL_SEED)
rand_label = label_rng.integers(0, 2, size=len(df))  # upper bound is exclusive
df['label'] = rand_label

# Sanity check: number of rows per class (expected to be roughly balanced).
print(df.loc[df['label']==0].shape)
print(df.loc[df['label']==1].shape)

df.head()

(12528, 4)
(12448, 4)


Unnamed: 0,date,tweet,cleaned_tweet,label
0,2022-03-07 16:48:31,@VincentZahler @Mike_Grieco @GovRonDeSantis @R...,gave one example happening another parent's bi...,0
1,2022-03-07 16:47:21,@VincentZahler @Mike_Grieco @GovRonDeSantis @R...,conflating tweet retort bill weaponizes civil ...,0
2,2022-03-07 16:44:46,@DontSayGayBill is just legalizing lawfare for...,legalizing lawfare freak place like pasco coun...,0
3,2022-03-07 16:44:35,@SmoreNewsletter you know what isn’t cool? All...,know isnt cool allowing antiteacher group host...,0
4,2022-03-07 16:38:22,@RyanSarai1 #5G and #QAnon: how #conspiracy th...,theorist steered canada antivaccine trucker pr...,1


In [5]:
# Split the dataset into train and test partitions (33% held out).
# X is the 'cleaned_tweet' text column, y is the binary 'label' column.
# A fixed random_state makes the shuffle, and so the split, reproducible
# across kernel restarts.
SPLIT_SEED = 42
X = df['cleaned_tweet']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=SPLIT_SEED
)

# Show the partition sizes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16733,), (8243,), (16733,), (8243,))

In [6]:
# TF-Hub handles for the two BERT components used by the model.
BERT_PREPROCESS_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Preprocessing layer: turns raw strings into the dict of inputs the encoder
# consumes.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)

# Encoder layer (uncased BERT, L-12 / H-768 / A-12).
# NOTE(review): the layer is created without `trainable=True`, so the encoder
# weights are presumably frozen and only the classification head is trained —
# confirm this is intended.
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [7]:
# Build the classifier with the Keras functional API:
# raw string -> BERT preprocessing -> BERT encoder -> dropout -> sigmoid unit.

# Scalar string input: one tweet per example.
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Run the text through the TF-Hub preprocessing and encoder layers.
preprocessed_text = bert_preprocess(text_inputs)
encoded_text = bert_encoder(preprocessed_text)

# Classification head, created first and then applied to the pooled output.
dropout_layer = tf.keras.layers.Dropout(.1, name='dropout')
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')

layer = dropout_layer(encoded_text['pooled_output'])
layer = output_layer(layer)

model = tf.keras.Model(inputs=[text_inputs], outputs=[layer])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [8]:
# Output locations for TensorBoard logs and model checkpoints.
log_dir = 'model/log'
checkpoint_filepath = 'model/checkpoint'

# Save weights only, and only when the monitored validation loss improves
# (lower is better).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Write training logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_callback, tensorboard_callback]

In [9]:
# Training configuration.
# The output layer already applies a sigmoid, so the loss receives
# probabilities rather than logits (from_logits=False).
metrics = ['accuracy']
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [13]:
# Train for 5 epochs with mini-batches of 32; `history` keeps the object
# returned by fit().
# NOTE(review): the test split is passed as validation data, and the
# checkpoint callback selects the best weights on 'val_loss', so the test set
# influences model selection — consider a separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
