In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split

from utilize import gen_dataframe

[nltk_data] Downloading package omw-1.4 to /Users/hayden/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hayden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hayden/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Ask TensorFlow for the GPU devices it can see and display the resulting list
physical_devices = tf.config.list_physical_devices(device_type="GPU")
physical_devices

[]

In [3]:
# Build the tweet DataFrame with the project helper from the 'data/' directory,
# then preview its first five rows
df = gen_dataframe('data/')
df.head(n=5)

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-02-24 03:21:12,@disclosetv Those who do not know their histor...,know history open repeating two year humanity ...
1,2022-02-10 11:39:50,World population = 7.9 billion World covid dea...,world population billion world covid death mil...
2,2022-02-10 04:49:56,How's the pandemic going in your country these...,how's pandemic going country day still looking...
3,2022-01-27 14:27:34,Very smart Virus. Only attacks you in a night ...,smart virus attack night club dancing standing...
4,2022-01-18 18:27:11,@PeterSweden7 They are all actors in #plandemi...,actor reading acting according script assigned...


In [4]:
# TEMPORARY: placeholder labels until real annotations are available.
# Assign a random binary label (0 or 1) to every row of df. The generator is seeded
# inside this cell so that re-running it always produces the same labels.
rng = np.random.default_rng(42)
rand_label = rng.integers(2, size=len(df))
df['label'] = rand_label

# Class balance check: number of rows (and columns) per label value
print(df.loc[df['label']==0].shape)
print(df.loc[df['label']==1].shape)

df.head()

(12617, 4)
(12359, 4)


Unnamed: 0,date,tweet,cleaned_tweet,label
0,2022-02-24 03:21:12,@disclosetv Those who do not know their histor...,know history open repeating two year humanity ...,1
1,2022-02-10 11:39:50,World population = 7.9 billion World covid dea...,world population billion world covid death mil...,0
2,2022-02-10 04:49:56,How's the pandemic going in your country these...,how's pandemic going country day still looking...,1
3,2022-01-27 14:27:34,Very smart Virus. Only attacks you in a night ...,smart virus attack night club dancing standing...,1
4,2022-01-18 18:27:11,@PeterSweden7 They are all actors in #plandemi...,actor reading acting according script assigned...,0


In [6]:
# Split the cleaned tweets and their labels into a training set and a hold-out set
# (33% held out). random_state pins the shuffle so the same rows land in each split
# on every run, which keeps training results comparable between runs.
X = df['cleaned_tweet']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16733,), (8243,), (16733,), (8243,))

In [9]:
# TF-Hub handles for the uncased English BERT preprocessing model and its encoder
# (12 layers, hidden size 768, 12 attention heads, per the handle name)
BERT_PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_ENCODER_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Wrap both hub models as Keras layers so they can be used in a functional model.
# NOTE(review): `trainable` is not passed, so the hub default applies to the encoder --
# confirm whether the encoder is meant to be fine-tuned or kept frozen.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

In [10]:
# Functional model: raw text -> BERT preprocessing -> BERT encoder -> dropout -> sigmoid unit

# Head layers are constructed first, then wired onto the encoder output below
dropout_layer = tf.keras.layers.Dropout(rate=.1, name='dropout')
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output')

# Scalar string input: one raw text per example
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_inputs)
encoded_text = bert_encoder(preprocessed_text)

# Classify from the encoder's 'pooled_output' entry
layer = dropout_layer(encoded_text['pooled_output'])
layer = output_layer(layer)

model = tf.keras.Model(inputs=[text_inputs], outputs=[layer])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
# Output locations for TensorBoard event files and the best-weights checkpoint
log_dir = 'model/log'
checkpoint_filepath = 'model/checkpoint'

# Save weights only, and only when the monitored validation loss reaches a new minimum
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_callback, tensorboard_callback]

In [None]:
# Training configuration for the sigmoid-output model: the loss receives probabilities
# (from_logits=False), accuracy is tracked, and Adam runs with a 1e-5 learning rate
metrics = ['accuracy']
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [None]:
# Train for 5 epochs in batches of 32; the hold-out split is passed as validation data,
# which is what the checkpoint callback's 'val_loss' monitor is computed on
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    callbacks=callbacks,
)

In [None]:
# NOTE(review): no earlier cell in this notebook defines df2, so this cell failed with
# NameError on a fresh kernel. It is derived here as a copy of the tweet frame `df`
# (which carries the 'cleaned_tweet' column used further down) -- confirm that this is
# the intended source.
df2 = df.copy()

# TEMPORARY: create a random binary label (0 or 1) for every row of the dataset.
# The generator is seeded so the labels are identical on every run.
rand_label = np.random.default_rng(42).integers(2, size=len(df2))

# assign random label to df2
df2['label'] = rand_label

print(df2.head(5))
print()
# Class balance check: number of rows (and columns) per label value
print(df2.loc[df2['label']==0].shape)
print(df2.loc[df2['label']==1].shape)


In [None]:
# Take the cleaned tweets and labels of df2 as plain arrays and split them into a
# training set and a hold-out set (33% held out). random_state pins the shuffle so the
# split is the same on every run.
X = df2['cleaned_tweet'].values
y = df2['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint with lower-casing enabled.
# NOTE(review): BertTokenizer is not imported in the import cell at the top of this
# notebook -- presumably it comes from the `transformers` package; confirm and add the
# import there, otherwise this cell raises NameError on a fresh kernel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Longest tokenized text in X, measured in tokens (0 when X is empty)
token_counts = (len(tokenizer.tokenize(item)) for item in X)
max_seq_length = max(token_counts, default=0)
print(max_seq_length)

In [None]:
def gen_inputs_tensor(X, max_seq_length):
    """Encode texts into fixed-length BERT input-id and attention-mask tensors.

    Every text in ``X`` is encoded with the module-level ``tokenizer``, with the
    special tokens added, then padded or truncated to ``max_seq_length + 2``
    entries (the ``+ 2`` leaves room for the two special tokens).

    Args:
        X: iterable of text strings to encode.
        max_seq_length: maximum number of content tokens per text, excluding the
            two special tokens.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors built with
        ``tf.convert_to_tensor``, one row per text in ``X``.
    """
    input_ids = []
    attention_masks = []

    for item in X:
        encoded_dict = tokenizer.encode_plus(
            item,
            add_special_tokens=True,    # add [CLS] 101 & [SEP] 102
            max_length=max_seq_length+2,
            padding='max_length',       # pad up to max_seq_length + 2 entries
            # Without truncation a text longer than the limit yields a longer row,
            # and the ragged lists cannot be converted to a tensor below.
            truncation=True,
            return_attention_mask=True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # convert the per-text lists to tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
# Encode both splits into (input ids, attention mask) tensor pairs
X_train_input, X_train_mask = gen_inputs_tensor(X_train, max_seq_length)
X_test_input, X_test_mask = gen_inputs_tensor(X_test, max_seq_length)

# Labels as tensors as well
y_train_tensor = tf.convert_to_tensor(y_train)
y_test_tensor = tf.convert_to_tensor(y_test)

# Print each tensor's shape: train inputs, test inputs, then labels
for tensor in (X_train_input, X_train_mask,
               X_test_input, X_test_mask,
               y_train_tensor, y_test_tensor):
    print(tensor.shape)

In [None]:
# Load the 'bert-base-uncased' checkpoint as a sequence classifier with two labels.
# NOTE(review): TFBertForSequenceClassification is not imported in the import cell at
# the top of this notebook -- presumably it comes from the `transformers` package;
# confirm and add the import there, otherwise this cell raises NameError on a fresh kernel.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Output locations for TensorBoard event files and the saved weights of bert_model.
# NOTE(review): log_dir is the same 'model/log' directory used for the hub-based model
# earlier in the notebook -- confirm that sharing it between the two runs is intended.
log_dir = 'model/log'
path_save_model = 'model/bert_model.h5'

# Save weights only, and only when the monitored validation loss reaches a new minimum
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=path_save_model,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_callback, tensorboard_callback]

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
bert_model.summary()

In [None]:
# Training configuration for bert_model: integer class labels scored against raw logits
# (from_logits=True), sparse categorical accuracy reported under the name 'accuracy',
# and Adam with a 2e-5 learning rate
optimizer = tf.keras.optimizers.Adam(
    learning_rate=2e-5,
    epsilon=1e-08,
)
metrics = [
    tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy', dtype=tf.float32),
]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Attach the optimizer, loss and metrics defined in the previous cell to bert_model
bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [None]:
# Train bert_model for a single epoch in batches of 32. Inputs are passed as a
# [input ids, attention mask] pair; the hold-out split serves as validation data.
train_inputs = [X_train_input, X_train_mask]
validation_inputs = [X_test_input, X_test_mask]

history = bert_model.fit(
    x=train_inputs,
    y=y_train_tensor,
    validation_data=(validation_inputs, y_test_tensor),
    epochs=1,
    batch_size=32,
    callbacks=callbacks,
)