In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

from utilize import get_tidyTweet, remove_punct

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hayden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hayden/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load every CSV export under data/ into a single frame of unique English tweets.
path = 'data/'
# Sorted so the row order is the same on every run. The extension filter skips
# stray non-CSV entries (e.g. macOS .DS_Store) that would make pd.read_csv fail.
# NOTE(review): assumes the exports are named *.csv -- confirm against the data folder.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
df = pd.concat([pd.read_csv(os.path.join(path, f)) for f in files], ignore_index=True)
df = df.drop_duplicates()
# Keep only rows whose 'language' column is 'en', then renumber the index from 0.
df = df[df['language']=='en']
df = df.reset_index(drop=True)

In [3]:
# clean data
# Work on a copy restricted to the timestamp and the raw text so df stays untouched.
df2 = df.copy()[['date', 'tweet']]

# Two passes over the raw text, both using helpers imported from utilize:
# tidy + lower-case first, then strip punctuation and surrounding whitespace.
tidied_lower = df['tweet'].apply(get_tidyTweet).str.lower()
df2['cleaned_tweet'] = tidied_lower
df2['cleaned_tweet'] = df2['cleaned_tweet'].apply(remove_punct).str.strip()

df2.head()

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-02-24 03:21:12,@disclosetv Those who do not know their histor...,those who do not know their history are open t...
1,2022-02-10 11:39:50,World population = 7.9 billion World covid dea...,world population 79 billion world covid death...
2,2022-02-10 04:49:56,How's the pandemic going in your country these...,how's the pandemic going in your country these...
3,2022-01-27 14:27:34,Very smart Virus. Only attacks you in a night ...,very smart virus only attacks you in a night c...
4,2022-01-18 18:27:11,@PeterSweden7 They are all actors in #plandemi...,they are all actors in each reading and ac...


In [4]:
# create random label for dataset
# Labels are random 0/1 draws, one per row of df2 (they are not annotations).
# A dedicated, seeded generator makes the same labels come out on every run
# without touching NumPy's global random state.
LABEL_SEED = 42
label_rng = np.random.RandomState(LABEL_SEED)
rand_label = label_rng.randint(2, size=len(df2))

# assign random label to df
df2['label'] = rand_label

print(df2.head(5))
print()
# (rows, columns) of each class subset, to eyeball the class balance.
print(df2.loc[df2['label']==0].shape)
print(df2.loc[df2['label']==1].shape)

                  date                                              tweet  \
0  2022-02-24 03:21:12  @disclosetv Those who do not know their histor...   
1  2022-02-10 11:39:50  World population = 7.9 billion World covid dea...   
2  2022-02-10 04:49:56  How's the pandemic going in your country these...   
3  2022-01-27 14:27:34  Very smart Virus. Only attacks you in a night ...   
4  2022-01-18 18:27:11  @PeterSweden7 They are all actors in #plandemi...   

                                       cleaned_tweet  label  
0  those who do not know their history are open t...      0  
1  world population  79 billion world covid death...      1  
2  how's the pandemic going in your country these...      1  
3  very smart virus only attacks you in a night c...      0  
4  they are all actors in     each reading and ac...      1  

(12468, 4)
(12508, 4)


In [5]:
# Features are the cleaned tweet strings; targets are the 0/1 labels.
X = df2['cleaned_tweet'].values
y = df2['label'].values

# Hold out 33% of the rows for evaluation. The fixed random_state makes the
# split identical on every run, so results can be compared between runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=SPLIT_SEED
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16733,)
(8243,)
(16733,)
(8243,)


In [6]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint,
# with lower-casing of the input enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [7]:
# find max sequence length
# Largest token count produced by the tokenizer over every text in X
# (train and test together); 0 when X is empty.
max_seq_length = max(
    (len(tokenizer.tokenize(text)) for text in X),
    default=0,
)
print(max_seq_length)

72


In [8]:
def gen_inputs_tensor(X, max_seq_length):
    """Encode texts into fixed-width input tensors for the BERT model.

    Each text is encoded with the module-level ``tokenizer``, padded to a
    common width and, if it is too long, truncated to that width so every
    row has the same length.

    Args:
        X: iterable of strings to encode.
        max_seq_length: maximum number of tokens kept per text, not counting
            the two special tokens added around it.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors with one row per
        text and ``max_seq_length + 2`` columns.
    """
    # Two extra positions for the [CLS] (101) and [SEP] (102) special tokens.
    padded_length = max_seq_length + 2

    input_ids = []
    attention_masks = []

    for item in X:
        encoded_dict = tokenizer.encode_plus(
            item,
            add_special_tokens=True,    # add [CLS] 101 & [SEP] 102
            max_length=padded_length,
            padding='max_length',       # pad short texts up to padded_length
            # Without truncation a text longer than max_seq_length tokens yields
            # a longer row, and the ragged list cannot be converted to a tensor.
            truncation=True,
            return_attention_mask=True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # convert to tensor
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)

    return input_ids, attention_masks

In [9]:
# Encode both splits with the same maximum length so their widths match.
X_train_input, X_train_mask = gen_inputs_tensor(X_train, max_seq_length)
X_test_input, X_test_mask = gen_inputs_tensor(X_test, max_seq_length)

# Labels as tensors, matching the encoded inputs.
y_train_tensor = tf.convert_to_tensor(y_train)
y_test_tensor = tf.convert_to_tensor(y_test)

# Shapes in order: train ids/mask, test ids/mask, train labels, test labels.
for encoded in (
    X_train_input, X_train_mask,
    X_test_input, X_test_mask,
    y_train_tensor, y_test_tensor,
):
    print(encoded.shape)

(16733, 74)
(16733, 74)
(8243, 74)
(8243, 74)
(16733,)
(8243,)


In [10]:
# Pretrained 'bert-base-uncased' weights with a sequence-classification head
# configured for two labels.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
log_dir = 'model/log'
path_save_model = 'model/bert_model.h5'

# Checkpoint: weights only, written only when val_loss reaches a new minimum.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=path_save_model,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# TensorBoard event files go under log_dir.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_cb, tensorboard_cb]

In [12]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
# Training configuration: Adam optimizer, a sparse categorical cross-entropy
# loss computed from logits, and sparse categorical accuracy reported as 'accuracy'.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=2e-5,
    epsilon=1e-08,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)
metrics = [accuracy_metric]

In [14]:
# Attach the optimizer, loss and metrics defined above to the model.
bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [15]:
# Inputs are passed as [token ids, attention mask]; the test split doubles as
# the validation set monitored by the checkpoint callback.
train_inputs = [X_train_input, X_train_mask]
validation_set = ([X_test_input, X_test_mask], y_test_tensor)

history = bert_model.fit(
    train_inputs,
    y_train_tensor,
    validation_data=validation_set,
    epochs=1,
    batch_size=32,
    callbacks=callbacks,
)

 10/523 [..............................] - ETA: 1:48:12 - loss: 0.6989 - accuracy: 0.5219

KeyboardInterrupt: 