In [1]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

from utilize import words_preprocessing
from utilize_model import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load every CSV export found under `path` into a single frame, then keep only
# the de-duplicated rows whose `language` column is 'en'.
path = 'data/'
# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (e.g. a .ipynb_checkpoints directory) that would make read_csv fail, so keep
# only *.csv files and sort them for a deterministic row order.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
df = pd.concat([pd.read_csv(os.path.join(path, f)) for f in files], ignore_index=True)
df = df.drop_duplicates()
df = df[df['language']=='en']
# Renumber rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

In [3]:
# Clean the tweets: work on a two-column copy so `df` itself is left untouched.
df2 = df[['date', 'tweet']].copy()
# words_preprocessing runs on the raw tweet text first; join_tokens is then
# applied to whatever it returned, and that final value is stored.
df2['cleaned_tweet'] = df2['tweet'].apply(words_preprocessing).apply(join_tokens)

df2.head()

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-03-07 14:40:41,Have you missed the origins of covid debate &a...,missed origin covid debate entirely medicine a...
1,2022-03-07 13:40:12,This is another clue that this virus is not na...,another clue virus natural man made lab leak n...
2,2022-03-07 13:00:06,Shhh! This is an example of how reality can da...,shhh example reality damage approved narrative...
3,2022-03-07 06:59:43,“Hard earned American taxpayer dollars should ...,hard earned american taxpayer dollar goingto l...
4,2022-03-07 06:52:10,"@Shoshin41734407 @OpIndia_com ""two biological ...",two biological warfare lab kiev odessa


In [4]:
# Build a counter over the words of the cleaned tweets.
# NOTE(review): counter_word is a project helper (presumably from utilize_model);
# it is assumed to return a sized word -> count mapping — confirm.
counter = counter_word(df2['cleaned_tweet'])
# Number of entries in the counter; used later to size the tokenizer vocabulary
# and the embedding layer.
num_unique_words = len(counter)

In [5]:
# TEMPORARY: placeholder labels only.
# Assign a random binary label (0 or 1) to every row of df2. The generator is
# seeded so that a fresh Restart & Run All produces the same labels each time.
rng = np.random.default_rng(42)
rand_label = rng.integers(2, size=len(df2))
df2['label'] = rand_label

# Preview the labelled frame, then report the shape of each class subset.
print(df2.head(5))
print()
print(df2.loc[df2['label']==0].shape)
print(df2.loc[df2['label']==1].shape)


                  date                                              tweet  \
0  2022-03-07 14:40:41  Have you missed the origins of covid debate &a...   
1  2022-03-07 13:40:12  This is another clue that this virus is not na...   
2  2022-03-07 13:00:06  Shhh! This is an example of how reality can da...   
3  2022-03-07 06:59:43  “Hard earned American taxpayer dollars should ...   
4  2022-03-07 06:52:10  @Shoshin41734407 @OpIndia_com "two biological ...   

                                       cleaned_tweet  label  
0  missed origin covid debate entirely medicine a...      1  
1  another clue virus natural man made lab leak n...      0  
2  shhh example reality damage approved narrative...      1  
3  hard earned american taxpayer dollar goingto l...      0  
4             two biological warfare lab kiev odessa      0  

(12527, 4)
(12449, 4)


In [6]:
# Split the dataset into train and test parts (33% held out for test).
X = df2['cleaned_tweet'].values
y = df2['label'].values
# random_state is fixed so the same rows land in each split on every run;
# without it the split (and every downstream number) changes per execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16733,), (8243,), (16733,), (8243,))

In [7]:
# Fit the tokenizer on the training texts only; the test texts are not used
# to build the word index.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(X_train)

# word -> integer index, as learned by the tokenizer
unique_word_index = tokenizer.word_index

# inverse lookup (integer index -> word), used to decode sequences back to text
reverse_unique_word_index = {idx: word for word, idx in unique_word_index.items()}

In [8]:
# Encode both splits as lists of integer word indices using the fitted tokenizer.
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Show the first training text next to its encoded form.
print(X_train[0], X_train_seq[0], sep='\n')

biden considers suspending federal gas tax
[2, 3116, 3871, 253, 139, 308]


In [9]:
# Round-trip example: print an encoded training sequence, followed by the text
# that decode_seq recovers from it via the index -> word mapping.
print(
    X_train_seq[0],
    decode_seq(X_train_seq[0], reverse_unique_word_index),
    sep='\n',
)

[2, 3116, 3871, 253, 139, 308]
biden considers suspending federal gas tax


In [10]:
# Length of the longest encoded training sequence (0 when there are none).
max_seq_len = max(map(len, X_train_seq), default=0)
print(max_seq_len)

37


In [11]:
# Bring every sequence to exactly max_seq_len entries: shorter ones are
# zero-filled at the end ('post' padding), longer ones are cut at the end
# ('post' truncating).
X_train_padded, X_test_padded = [
    pad_sequences(seqs, maxlen=max_seq_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
]

X_train_padded.shape, X_test_padded.shape

((16733, 37), (8243, 37))

In [12]:
# LSTM binary classifier.
# Word-embedding background: https://www.tensorflow.org/text/guide/word_embeddings
model = tf.keras.models.Sequential([
    # one 32-dimensional vector per word index, for inputs of max_seq_len tokens
    layers.Embedding(num_unique_words, 32, input_length=max_seq_len),
    # 64-unit LSTM with 10% input dropout
    layers.LSTM(64, dropout=.1),
    # single sigmoid unit for the binary label
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 37, 32)            720096    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 744,993
Trainable params: 744,993
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Output locations for the saved weights and the TensorBoard event files.
model_save_path = 'model/model_lstm.pt'
log_dir = 'model/log'

# Save weights only, and only when val_loss reaches a new minimum.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_save_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
# Write training logs for TensorBoard under log_dir.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_cb, tensorboard_cb]

In [14]:
# Training configuration for the binary classifier.
metrics = ['accuracy']
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# from_logits=False: the model's final Dense layer already applies a sigmoid.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [15]:
# Train for 20 epochs in mini-batches of 32, logging one line per epoch.
# NOTE(review): the test split is passed as validation data, and the checkpoint
# callback selects weights on val_loss — so the test set influences model
# selection. Consider carving out a separate validation split.
training_history = model.fit(
    x=X_train_padded,
    y=y_train,
    validation_data=(X_test_padded, y_test),
    epochs=20,
    batch_size=32,
    callbacks=callbacks,
    verbose=2,
)

Epoch 1/20
523/523 - 7s - loss: 0.6931 - accuracy: 0.5024 - val_loss: 0.6932 - val_accuracy: 0.4967 - 7s/epoch - 13ms/step
Epoch 2/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5045 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 3/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5039 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 4/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5039 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 5/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5039 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 6/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5040 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 7/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5039 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 8/20
523/523 - 3s - loss: 0.6931 - accuracy: 0.5040 - val_loss: 0.6932 - val_accuracy: 0.4967 - 3s/epoch - 6ms/step
Epoch 9/20
523/523 - 3s

In [16]:
# build pre-trained BERT model