In [53]:
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.tokenize import TweetTokenizer
import re
import math
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense,Conv1D,MaxPooling1D
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

In [2]:
x_train_data, x_dev_data, x_test_data = [], [], []
y_train_data, y_dev_data = [], []

In [5]:
stop_words = set(stopwords.words('english'))

In [3]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

In [6]:
def clean_text(text: str):
    text = text.lower()
    text = remove_emoji(text)

    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'@(\w+)?', '', text)
    text = re.sub(r'#(\w+)?', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)

    word_list = [w for w in text.split() if w not in stop_words]
    text_clean = ''
    for w in word_list:
        text_clean += (w + ' ')
    if text_clean != '':
        return text_clean
    return ''

In [10]:
with open('./project-data/tweet-train-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        # print(type(tweets), tweets)
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        # print(type(text_event))
        x_train_data.append(text_event)    

with open('./project-data/train.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    # print(type(label_all[1][:-1]), label_all[1][:-1])
    for label in label_all:
        if label[:-1] == 'rumour':
            y_train_data.append(1)
        else:
            y_train_data.append(0)

with open('./project-data/tweet-dev-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        x_dev_data.append(text_event)
        
with open('./project-data/dev.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    for label in label_all:
        if label[:-1] == 'rumour':
            y_dev_data.append(1)
        else:
            y_dev_data.append(0)

In [26]:
with open('./project-data/test.data.txt', 'r', encoding='utf-8') as f:
    id_all = f.readlines()
    for i in range(len(id_all)):
        ids = id_all[i][:-1].split(',')
        text_event = ''
        for j in range(len(ids)):
            file_path = './project-data/tweet-objects/' + ids[j] + '.json'
            with open(file_path, 'r', encoding='utf-8') as f2:
                tweet = json.load(f2)
                text_event += tweet['text']
        x_test_data.append(text_event)

In [None]:
# reading given tsv file
with open("./project-data/train.tsv", 'r') as tsv_file:
    with open("./project-data/train_data.csv", 'w') as csv_file:
        for line in tsv_file:
            csv_file.write(line)
df_train = pd.read_csv('./project-data/train_data.csv',delimiter=',',encoding='utf-8') 
df_dev = pd.read_csv('./project-data/dev_data.csv',delimiter=',',encoding='utf-8')

In [20]:
# preprocessing
max_words = 5000
max_len = 512
embedding_vecor_length = 512
# X_train, X_test, y_train, y_test = train_test_split(x_train_data, y_train_data, test_size=0.02, random_state=4)

# tokenize words
tokenizer = Tokenizer(num_words=max_words) 
tokenizer.fit_on_texts(x_train_data) 
word_index = tokenizer.word_index

# X_train_sequences = tokenizer.texts_to_sequences(X_train) 
# X_test_sequences = tokenizer.texts_to_sequences(X_test) 

X_train_sequences = tokenizer.texts_to_sequences(x_train_data) 
X_test_sequences = tokenizer.texts_to_sequences(x_dev_data) 


X_train = np.array(X_train_sequences)
X_test = np.array(X_test_sequences)
y_train = np.array(y_train_data)
y_test = np.array(y_dev_data)

X_train_pad = sequence.pad_sequences(X_train, max_len)
X_test_pad = sequence.pad_sequences(X_test, max_len)

In [22]:
model = Sequential()
model.add(Embedding(max_words, embedding_vecor_length, input_length=max_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train_pad, y_train, epochs=5, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_3 (LSTM)               (None, 100)               245200    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 2,805,301
Trainable params: 2,805,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'sequential_4/embedding_3/embedding_lookup' defined at (most recent call last):
    File "/usr/local/Cellar/python@3.9/3.9.2_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/local/Cellar/python@3.9/3.9.2_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/usr/local/lib/python3.9/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/usr/local/lib/python3.9/site-packages/traitlets/config/application.py", line 845, in launch_instance
      app.start()
    File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 668, in start
      self.io_loop.start()
    File "/usr/local/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/local/Cellar/python@3.9/3.9.2_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
      self._run_once()
    File "/usr/local/Cellar/python@3.9/3.9.2_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
      handle._run()
    File "/usr/local/Cellar/python@3.9/3.9.2_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 456, in dispatch_queue
      await self.process_one()
    File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 445, in process_one
      await dispatch(*args)
    File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 352, in dispatch_shell
      await result
    File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 647, in execute_request
      reply_content = await reply_content
    File "/usr/local/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 335, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/usr/local/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 532, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
      result = self._run_cell(
    File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
      return runner(coro)
    File "/usr/local/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/np/6wcw_zfs1cvg8nx5694wg55m0000gn/T/ipykernel_4667/989157803.py", line 10, in <module>
      model.fit(X_train_pad, y_train, epochs=5, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 859, in train_step
      y_pred = self(x, training=True)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/sequential.py", line 374, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/functional.py", line 451, in call
      return self._run_internal_graph(
    File "/usr/local/lib/python3.9/site-packages/keras/engine/functional.py", line 589, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/site-packages/keras/layers/embeddings.py", line 197, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential_4/embedding_3/embedding_lookup'
indices[249,466] = 6703 is not in [0, 5000)
	 [[{{node sequential_4/embedding_3/embedding_lookup}}]] [Op:__inference_train_function_16894]

In [31]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,max_len,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.1)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [None]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

In [32]:
model = RNN()
model.summary()
# model.compile(loss='binary_crossentropy',optimizer=optimizers.RMSprop(lr=2e-5),metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train_pad, y_train, epochs=10, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 512, 512)          25600000  
                                                                 
 lstm_4 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_4 (Activation)   (None, 256)               0         
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x12c93f130>

In [16]:
model = RNN()
model.summary()
# model.compile(loss='binary_crossentropy',optimizer=optimizers.RMSprop(lr=2e-5),metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train_pad, y_train, epochs=10, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 512, 512)          25600000  
                                                                 
 lstm_1 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x12bd23670>

In [34]:
scores = model.evaluate(X_test_pad,y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 88.45%


In [59]:
x_test_clean = []
for x_line in  x_test_data:
    x_clean = clean_text(x_line)
    x_test_clean.append(x_clean)



In [61]:

x_test_sequences = tokenizer.texts_to_sequences(x_test_clean)

x_data = np.array(x_test_sequences)

x_data_pad = sequence.pad_sequences(x_data, max_len)

In [62]:

y_pred = model.predict(x_data_pad)

In [63]:
y_pred

array([[3.43769789e-04],
       [4.10810113e-03],
       [8.73565674e-04],
       [5.18548489e-03],
       [2.32696533e-04],
       [9.68241692e-03],
       [4.11680341e-03],
       [9.99896348e-01],
       [1.93968207e-01],
       [8.53377461e-01],
       [9.99990046e-01],
       [3.78409564e-01],
       [1.76239014e-03],
       [8.02881718e-02],
       [1.62856162e-01],
       [8.07473589e-06],
       [3.80039215e-04],
       [2.33501196e-04],
       [5.46579504e-05],
       [2.22547424e-05],
       [9.49257612e-03],
       [3.62992287e-04],
       [1.01499034e-04],
       [9.99370635e-01],
       [6.50075078e-03],
       [1.24394894e-04],
       [3.37150777e-05],
       [3.93938243e-01],
       [5.99564919e-05],
       [8.13020527e-01],
       [1.00032985e-02],
       [9.99990284e-01],
       [9.94672537e-01],
       [6.86019659e-04],
       [6.90858869e-05],
       [5.54626968e-05],
       [1.48361969e-05],
       [1.05062127e-03],
       [2.50697136e-04],
       [7.42820621e-01],


In [64]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [66]:
with open('./project-data/lstm-predict.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    header = ['Id', 'Predicted']
    writer.writerow(header)
    for i in range(len(y_pred)):
        y_pred_final = 0
        y_pred_tmp = sigmoid(y_pred[i])
        if y_pred_tmp > 0.511:
            y_pred_final = 1
        data = [i, y_pred_final]
        writer.writerow(data)