## LSTM trainig
This file is to build a lstm binary model to classify the text is rumor or not

### import library

In [7]:
import numpy as np
import pandas as pd
import json
import csv
import nltk
from nltk.tokenize import TweetTokenizer
import re
import math
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense,Conv1D,MaxPooling1D
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

### get initial datasets

In [8]:
x_train_data, x_dev_data, x_test_data = [], [], []
y_train_data, y_dev_data = [], []

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

In [11]:
def clean_text(text: str):
    text = text.lower()
    text = remove_emoji(text)

    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'@(\w+)?', '', text)
    text = re.sub(r'#(\w+)?', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)

    word_list = [w for w in text.split() if w not in stop_words]
    text_clean = ''
    for w in word_list:
        text_clean += (w + ' ')
    if text_clean != '':
        return text_clean
    return ''

In [12]:
with open('./project-data/tweet-train-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        # print(type(tweets), tweets)
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        # print(type(text_event))
        x_train_data.append(text_event)    

with open('./project-data/train.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    # print(type(label_all[1][:-1]), label_all[1][:-1])
    for label in label_all:
        if label[:-1] == 'rumour':
            y_train_data.append(1)
        else:
            y_train_data.append(0)

with open('./project-data/tweet-dev-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        x_dev_data.append(text_event)
        
with open('./project-data/dev.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    for label in label_all:
        if label[:-1] == 'rumour':
            y_dev_data.append(1)
        else:
            y_dev_data.append(0)

In [13]:
with open('./project-data/test.data.txt', 'r', encoding='utf-8') as f:
    id_all = f.readlines()
    for i in range(len(id_all)):
        ids = id_all[i][:-1].split(',')
        text_event = ''
        for j in range(len(ids)):
            file_path = './project-data/tweet-objects/' + ids[j] + '.json'
            with open(file_path, 'r', encoding='utf-8') as f2:
                tweet = json.load(f2)
                text_event += tweet['text']
        x_test_data.append(text_event)

In [None]:
# reading given tsv file
with open("./project-data/train.tsv", 'r') as tsv_file:
    with open("./project-data/train_data.csv", 'w') as csv_file:
        for line in tsv_file:
            csv_file.write(line)
df_train = pd.read_csv('./project-data/train_data.csv',delimiter=',',encoding='utf-8') 
df_dev = pd.read_csv('./project-data/dev_data.csv',delimiter=',',encoding='utf-8')

### text preprocessing

In [14]:
max_words = 5000
max_len = 512
embedding_vecor_length = 512
# X_train, X_test, y_train, y_test = train_test_split(x_train_data, y_train_data, test_size=0.02, random_state=4)

# tokenize words
tokenizer = Tokenizer(num_words=max_words) 
tokenizer.fit_on_texts(x_train_data) 
word_index = tokenizer.word_index

# X_train_sequences = tokenizer.texts_to_sequences(X_train) 
# X_test_sequences = tokenizer.texts_to_sequences(X_test) 

X_train_sequences = tokenizer.texts_to_sequences(x_train_data) 
X_test_sequences = tokenizer.texts_to_sequences(x_dev_data) 


X_train = np.array(X_train_sequences)
X_test = np.array(X_test_sequences)
y_train = np.array(y_train_data)
y_test = np.array(y_dev_data)

X_train_pad = sequence.pad_sequences(X_train, max_len)
X_test_pad = sequence.pad_sequences(X_test, max_len)

### build model

In [15]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,max_len,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.1)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [19]:

model.fit(X_train_pad,y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_2 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_4 (Activation)   (None, 256)               0         
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

NameError: name 'EarlyStopping' is not defined

In [20]:
model = RNN()
model.summary()
# model.compile(loss='binary_crossentropy',optimizer=optimizers.RMSprop(lr=2e-5),metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train_pad, y_train, epochs=10, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_3 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_6 (Activation)   (None, 256)               0         
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x139f589a0>

In [22]:
model2 = RNN()
model2.summary()
# model.compile(loss='binary_crossentropy',optimizer=optimizers.RMSprop(lr=2e-5),metrics=['accuracy'])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model2.fit(X_train_pad, y_train, epochs=10, batch_size=128,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_5 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_10 (Activation)  (None, 256)               0         
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x13ae19880>

In [24]:
model2 = RNN()
model2.summary()
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model2.fit(X_train_pad, y_train, epochs=10, batch_size=256,verbose = 2,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_7 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_7 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_14 (Activation)  (None, 256)               0         
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x13b8c4f70>

In [26]:
model2 = RNN()
model2.summary()
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
filepath="weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model2.fit(X_train_pad, y_train, epochs=10, batch_size=256,verbose = 3,callbacks = callbacks_list,validation_data=(X_test_pad,y_test))

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 512)]             0         
                                                                 
 embedding_9 (Embedding)     (None, 512, 512)          2560000   
                                                                 
 lstm_9 (LSTM)               (None, 64)                147712    
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_18 (Activation)  (None, 256)               0         
                                                                 
 dropout_9 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.callbacks.History at 0x13c4accd0>

In [34]:
scores = model.evaluate(X_test_pad,y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 88.45%


### predict output

In [61]:

x_test_sequences = tokenizer.texts_to_sequences(x_test_clean)

x_data = np.array(x_test_sequences)

x_data_pad = sequence.pad_sequences(x_data, max_len)

In [62]:

y_pred = model.predict(x_data_pad)

In [64]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [66]:
with open('./project-data/lstm-predict.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    header = ['Id', 'Predicted']
    writer.writerow(header)
    for i in range(len(y_pred)):
        y_pred_final = 0
        y_pred_tmp = sigmoid(y_pred[i])
        if y_pred_tmp > 0.511:
            y_pred_final = 1
        data = [i, y_pred_final]
        writer.writerow(data)