In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import random
from treelib import Node, Tree
import networkx as nx

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split

import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import gensim
from gensim.models import KeyedVectors

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, trange, tqdm_notebook

from IPython.core.debugger import set_trace

In [None]:
# Central configuration for the notebook; every value is looked up as args['<key>'].
args = dict(
    # --- dataset locations (file / directory names live under data_dir) ---
    data_dir='../rumor_detection_acl2017/twitter15/',
    tweet_content_file='tweet_contents.txt',
    tree_dir='tree',
    label_file='label.txt',
    w2v='twitter_preprocess_3_w2c_400.txt',

    # --- model and training settings ---
    max_graph_size=50,
    K=2,
    hidden_dim=200,
    target_size=4,
    batch_size=8,
    learning_rate=1e-3,
    n_epoches=70,
    logging_steps=100,
    do_eval=True,
    aggregator='mean',
    n_splits=5,
    seed=1234,
)

In [None]:
def seed_everything(seed=1234):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (default generator plus the
    CUDA generator), exports ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library-level generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Seed once, before any of the data loading or model construction further down.
seed_everything(args['seed'])

## Data

In [None]:
def create_text_processor():
    """Build the ekphrasis ``TextPreProcessor`` used to tokenize tweet text.

    Returns:
        A configured ``TextPreProcessor`` instance.
    """
    # Categories handed to ekphrasis' ``normalize`` option.
    normalized_categories = ['url', 'email', 'percent', 'money', 'phone', 'user',
                             'time', 'date', 'number']

    # Tokens are runs of word characters (regex \w+), so punctuation is dropped.
    word_tokenizer = RegexpTokenizer(r'\w+')

    options = dict(
        normalize=normalized_categories,
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=word_tokenizer.tokenize,
        dicts=[emoticons],
    )
    return TextPreProcessor(**options)

def remove_stopword(tokens):
    """Drop English stop words from ``tokens``, keeping the original order.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not NLTK English
        stop words.
    """
    # A frozenset gives O(1) membership tests; scanning the NLTK list for
    # every token made the filter O(len(tokens) * len(stop_words)).
    stop_words = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def stemming(tokens, ps):
    """Stem each token with the given stemmer.

    Args:
        tokens: Iterable of tokens.
        ps: Object exposing ``stem(token)`` (e.g. an NLTK stemmer).

    Returns:
        List with one stemmed token per input token, in input order.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(ps.stem(token))
    return stemmed

def lemmatizer(tokens, wn):
    """Lemmatize each token with the given lemmatizer.

    Args:
        tokens: Iterable of tokens.
        wn: Object exposing ``lemmatize(token)`` (e.g. an NLTK lemmatizer).

    Returns:
        List with one lemmatized token per input token, in input order.
    """
    return list(wn.lemmatize(token) for token in tokens)

def remove_last_url(tokens):
    """Strip a trailing ``'url'`` token, if there is one.

    Args:
        tokens: Sequence of tokens.

    Returns:
        ``tokens`` without its last element when that element equals
        ``'url'``; otherwise ``tokens`` itself, unchanged.
    """
    # Guard clause: nothing to strip for empty input or a non-url tail.
    if len(tokens) == 0 or tokens[-1] != 'url':
        return tokens
    return tokens[:-1]
    
def pre_process(s):
    """Turn one tweet row into its set of unigram and bigram tuples.

    The text is un-escaped, lower-cased, tokenized by the module-level
    ``text_processor``, filtered for stop words, stemmed with ``ps`` and
    lemmatized with ``wn``.

    Args:
        s: Row-like object with a ``content`` attribute holding the tweet text.

    Returns:
        A set of tuples: every 1-gram ``(word,)`` and every 2-gram
        ``(word, next_word)`` of the processed token sequence. An empty set
        is returned when ``content`` is not a string (pandas stores a missing
        cell as a float NaN).
    """
    text = s.content
    if not isinstance(text, str):
        # Missing content has no tokens; calling str methods on NaN would raise.
        return set()

    # Un-escape "\/" to "/". The backslash is doubled because a bare "\/"
    # is an invalid escape sequence in a Python string literal.
    text = text.replace('\\/', '/').lower()

    tokens = text_processor.pre_process_doc(text)
    tokens = remove_stopword(tokens)
    tokens = stemming(tokens, ps)
    tokens = lemmatizer(tokens, wn)
    # remove_last_url(tokens) is currently not applied.
    return set(ngrams(tokens, 1)) | set(ngrams(tokens, 2))

In [None]:
# Pre-trained word vectors in word2vec text format. The path is built with
# os.path.join (like every other path in this notebook) so it does not depend
# on data_dir ending with a path separator.
word_vectors = KeyedVectors.load_word2vec_format(os.path.join(args['data_dir'], args['w2v']), binary=False)
embed_dim = word_vectors.vector_size

# Module-level helpers read by pre_process().
text_processor = create_text_processor()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def load_tweet_content(tweet_content_file):
    """Load tweets and embed each one as the mean of its n-gram word vectors.

    Args:
        tweet_content_file: Path to a tab-separated file without header whose
            two columns are the tweet id and the tweet text.

    Returns:
        Dict mapping tweet id to a 1-D float tensor of length ``embed_dim``.
        Tweets with no n-gram in the ``word_vectors`` vocabulary (or whose
        mean contains NaN) map to a zero vector.
    """
    def vocab_key(gram):
        # pre_process() emits n-grams as tuples of words, while the word2vec
        # vocabulary is keyed by strings; looking a tuple up directly never
        # matches, which left every embedding at zero. Unigrams map to the
        # word itself.
        # NOTE(review): bigrams are joined with "_" on the assumption that
        # multi-word entries (if any) use that separator - confirm against
        # the vector file; unmatched keys are simply skipped.
        return gram if isinstance(gram, str) else '_'.join(gram)

    def embed_tokens(content_tokens):
        # Sorted so the float summation order does not depend on set ordering.
        keys = sorted({vocab_key(gram) for gram in content_tokens})
        vectors = [word_vectors[key] for key in keys if key in word_vectors]
        if not vectors:
            return torch.zeros((embed_dim, ))

        # Stack into one ndarray first; building a tensor from a list of
        # ndarrays is much slower.
        content_embedding = torch.tensor(np.stack(vectors), dtype=torch.float).mean(dim=0)
        if torch.isnan(content_embedding).any():
            content_embedding = torch.zeros((embed_dim, ))
        return content_embedding

    content_df = pd.read_csv(tweet_content_file, sep='\t', header=None, names=['id', 'content'])

    content_dict = {}
    for _, row in content_df.iterrows():
        content_dict[row['id']] = embed_tokens(pre_process(row))

    return content_dict

In [None]:
def load_rumor_trees(tree_dir_path, content_dict):
    """Build one propagation tree per ``<root_id>.txt`` file in a directory.

    Each non-ROOT line of a tree file is split on single quotes; field 3 is
    read as the parent tweet id and field 9 as the child tweet id. A tweet id
    is added to the tree the first time it appears as a child.

    Args:
        tree_dir_path: Directory containing the tree files.
        content_dict: Mapping from tweet id to its content embedding; ids
            missing from it get a zero vector of length ``embed_dim``.

    Returns:
        Dict mapping root tweet id to ``(tweet_ids, tree)`` where ``tweet_ids``
        lists the node ids in reverse order of first appearance (so the root
        comes last) and ``tree`` is the ``Tree`` holding the embeddings as
        node data.
    """
    def node_content(tweet_id):
        # Tweets without a known embedding are represented by zeros.
        if tweet_id in content_dict:
            return content_dict[tweet_id]
        return torch.zeros((embed_dim, ))

    trees = {}
    # Sorted: os.listdir order is arbitrary, and the order of `trees`
    # determines the sample order seen by the seeded fold split.
    for f in sorted(os.listdir(tree_dir_path)):
        file_path = os.path.join(tree_dir_path, f)

        # Match on the file name's suffix, not on '.txt' anywhere in the path.
        if not (os.path.isfile(file_path) and f.endswith('.txt')):
            continue

        root_id = int(f.split('.')[0])
        tree = Tree()
        tree.create_node(tag=root_id, identifier=root_id, data=node_content(root_id))

        tweet_ids = [root_id]
        seen_ids = {root_id}  # O(1) duplicate check instead of scanning tweet_ids

        with open(file_path, 'r') as file:
            for line in file:
                # Skip blank lines and the ROOT edge (the root already exists).
                if not line.strip() or 'ROOT' in line:
                    continue

                line_arr = line.split("'")
                tweet1 = int(line_arr[3])
                tweet2 = int(line_arr[9])

                if tweet2 in seen_ids:
                    continue
                seen_ids.add(tweet2)
                tweet_ids.append(tweet2)
                tree.create_node(tag=tweet2, identifier=tweet2, parent=tweet1,
                                 data=node_content(tweet2))

        # Parents are created before their children, so reversing puts every
        # child ahead of its parent and the root at the end.
        tweet_ids.reverse()
        trees[root_id] = (tweet_ids, tree)

    return trees

In [None]:
def load_labels(label_file):
    """Read the ``label:tweet_id`` file and encode labels as class indices.

    Args:
        label_file: Path to a headerless file with one ``label:id`` pair per
            line.

    Returns:
        Dict mapping tweet id to class index, using
        unverified=0, non-rumor=1, true=2, false=3.

    Raises:
        ValueError: If the file contains a label outside those four. Mapping
            it silently would yield NaN, which in turn makes the per-row
            values float and corrupts large tweet ids.
    """
    label_to_index = {'unverified': 0, 'non-rumor': 1, 'true': 2, 'false': 3}
    label_df = pd.read_csv(label_file, sep=':', header=None, names=['label', 'id'])

    unknown = sorted(set(label_df['label']) - set(label_to_index), key=str)
    if unknown:
        raise ValueError(f"Unknown label(s) in {label_file}: {unknown}")

    # Zip the columns directly so ids keep their integer dtype.
    return {
        tweet_id: label_to_index[label]
        for label, tweet_id in zip(label_df['label'], label_df['id'])
    }

## Model

In [None]:
class RumorDataset(Dataset):
    """Index-aligned view over three parallel lists: node ids, trees, labels."""

    def __init__(self, ids_list, tree_list, label_list):
        """Store the three parallel lists; entry ``i`` of each is one sample."""
        super().__init__()
        self.ids_list, self.tree_list, self.label_list = ids_list, tree_list, label_list

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.label_list)

    def __getitem__(self, item):
        """Return the ``(ids, tree, label)`` triple stored at position ``item``."""
        sample = tuple(
            column[item]
            for column in (self.ids_list, self.tree_list, self.label_list)
        )
        return sample

In [None]:
class RumorModel(nn.Module):
    """Bottom-up GRU over a propagation tree, followed by a linear classifier.

    Each node's hidden state is a bias-free ``GRUCell`` applied to the node's
    data vector, with the sum of its children's hidden states as the previous
    hidden state. The hidden state of the last node in ``node_list`` is fed to
    the output layer.
    """

    def __init__(self, embed_dim, hidden_dim, target_size):
        """
        Args:
            embed_dim: Length of each node's data vector.
            hidden_dim: Size of the GRU hidden state.
            target_size: Number of output scores.
        """
        super(RumorModel, self).__init__()

        self.hidden_dim = hidden_dim

        self.gru = nn.GRUCell(embed_dim, hidden_dim, bias=False)
        self.linear_out = nn.Linear(hidden_dim, target_size)

    def forward(self, node_list, tree):
        """Score a single tree.

        Args:
            node_list: Node identifiers in processing order. Every node must
                come after all of its children, because the children's hidden
                states are looked up when the node is processed; the last
                entry is the node whose state is classified.
            tree: Object providing ``get_node(id).data`` (1-D tensor of length
                ``embed_dim``) and ``children(id)`` (nodes with ``identifier``).

        Returns:
            1-D tensor of ``target_size`` unnormalized scores.

        Raises:
            ValueError: If ``node_list`` is empty.
        """
        if len(node_list) == 0:
            raise ValueError("node_list must contain at least one node")

        node_out_dict = {}
        for node in node_list:
            node_input = tree.get_node(node).data

            # Allocate the initial state with the GRU weights' dtype/device so
            # the model also works after being moved off the CPU.
            node_hidden = self.gru.weight_hh.new_zeros((self.hidden_dim, ))
            for child in tree.children(node):
                node_hidden = node_hidden + node_out_dict[child.identifier]

            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the hidden axis when hidden_dim == 1.
            node_out_dict[node] = self.gru(
                node_input.unsqueeze(0), node_hidden.unsqueeze(0)
            ).squeeze(0)

        last_node_hidden = node_out_dict[node_list[-1]]
        return self.linear_out(last_node_hidden)

## Train

In [None]:
# Load embeddings, propagation trees and labels from the configured data dir.
content_dict = load_tweet_content(
    os.path.join(args['data_dir'], args['tweet_content_file'])
)
trees = load_rumor_trees(
    os.path.join(args['data_dir'], args['tree_dir']),
    content_dict,
)
label_dict = load_labels(
    os.path.join(args['data_dir'], args['label_file'])
)

# Three parallel lists: entry i of each describes the same tree.
ids_list, tree_list, label_list = [], [], []

for root, (tweet_ids, tree) in trees.items():
    ids_list.append(tweet_ids)
    tree_list.append(tree)
    # The label is looked up by the tree's root tweet id.
    label_list.append(label_dict[root])

In [None]:
# Stratified folds over the label list; fixed random_state for repeatable splits.
splits = list(StratifiedKFold(n_splits=args['n_splits'], shuffle=True, random_state=args['seed']).split(tree_list, label_list))

for idx, (train_idx, val_idx) in enumerate(splits):
    print('Train Fold {}'.format(idx))

    train_ids_list = [ids_list[i] for i in train_idx]
    train_tree_list = [tree_list[i] for i in train_idx]
    train_label_list = [label_list[i] for i in train_idx]

    # Held-out fold; not consumed by the code visible in this cell.
    valid_ids_list = [ids_list[i] for i in val_idx]
    valid_tree_list = [tree_list[i] for i in val_idx]
    valid_label_list = [label_list[i] for i in val_idx]

    # Fresh model and optimizer for every fold.
    model = RumorModel(embed_dim=embed_dim, hidden_dim=args['hidden_dim'],
                       target_size=args['target_size'])
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=args['learning_rate'])

    for epoch in trange(args['n_epoches'], desc='Epoch'):
        model.train()
        tr_loss = 0.0

        # zip() has no length, so pass the total explicitly for the progress bar.
        for tweet_ids, tree, label in tqdm_notebook(zip(train_ids_list, train_tree_list, train_label_list),
                                                    total=len(train_label_list)):
            # One tree per step: add a batch axis of size 1 for the loss.
            preds = model(tweet_ids, tree)
            loss = criterion(preds.unsqueeze(0), torch.tensor(label).unsqueeze(0))

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            tr_loss += loss.item()

        # Average over the samples trained on in this fold. Dividing by the
        # size of the full dataset (label_list) under-reported the loss by a
        # factor of (n_splits - 1) / n_splits.
        train_loss = tr_loss / len(train_label_list)
        print(f"Epoch {epoch}, train loss {train_loss}")