In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import random
from treelib import Node, Tree
import networkx as nx

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split

import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import gensim
from gensim.models import KeyedVectors

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from tqdm import tqdm, trange, tqdm_notebook

from plotly.offline import init_notebook_mode, iplot
import chart_studio.plotly as py
import plotly.graph_objects as go

init_notebook_mode(connected=True)
from IPython.core.debugger import set_trace

In [None]:
# Notebook-wide configuration: dataset selection, file locations and
# model / training hyper-parameters. Every other cell reads from this dict.
args = dict(
    # --- data location ---
    dataset='PHEME',  # Twitter/PHEME
    # data_dir='../rumor_detection_acl2017/twitter_all/',  # Twitter
    data_dir='../pheme_twitter/',  # PHEME
    tweet_content_file='tweet_contents_merge_fix.txt',
    tree_dir='tree',
    label_file='label.txt',
    user_info_file='user_info.txt',
    w2v='twitter_preprocess_4_w2c_400.txt',

    # --- model / training ---
    max_graph_size=50,
    K=2,
    hidden_dim=50,
    target_size=2,  # 4/2
    batch_size=1,
    learning_rate=2e-3,
    n_epoches=40,  # 70/40
    logging_steps=100,
    do_eval=True,
    n_splits=5,
    seed=1234,
)

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator used by this notebook.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (``torch.manual_seed`` and ``torch.cuda.manual_seed``) and configures
    cuDNN for deterministic, non-benchmarked execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library-level generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Prefer reproducibility over cuDNN auto-tuning.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Seed all RNGs once, using the seed from the notebook configuration.
seed_everything(args['seed'])

## Data

In [None]:
def create_text_processor():
    """Build the ekphrasis ``TextPreProcessor`` used to tokenise tweets.

    Returns:
        A ``TextPreProcessor`` configured with the Twitter segmenter and
        corrector, entity normalisation and a plain ``\\w+`` word tokenizer.
    """
    # Entity kinds handed to the pre-processor's ``normalize`` option.
    normalized_entities = ['url', 'email', 'percent', 'money', 'phone', 'user',
                           'time', 'date', 'number']

    # Word-only tokenizer; the alternative
    # SocialTokenizer(lowercase=True).tokenize is currently not used.
    word_tokenizer = RegexpTokenizer(r'\w+').tokenize

    return TextPreProcessor(
        normalize=normalized_entities,
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=word_tokenizer,
        dicts=[emoticons],
    )

def remove_stopword(tokens):
    """Drop English stop words from a list of tokens.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing the tokens that are not NLTK English stop words,
        in their original order.
    """
    # A set gives O(1) membership tests; scanning the stop-word list for every
    # token was needlessly slow. ('url' is currently not treated as a stop word.)
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def stemming(tokens, ps):
    """Stem every token with the given stemmer.

    Args:
        tokens: Iterable of token strings.
        ps: Object exposing ``stem(token)``.

    Returns:
        List of stemmed tokens, in input order.
    """
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word))
    return stemmed

def lemmatizer(tokens, wn):
    """Lemmatize every token with the given lemmatizer.

    Args:
        tokens: Iterable of token strings.
        wn: Object exposing ``lemmatize(token)``.

    Returns:
        List of lemmatized tokens, in input order.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(wn.lemmatize(word))
    return lemmas

def remove_last_url(tokens):
    """Strip a trailing ``'url'`` token, if there is one.

    Args:
        tokens: Sequence of token strings.

    Returns:
        ``tokens[:-1]`` when the last token is ``'url'``; otherwise the input
        object itself, unchanged.
    """
    ends_with_url = len(tokens) > 0 and tokens[-1] == 'url'
    return tokens[:-1] if ends_with_url else tokens
    
def pre_process(s):
    """Turn one tweet row into a list of normalised tokens.

    The text is un-escaped, lower-cased, tokenised with the module-level
    ``text_processor``, stripped of stop words, then stemmed with ``ps`` and
    lemmatized with ``wn`` (both module-level globals).

    Args:
        s: Row-like object with a string ``content`` attribute.

    Returns:
        List of processed token strings.
    """
    # "\\/" is a literal backslash followed by '/', i.e. the escaped slash
    # found in the raw text. The previous spelling "\/" denoted the same string
    # but is an invalid escape sequence that newer Python versions warn about.
    text = s.content.replace("\\/", '/').lower()

    tokens = text_processor.pre_process_doc(text)
    tokens = remove_stopword(tokens)
    tokens = stemming(tokens, ps)
    tokens = lemmatizer(tokens, wn)
    # remove_last_url(tokens) is currently disabled.
    return tokens

In [None]:
# Pre-trained word embeddings (word2vec text format). The path is built with
# os.path.join, like the other loaders, so it no longer depends on
# args['data_dir'] ending with a path separator.
word_vectors = KeyedVectors.load_word2vec_format(os.path.join(args['data_dir'], args['w2v']), binary=False)
embed_dim = word_vectors.vector_size

# Shared NLP helpers, read as module-level globals by pre_process().
text_processor = create_text_processor()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def extract_content_features(s):
    """Compute four hand-crafted surface features for one tweet.

    Args:
        s: Row-like object with a ``content`` attribute (the raw tweet text).

    Returns:
        Float tensor of shape ``(4,)``:
        ``[capital_ratio, has_question_mark, has_exclamation_mark, has_period]``.
        Missing (non-string) or empty content yields an all-zero vector instead
        of raising.
    """
    # Missing content (e.g. NaN coming from pandas) is treated as empty text.
    text = s.content if isinstance(s.content, str) else ''
    text_len = len(text)

    # Guard the ratio against empty text, which used to raise ZeroDivisionError.
    # Kept as a float so the resulting tensor is always floating point.
    capital_ratio = sum(1 for c in text if c.isupper()) / text_len if text_len else 0.0
    question_marks = 1 if '?' in text else 0
    exclamation_marks = 1 if '!' in text else 0
    period_marks = 1 if '.' in text else 0

    return torch.tensor([capital_ratio, question_marks, exclamation_marks, period_marks])

def load_tweet_content(tweet_content_file):
    """Load tweets and turn each one into a fixed-size content feature vector.

    The file is read as tab-separated ``id<TAB>content`` rows without a header.
    Each tweet vector is the concatenation of
    (1) the mean word embedding of its in-vocabulary tokens (``embed_dim``),
    (2) the four features from ``extract_content_features`` and
    (3) the min-max scaled whitespace word count.

    Relies on the module-level ``word_vectors`` and ``embed_dim``.

    Args:
        tweet_content_file: Path to the tweet content file.

    Returns:
        Dict mapping tweet id to a float tensor of length ``embed_dim + 5``.
    """
    def embed_content(s):
        """Average the word vectors of the in-vocabulary tokens of one tweet."""
        vectors = [word_vectors[token] for token in s.content_tokens if token in word_vectors]
        if not vectors:
            # No known token: use a zero embedding explicitly rather than
            # relying on the NaN produced by averaging an empty tensor.
            return torch.zeros((embed_dim, ))

        # Stack into one ndarray first; building a tensor from a Python list of
        # arrays is very slow.
        content_embedding = torch.tensor(np.stack(vectors), dtype=torch.float).mean(dim=0)
        if torch.isnan(content_embedding).any():
            content_embedding = torch.zeros((embed_dim, ))
        return content_embedding

    content_df = pd.read_csv(tweet_content_file, sep='\t', header=None, names=['id', 'content'], lineterminator='\n')
    content_df['content_features'] = content_df.apply(extract_content_features, axis=1)

    # Word count, scaled to [0, 1] over the whole corpus.
    content_df['word_count'] = content_df['content'].apply(lambda x: len(x.split()))
    min_max_scaler = preprocessing.MinMaxScaler()
    content_df['word_count'] = min_max_scaler.fit_transform(
        content_df[['word_count']].values.astype(float)).ravel()

    content_df['content_tokens'] = content_df.apply(pre_process, axis=1)
    content_df['content_embedding'] = content_df.apply(embed_content, axis=1)

    # Iterate the columns directly instead of iterrows(), which builds a Series per row.
    content_dict = {
        tweet_id: torch.cat((embedding, features, torch.tensor([word_count], dtype=torch.float)))
        for tweet_id, embedding, features, word_count in zip(
            content_df['id'],
            content_df['content_embedding'],
            content_df['content_features'],
            content_df['word_count'],
        )
    }

    return content_dict

In [None]:
def load_user_info(user_info_file):
    """Load per-user features and min-max scale the count-like columns.

    The file is read as a tab-separated table with a header. The
    ``has_description`` and ``has_url`` columns are discarded, and
    ``account_age``, ``status_count``, ``follower_count`` and ``friend_count``
    are each scaled to [0, 1].

    Args:
        user_info_file: Path to the user info file.

    Returns:
        Dict mapping ``uid`` to a float32 tensor holding every column after the
        first one (the first column is assumed to be ``uid``; verify against
        the data file).
    """
    user_df = pd.read_csv(user_info_file, sep='\t')
    # Keyword form: the positional ``axis`` argument of drop() was removed in pandas 2.
    user_df = user_df.drop(columns=['has_description', 'has_url'])

    min_max_scaler = preprocessing.MinMaxScaler()
    for column in ('account_age', 'status_count', 'follower_count', 'friend_count'):
        user_df[column] = min_max_scaler.fit_transform(user_df[[column]].values.astype(float))

    # Take the keys from the uid column itself. iterrows() would upcast uid to
    # float alongside the scaled columns, which can lose precision on large ids.
    uids = user_df['uid'].tolist()
    feature_matrix = user_df.iloc[:, 1:].to_numpy(dtype=float)

    user_info_dict = {
        uid: torch.tensor(features, dtype=torch.float32)
        for uid, features in zip(uids, feature_matrix)
    }
    return user_info_dict

In [None]:
def load_rumor_trees(tree_dir_path, avai_ids):
    """Parse every propagation tree file in a directory.

    Each ``<root_id>.txt`` file describes one tree, one edge per line. Fields
    are extracted by splitting the line on single quotes: index 3 is the parent
    tweet id, 7 the child user id, 9 the child tweet id and 11 the child time.
    A line containing ``ROOT`` carries the root tweet's user id.

    Args:
        tree_dir_path: Directory containing the tree files.
        avai_ids: Container of tweet ids with available content; trees whose
            root id is not in it are skipped.

    Returns:
        Dict mapping root tweet id to ``(tweet_ids, tree)``, where ``tweet_ids``
        lists the tree's tweet ids in insertion order (root first) and ``tree``
        is a ``treelib.Tree`` whose node ``data`` is ``(user_id, time)``.
    """
    trees = {}
    for f in os.listdir(tree_dir_path):
        file_path = os.path.join(tree_dir_path, f)
        if not (os.path.isfile(file_path) and f.endswith('.txt')):
            continue

        root_id = int(f.split('.')[0])
        if root_id not in avai_ids:
            continue

        tree = Tree()
        # Placeholder data so the root always has a (user, time) pair, even when
        # the file has no usable ROOT line.
        tree.create_node(tag=root_id, identifier=root_id, data=(None, 0.0))
        tweet_ids = [root_id]

        with open(file_path, 'r') as tree_file:
            for line in tree_file:
                if not line.strip():
                    # Blank lines carry no edge; they used to raise IndexError.
                    continue

                line_arr = line.split("'")
                user2 = int(line_arr[7])
                tweet2 = int(line_arr[9])

                if 'ROOT' in line:
                    # Attach the author to the root node; the root time is 0.
                    root_node = tree.get_node(tweet2)
                    if root_node is not None:
                        root_node.data = (user2, 0.0)
                    continue

                tweet1 = int(line_arr[3])
                tweet2_time = float(line_arr[11])

                # Keep only edges whose parent is already attached and whose
                # child is new. tweet_ids and the tree hold the same ids, so the
                # dict lookup on tree.nodes replaces the former list scan.
                if tweet2 not in tree.nodes and tweet1 in tree.nodes:
                    tree.create_node(tag=tweet2, identifier=tweet2, parent=tweet1, data=(user2, tweet2_time))
                    tweet_ids.append(tweet2)

        trees[root_id] = (tweet_ids, tree)

    return trees

In [None]:
def load_labels(label_file):
    """Load the ``label:id`` file and encode the labels as integers.

    The mapping is selected by the module-level ``args['dataset']``:
    ``'Twitter'`` uses four classes, ``'PHEME'`` uses two.

    Args:
        label_file: Path to a colon-separated file with rows ``<label>:<id>``.

    Returns:
        Dict mapping tweet id to its encoded label.

    Raises:
        ValueError: If ``args['dataset']`` is not a supported dataset name.
    """
    label_maps = {
        'Twitter': {'unverified': 0, 'non-rumor': 1, 'true': 2, 'false': 3},
        'PHEME': {'rumor': 0, 'non-rumor': 1},
    }
    dataset = args['dataset']
    if dataset not in label_maps:
        # Fail here rather than silently returning raw string labels.
        raise ValueError(
            "Unsupported dataset {!r}; expected one of {}".format(dataset, sorted(label_maps)))

    label_df = pd.read_csv(label_file, sep=':', header=None, names=['label', 'id'])
    label_df['label'] = label_df['label'].map(label_maps[dataset])

    label_dict = dict(zip(label_df['id'], label_df['label']))
    return label_dict

## Model

In [None]:
class RumorDataset(Dataset):
    """Dataset of propagation graphs: one (adjacency, features, label) triple per item."""

    def __init__(self, adj_matrix_list, tweet_feature_list, label_list):
        """Store the per-graph inputs and convert the labels to a long tensor.

        Args:
            adj_matrix_list: Sequence of per-graph adjacency matrices.
            tweet_feature_list: Sequence of per-graph node feature matrices.
            label_list: Sequence of integer class labels, one per graph.
        """
        super().__init__()
        self.adj_matrix_list = adj_matrix_list
        self.tweet_feature_list = tweet_feature_list
        self.label_list = torch.tensor(label_list, dtype=torch.long)

    def __len__(self):
        """Number of graphs, taken from the label tensor."""
        return len(self.label_list)

    def __getitem__(self, item):
        """Return ``(adjacency, features, label)`` for the graph at index ``item``."""
        adjacency = self.adj_matrix_list[item]
        features = self.tweet_feature_list[item]
        label = self.label_list[item]
        return (adjacency, features, label)

In [None]:
class RumorModel(nn.Module):
    """GRU-cell model that walks the nodes of a propagation graph in index order.

    For node ``i`` the GRU input is the node's feature vector and the incoming
    hidden state is the adjacency-weighted sum of the hidden states stored so
    far. The hidden state of the last node is projected to class logits.
    """

    def __init__(self, input_dim, hidden_dim, target_size):
        """
        Args:
            input_dim: Size of each node feature vector.
            hidden_dim: Size of the GRU hidden state.
            target_size: Number of output classes.
        """
        super(RumorModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.gru = nn.GRUCell(input_dim, hidden_dim, bias=False)
        self.linear_out = nn.Linear(hidden_dim, target_size)

    def forward(self, adj, feature):
        """Compute class logits for a batch of graphs.

        Args:
            adj: Tensor of shape ``(batch, n_nodes, n_nodes)``; row ``i`` weights
                the hidden states that feed node ``i``.
            feature: Tensor of shape ``(batch, n_nodes, input_dim)``.

        Returns:
            Raw (un-normalised) logits of shape ``(batch, target_size)``, taken
            from the hidden state of the last node. ``n_nodes`` must be >= 1.
        """
        batch_size, n_nodes = adj.shape[0], adj.shape[1]
        # Allocate on the input's device rather than a hard-coded 'cuda', so the
        # model also runs on CPU (and on whichever GPU holds the inputs).
        hidden_feature = torch.zeros((batch_size, n_nodes, self.hidden_dim),
                                     device=feature.device, dtype=torch.float)
        for i in range(n_nodes):
            node_adj = adj[:, i, :]
            # (batch, 1, n_nodes) x (batch, n_nodes, hidden) -> (batch, 1, hidden)
            node_hidden_input = torch.bmm(node_adj.unsqueeze(1), hidden_feature)
            node_hidden = self.gru(feature[:, i, :], node_hidden_input.squeeze(1))
            hidden_feature[:, i, :] = node_hidden

        # Softmax is not applied here; the caller receives raw logits.
        output = self.linear_out(node_hidden)
        return output

## Train

In [None]:
# Load every input table from args['data_dir'].
content_dict = load_tweet_content(os.path.join(args['data_dir'], args['tweet_content_file']))
user_info_dict = load_user_info(os.path.join(args['data_dir'], args['user_info_file']))
# Only trees whose root tweet has content are kept.
trees = load_rumor_trees(os.path.join(args['data_dir'], args['tree_dir']), avai_ids=content_dict.keys())
label_dict = load_labels(os.path.join(args['data_dir'], args['label_file']))

# Per-node input layout: [word embedding | extra content | user | time].
extra_content_dim = 5  # 4 surface features from extract_content_features + scaled word count
user_dim = 5  # assumes each user vector from load_user_info has 5 entries - TODO confirm against user_info.txt
time_dim = 1  # single normalised time value per tweet
input_dim = embed_dim + extra_content_dim + user_dim + time_dim

In [None]:
# Largest time value (node.data[1]) among the tweets kept after truncating each
# tree to args['max_graph_size']; later used to normalise the time feature.
max_time = 0
for root_id, (tweet_ids, tree) in trees.items():
    for tweet in tweet_ids[:args['max_graph_size']]:
        node = tree.get_node(tweet)
        if node is None:
            # Listed tweet that is missing from the tree: report its id.
            print(tweet)
            continue
        max_time = max(max_time, node.data[1])

In [None]:
# Build one (adjacency, node features, label) triple per propagation tree.
adj_matrix_list = []
tweet_feature_list = []
label_list = []

# Normaliser for the time feature. Fall back to 1.0 when no positive time was
# seen, which would otherwise make the division below fail.
time_scale = max_time if max_time > 0 else 1.0

for root, (tweet_ids, tree) in trees.items():
    # Keep the first max_graph_size tweets, then reverse the order so that
    # children come before their parents and the root is the last node. The
    # slice is a copy, so the list stored in `trees` is left untouched.
    tweet_ids = tweet_ids[:args['max_graph_size']]
    tweet_ids.reverse()

    # Tweet id -> row index; replaces repeated O(n) `in` / `.index` list scans.
    position = {tweet: i for i, tweet in enumerate(tweet_ids)}
    n_nodes = len(tweet_ids)

    tweet_feature = torch.zeros((n_nodes, input_dim))
    adj_matrix = torch.zeros((n_nodes, n_nodes))

    for i, tweet in enumerate(tweet_ids):
        node = tree.get_node(tweet)
        user_id = node.data[0]

        # Missing content or user info falls back to an all-zero vector.
        content_feature = content_dict[tweet] if tweet in content_dict else torch.zeros((embed_dim + extra_content_dim, ))
        social_feature = user_info_dict[user_id] if user_id in user_info_dict else torch.zeros((user_dim))
        time_feature = torch.tensor([node.data[1] / time_scale])
        tweet_feature[i] = torch.cat((content_feature, social_feature, time_feature))

        # Row i marks the children of node i that survived the truncation.
        for child in tree.children(tweet):
            j = position.get(child.identifier)
            if j is not None:
                adj_matrix[i, j] = 1

    tweet_feature_list.append(tweet_feature)
    adj_matrix_list.append(adj_matrix)
    label_list.append(label_dict[root])

In [None]:
# Stratified K-fold split over the graphs (stratified on the graph label).
cv = StratifiedKFold(n_splits=args['n_splits'], shuffle=True, random_state=args['seed'])
splits = list(cv.split(tweet_feature_list, label_list))

# One (fold, epoch) buffer per tracked metric.
metric_shape = (args['n_splits'], args['n_epoches'])

fold_train_acc = np.zeros(metric_shape)
fold_train_loss = np.zeros(metric_shape)
fold_val_acc = np.zeros(metric_shape)
fold_val_loss = np.zeros(metric_shape)

fold_unverified_f1 = np.zeros(metric_shape)
fold_non_f1 = np.zeros(metric_shape)
fold_true_f1 = np.zeros(metric_shape)
fold_false_f1 = np.zeros(metric_shape)
fold_macro_f1 = np.zeros(metric_shape)

# Cross-validated training: a fresh model per fold, evaluated after every epoch.
for idx, (train_idx, val_idx) in enumerate(splits):
    print('Train Fold {}'.format(idx))

    # Graphs have different node counts, so they are fed one at a time
    # (args['batch_size'] is 1 in the configuration above).
    train_dataset = RumorDataset([adj_matrix_list[i] for i in train_idx],
                                 [tweet_feature_list[i] for i in train_idx],
                                 [label_list[i] for i in train_idx])
    train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, num_workers=8)

    valid_dataset = RumorDataset([adj_matrix_list[i] for i in val_idx],
                                 [tweet_feature_list[i] for i in val_idx],
                                 [label_list[i] for i in val_idx])
    valid_loader = DataLoader(valid_dataset, batch_size=args['batch_size'], shuffle=False, num_workers=8)

    model = RumorModel(input_dim=input_dim, hidden_dim=args['hidden_dim'],
                       target_size=args['target_size']).cuda()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=args['learning_rate'])

    model.zero_grad()
    for epoch in trange(args['n_epoches'], desc='Epoch'):
        # ---- training pass ----
        model.train()
        tr_loss = 0.0

        pred_labels = None
        true_labels = np.array([])
        for index, (adj, feature, label) in enumerate(tqdm_notebook(train_loader, desc='Batch')):
            adj = adj.cuda()
            feature = feature.cuda()
            label = label.cuda()

            preds = model(adj, feature)
            loss = criterion(preds, label)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            tr_loss += loss.item()

            # Accumulate labels and raw logits for the epoch-level accuracy.
            true_labels = np.append(true_labels, label.cpu().numpy(), axis=0)
            if pred_labels is None:
                pred_labels = preds.detach().cpu().numpy()
            else:
                pred_labels = np.append(pred_labels, preds.detach().cpu().numpy(), axis=0)

        pred_labels = np.argmax(pred_labels, axis=1)
        train_acc = accuracy_score(true_labels, pred_labels)
        train_loss = tr_loss / len(train_loader)
        fold_train_acc[idx, epoch] = train_acc
        fold_train_loss[idx, epoch] = train_loss
        print(f"Epoch {epoch}, train loss {train_loss}, train accuracy {train_acc}")

        # ---- validation pass ----
        model.eval()
        vl_loss = 0.0
        pred_labels = None
        true_labels = np.array([])
        # No gradients are needed for evaluation; disabling autograd avoids
        # building a graph for every validation sample.
        with torch.no_grad():
            for index, (adj, feature, label) in enumerate(tqdm_notebook(valid_loader, desc='Valid Batch')):
                adj = adj.cuda()
                feature = feature.cuda()
                label = label.cuda()

                preds = model(adj, feature)
                loss = criterion(preds, label)

                vl_loss += loss.item()
                true_labels = np.append(true_labels, label.cpu().numpy(), axis=0)
                if pred_labels is None:
                    pred_labels = preds.detach().cpu().numpy()
                else:
                    pred_labels = np.append(pred_labels, preds.detach().cpu().numpy(), axis=0)

        pred_labels = np.argmax(pred_labels, axis=1)
        val_acc = accuracy_score(true_labels, pred_labels)
        val_loss = vl_loss / len(valid_loader)

        if args['dataset'] == 'Twitter':
            # Per-class F1 in label order (0..3) plus the macro average.
            f1 = f1_score(true_labels, pred_labels, labels=[0, 1, 2, 3], average=None)
            macro_f1 = f1_score(true_labels, pred_labels, average='macro')

            fold_unverified_f1[idx, epoch] = f1[0]
            fold_non_f1[idx, epoch] = f1[1]
            fold_true_f1[idx, epoch] = f1[2]
            fold_false_f1[idx, epoch] = f1[3]
            fold_macro_f1[idx, epoch] = macro_f1
        elif args['dataset'] == 'PHEME':
            # Binary metrics with label 0 as the positive class.
            f1 = f1_score(true_labels, pred_labels, pos_label=0)
            precision = precision_score(true_labels, pred_labels, pos_label=0)
            recall = recall_score(true_labels, pred_labels, pos_label=0)

            # NOTE: the Twitter buffers are reused here, so their names do not
            # match their content: fold_true_f1 holds precision, fold_non_f1
            # holds recall and fold_macro_f1 holds the positive-class F1.
            fold_true_f1[idx, epoch] = precision
            fold_non_f1[idx, epoch] = recall
            fold_macro_f1[idx, epoch] = f1

        fold_val_acc[idx, epoch] = val_acc
        fold_val_loss[idx, epoch] = val_loss
        print(f"Fold {idx}, Epoch {epoch}, valid loss {val_loss}, valid accuracy {val_acc}", flush=True)

In [None]:
# Pick the epoch(s) with the highest validation accuracy averaged over folds.
# Despite its name, max_fold holds epoch indices (the np.where result tuple).
mean_val_acc = fold_val_acc.mean(axis=0)
best_val_acc = max(mean_val_acc)
print(best_val_acc)
max_fold = np.where(mean_val_acc == best_val_acc)
print(max_fold)

In [None]:
# Report the fold-averaged validation metrics at the best epoch(s).
if args['dataset'] == 'PHEME':
    # For PHEME the training loop stores the rumor-class (label 0) F1, recall
    # and precision in fold_macro_f1, fold_non_f1 and fold_true_f1, so they are
    # printed under the names of what they actually contain. The
    # false/unverified buffers are never filled for this dataset.
    print('f1 (rumor)', fold_macro_f1.mean(axis=0)[max_fold])
    print('recall (rumor)', fold_non_f1.mean(axis=0)[max_fold])
    print('precision (rumor)', fold_true_f1.mean(axis=0)[max_fold])
else:
    print('macro f1', fold_macro_f1.mean(axis=0)[max_fold])
    print('non-rumor', fold_non_f1.mean(axis=0)[max_fold])
    print('true', fold_true_f1.mean(axis=0)[max_fold])
    print('false', fold_false_f1.mean(axis=0)[max_fold])
    print('unverified', fold_unverified_f1.mean(axis=0)[max_fold])

## Result

In [None]:
def plot_result(info):
    """Plot fold-averaged train/validation curves over the epochs.

    Args:
        info: ``'acc'`` plots accuracy; any other value plots loss.
    """
    # Choose the title and the pair of (fold, epoch) buffers to draw.
    if info == 'acc':
        title, train_curve, valid_curve = "Accuracy", fold_train_acc, fold_val_acc
    else:
        title, train_curve, valid_curve = "Loss", fold_train_loss, fold_val_loss

    epochs = np.arange(args['n_epoches'])
    traces = [
        go.Scatter(x=epochs, y=np.array(train_curve.mean(axis=0)), name='Train'),
        go.Scatter(x=epochs, y=np.array(valid_curve.mean(axis=0)), name='Valid'),
    ]

    fig = go.Figure(data=traces, layout={"title": title})
    iplot(fig)

In [None]:
# Draw the fold-averaged accuracy curves, then the loss curves.
plot_result(info='acc')
plot_result(info='loss')