In [18]:
# https://www.sciencedirect.com/science/article/pii/S0306457318307957

In [19]:
import os
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from operator import itemgetter
import torch

import gensim
from gensim.models import KeyedVectors
from tqdm import tqdm, trange, tqdm_notebook
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from plotly.offline import init_notebook_mode, iplot
import chart_studio.plotly as py
import plotly.graph_objects as go

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import networkx as nx
import random
import gc
from IPython.core.debugger import set_trace

# Run a garbage-collection pass before the heavy data loading below.
gc.collect()
# Initialise plotly's offline notebook rendering.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [20]:
# Experiment configuration read by the cells below.
args = {
    # Selects the label mapping in load_labels() and the metrics computed in the
    # cross-validation cell. Supported values: 'Twitter', 'PHEME'.
    'dataset': 'Twitter', #Twitter/PHEME     
    # Directory that holds every input file named below (absolute path on the original machine).
    'data_dir': '/data/rumor_detection/data/rumor_acl/rumor_detection_acl2017/twitter_all/', #Twitter
#     'data_dir': '/data/rumor_detection/data/pheme/pheme_v2_extend/pheme_twitter/', #PHEME
    # Tab-separated file of (tweet id, tweet text), read by load_tweet_content().
    'tweet_content_file': 'tweet_contents_merge_fix.txt',
    # Sub-directory with one propagation-tree .txt file per source tweet, read by load_rumor_graphs().
    'tree_dir': 'tree',
    # 'label:id' lines, read by load_labels().
    'label_file': 'label.txt',
    # Tab-separated per-user features, read by load_user_info().
    'user_info_file': 'user_info.txt',
    # Word vectors in word2vec text format, loaded with KeyedVectors.load_word2vec_format().
    'w2v': 'twitter_preprocess_4_w2c_400.txt',

    # Number of leading tweets per graph considered when computing max_time.
    'max_graph_size': 50,
    # Number of folds for StratifiedKFold.
    'n_splits': 5,
    # Seed passed to seed_everything(), StratifiedKFold and the classifier.
    'seed': 1234,
}

In [21]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU (manual_seed only seeds the
    # current one) and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, so disable it.
    torch.backends.cudnn.benchmark = False
# Apply the configured seed once, before any data loading or model fitting.
seed_everything(args['seed'])

In [22]:
# Load the pre-trained word vectors from the data directory; binary=False means
# the file is in word2vec *text* format.
word_vectors = KeyedVectors.load_word2vec_format(args['data_dir'] + args['w2v'], binary=False)
# Length of a single word vector; used below to size zero vectors and the model input.
embed_dim = word_vectors.vector_size

In [23]:
def create_text_processor():
    """Build the ekphrasis ``TextPreProcessor`` used to tokenize tweet text.

    Returns:
        A configured ``TextPreProcessor`` instance.
    """
    # Term categories handed to ekphrasis' ``normalize`` option.
    normalized_terms = ['url', 'email', 'percent', 'money', 'phone', 'user',
                        'time', 'date', 'number']

    # Plain word tokenizer: keeps runs of word characters only.
    # Alternative kept for reference: SocialTokenizer(lowercase=True).tokenize
    word_tokenizer = RegexpTokenizer(r'\w+')

    options = dict(
        normalize=normalized_terms,
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=word_tokenizer.tokenize,
        dicts=[emoticons],
    )
    return TextPreProcessor(**options)

def remove_stopword(tokens):
    """Return ``tokens`` without NLTK's English stop words.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    # A set makes each membership test O(1); the list returned by NLTK would be
    # scanned once per token.
    stop_words = set(stopwords.words('english'))
#     stop_words.add('url')
    return [word for word in tokens if word not in stop_words]

def stemming(tokens, ps):
    """Apply ``ps.stem`` to every token and return the results as a list.

    Args:
        tokens: Iterable of tokens.
        ps: Object exposing a ``stem(token)`` method.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(ps.stem(token))
    return stemmed

def lemmatizer(tokens, wn):
    """Apply ``wn.lemmatize`` to every token and return the results as a list.

    Args:
        tokens: Iterable of tokens.
        wn: Object exposing a ``lemmatize(token)`` method.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(wn.lemmatize(token))
    return lemmas

def remove_last_url(tokens):
    """Drop a trailing ``'url'`` token, if there is one.

    Returns a shortened copy when the last token is ``'url'``; otherwise the
    input sequence itself is returned unchanged.
    """
    ends_with_url = len(tokens) > 0 and tokens[-1] == 'url'
    return tokens[:-1] if ends_with_url else tokens
    
def pre_process(s):
    """Turn the raw text of one tweet row into a list of normalized tokens.

    Pipeline: un-escape ``\\/`` to ``/``, lowercase, tokenize with the
    module-level ``text_processor``, drop English stop words, stem with ``ps``
    and finally lemmatize with ``wn``.

    Args:
        s: Row-like object with a string ``content`` attribute.

    Returns:
        List of processed tokens.
    """
    text = s.content
    # The backslash is escaped explicitly: "\/" is an invalid escape sequence
    # that only works by accident and triggers a warning on current Python.
    text = text.replace("\\/", '/')
    text = text.lower()

    tokens = text_processor.pre_process_doc(text)
    tokens = remove_stopword(tokens)
    tokens = stemming(tokens, ps)
    tokens = lemmatizer(tokens, wn)
    # tokens = remove_last_url(tokens)
    return tokens

In [24]:
# Module-level helpers that pre_process() reads as globals; they must exist
# before load_tweet_content() is called.
text_processor = create_text_processor()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

def extract_content_features(s):
    """Compute four surface features of a tweet's raw text.

    Args:
        s: Row-like object with a string ``content`` attribute.

    Returns:
        1-D float tensor ``[capital_ratio, has_question_mark,
        has_exclamation_mark, has_period]`` where the ratio is the share of
        upper-case characters and the other three are 0/1 flags.
    """
    text = s.content
    text_len = len(text)
    # An empty tweet has no characters to take a ratio over; report 0 instead
    # of dividing by zero.
    capital_ratio = sum(1 for c in text if c.isupper()) / text_len if text_len else 0.0
    question_marks = 1 if '?' in text else 0
    exclamation_marks = 1 if '!' in text else 0
    period_marks = 1 if '.' in text else 0
#     word_count = len(text.split())

    return torch.tensor([capital_ratio, question_marks, exclamation_marks, period_marks], dtype=torch.float)

def load_tweet_content(tweet_content_file):
    """Load tweets and build one content vector per tweet id.

    The file is read as header-less, tab-separated text with two columns
    (``id``, ``content``). Each tweet's vector is the concatenation of:

    * the mean word vector of its pre-processed, in-vocabulary tokens
      (``embed_dim`` values, all zeros when no token is in the vocabulary),
    * the four features from ``extract_content_features``,
    * the whitespace-split word count, min-max scaled over the whole file.

    Args:
        tweet_content_file: Path to the tweet content file.

    Returns:
        Dict mapping tweet id to a 1-D float tensor of length ``embed_dim + 5``.
    """
    def embed_content(s):
        """Average the word vectors of the row's in-vocabulary tokens."""
        vectors = [word_vectors[token] for token in s.content_tokens if token in word_vectors]
        if not vectors:
            # Nothing to average: fall back to the zero vector explicitly
            # instead of relying on the NaN mean of an empty tensor.
            return torch.zeros((embed_dim, ))

        # Stack into a single ndarray first; building a tensor straight from a
        # list of ndarrays is very slow.
        content_embedding = torch.tensor(np.array(vectors), dtype=torch.float).mean(dim=0)
        if torch.isnan(content_embedding).any():
            content_embedding = torch.zeros((embed_dim, ))
        return content_embedding

    content_df = pd.read_csv(tweet_content_file, sep='\t', header=None, names=['id', 'content'])
    content_df['content_features'] = content_df.apply(extract_content_features, axis=1)
    content_df['word_count'] = content_df['content'].apply(lambda x: len(x.split()))
    min_max_scaler = preprocessing.MinMaxScaler()
    content_df['word_count'] = min_max_scaler.fit_transform(content_df[['word_count']].values.astype(float))

    content_df['content_tokens'] = content_df.apply(pre_process, axis=1)
    content_df['content_embedding'] = content_df.apply(embed_content, axis=1)

    # Iterate the needed columns directly (cheaper than iterrows) and pin the
    # word-count dtype so every concatenated part is float32.
    content_dict = {
        tweet_id: torch.cat((embedding, features, torch.tensor([word_count], dtype=torch.float)))
        for tweet_id, embedding, features, word_count in zip(
            content_df['id'],
            content_df['content_embedding'],
            content_df['content_features'],
            content_df['word_count'],
        )
    }

    return content_dict

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [25]:
def load_user_info(user_info_file):
    """Load per-user features and return them as tensors keyed by user id.

    The file is read as tab-separated text with a header. The
    ``has_description`` and ``has_url`` columns are dropped, and
    ``account_age``, ``status_count``, ``follower_count`` and ``friend_count``
    are min-max scaled independently. Every column after the first one is used
    as a feature.
    NOTE(review): this assumes ``uid`` is the first column — confirm against
    the file.

    Args:
        user_info_file: Path to the user info file.

    Returns:
        Dict mapping ``uid`` to a 1-D float32 tensor of that user's features.
    """
    user_df = pd.read_csv(user_info_file, sep='\t')
    # Keyword form: the positional ``axis`` argument of drop() was removed in pandas 2.0.
    user_df = user_df.drop(columns=['has_description', 'has_url'])

    min_max_scaler = preprocessing.MinMaxScaler()
    for column in ('account_age', 'status_count', 'follower_count', 'friend_count'):
        user_df[column] = min_max_scaler.fit_transform(user_df[[column]].values.astype(float))

    # Take ids and features from their own columns. iterrows() would upcast the
    # whole row (uid included) to float64, which cannot represent large
    # integer ids exactly.
    feature_rows = user_df.iloc[:, 1:].to_numpy(dtype=float)
    user_info_dict = {
        uid: torch.tensor(features, dtype=torch.float32)
        for uid, features in zip(user_df['uid'], feature_rows)
    }
    return user_info_dict

In [26]:
def load_rumor_graphs(tree_dir_path, avai_ids):
    """Build one propagation graph per tree file in ``tree_dir_path``.

    Every ``<root_id>.txt`` file whose ``root_id`` is in ``avai_ids`` is parsed
    line by line. Each line is split on single quotes; fields 1/3/5 are read as
    the parent's (user id, tweet id, time) and fields 7/9/11 as the child's.
    Lines containing ``ROOT`` only register the child node; all other lines add
    both nodes and a child -> parent edge (self-references are skipped).

    A tweet's level is 0 for the root and for any parent not seen before;
    a child seen for the first time gets its parent's level + 1.

    Args:
        tree_dir_path: Directory containing the tree files.
        avai_ids: Container of tweet ids; files whose root id is not in it are skipped.

    Returns:
        Dict mapping root id to ``(tweet_ids, G)`` where ``tweet_ids`` is the
        list of tweet ids sorted by level and ``G`` is an ``nx.DiGraph`` whose
        nodes carry ``user`` and ``time`` attributes.
    """
    graphs = {}
    for f in os.listdir(tree_dir_path):
        file_path = os.path.join(tree_dir_path, f)

        # Test the file name's suffix; a substring test on the full path would
        # also match '.txt' appearing in a directory name or mid-name.
        if not (os.path.isfile(file_path) and f.endswith('.txt')):
            continue

        root_id = int(f.split('.')[0])
        if root_id not in avai_ids:
            continue

        G = nx.DiGraph()
        G.add_node(root_id)
        tweet_level = {root_id: 0}
        with open(file_path, 'r') as file:
            for line in file:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue

                line_arr = line.split("'")
                if 'ROOT' in line:
                    user2 = int(line_arr[7])
                    tweet2 = int(line_arr[9])
                    tweet2_time = float(line_arr[11])

                    G.add_node(tweet2, user=user2, time=tweet2_time)
                else:
                    user1 = int(line_arr[1])
                    tweet1 = int(line_arr[3])
                    tweet1_time = float(line_arr[5])
                    user2 = int(line_arr[7])
                    tweet2 = int(line_arr[9])
                    tweet2_time = float(line_arr[11])

#                     if tweet1 in avai_ids and tweet2 in avai_ids:
                    if tweet1 not in tweet_level:
                        tweet_level[tweet1] = 0
                    if tweet2 not in tweet_level:
                        tweet_level[tweet2] = tweet_level[tweet1] + 1

                    if tweet1 != tweet2:
                        G.add_node(tweet1, user=user1, time=tweet1_time)
                        G.add_node(tweet2, user=user2, time=tweet2_time)
                        G.add_edge(tweet2, tweet1)

        # sorted() is stable, so tweets on the same level keep their first-seen order.
        tweet_ids = [x for x, _ in sorted(tweet_level.items(), key=lambda kv: kv[1])]
        graphs[root_id] = (tweet_ids, G)

    return graphs

In [27]:
def load_labels(label_file):
    """Load the ``label:id`` file and return integer class labels by tweet id.

    The textual labels are mapped to integers according to ``args['dataset']``:
    Twitter uses unverified=0, non-rumor=1, true=2, false=3; PHEME uses
    rumor=0, non-rumor=1. Labels missing from the mapping become NaN.

    Args:
        label_file: Path to the label file.

    Returns:
        Dict mapping tweet id to its mapped label.
    """
    label_df = pd.read_csv(label_file, sep=':', header=None, names=['label', 'id'])
    if args['dataset'] == 'Twitter':
        label_df['label'] = label_df['label'].map({'unverified': 0, 'non-rumor': 1, 'true': 2, 'false': 3})
    elif args['dataset'] == 'PHEME':
        label_df['label'] = label_df['label'].map({'rumor': 0, 'non-rumor': 1})

    # Pair the two columns directly. iterrows() builds one mixed-dtype row at a
    # time, so a single NaN label would upcast every id to float.
    label_dict = dict(zip(label_df['id'], label_df['label']))

    return label_dict

## Model

In [28]:
# Load every input from the data directory. Only trees whose root tweet id has
# content (i.e. is a key of content_dict) are turned into graphs.
content_dict = load_tweet_content(os.path.join(args['data_dir'], args['tweet_content_file']))
user_info_dict = load_user_info(os.path.join(args['data_dir'], args['user_info_file']))
graphs = load_rumor_graphs(os.path.join(args['data_dir'], args['tree_dir']), avai_ids=content_dict.keys())
label_dict = load_labels(os.path.join(args['data_dir'], args['label_file']))

# Sizes of the pieces concatenated into one node vector.
# 4 values from extract_content_features() + 1 scaled word count.
extra_content_dim = 5
# Length of a per-user feature tensor.
# NOTE(review): presumably the number of feature columns left in user_info.txt — confirm against the file.
user_dim = 5
# Single normalized timestamp.
time_dim = 1
input_dim = embed_dim + extra_content_dim + user_dim + time_dim

In [29]:
# Largest node timestamp among the first args['max_graph_size'] tweets of every
# graph (0 if there is none); used later to normalize node times.
max_time = 0
for tweet_ids, G in graphs.values():
    for tweet_id in tweet_ids[:args['max_graph_size']]:
        max_time = max(max_time, G.nodes[tweet_id]['time'])

In [30]:
# One row per graph: the PageRank-weighted sum of its node vectors, plus the
# graph's class label in a parallel array.
graph_features = np.zeros((len(graphs), input_dim))
label = np.zeros((len(graphs), ))

# Zero vectors standing in for tweets without content and users without info.
missing_content = torch.zeros((embed_dim + extra_content_dim, ))
missing_user = torch.zeros((user_dim))

for i, (root_id, (tweet_ids, G)) in enumerate(graphs.items()):
    pageranks = nx.pagerank(G, alpha=0.8)

    graph_vector = np.zeros((input_dim))

#     Only use source tweet
#     content_feature = content_dict[root_id] if root_id in content_dict else torch.zeros((embed_dim + extra_content_dim, ))
#     user_id = G.nodes[root_id]['user']
#     social_feature = user_info_dict[user_id] if user_id in user_info_dict else torch.zeros((user_dim))
#     time_feature = torch.tensor([G.nodes[root_id]['time']/max_time])
#     graph_vector = torch.cat((content_feature, social_feature, time_feature)).numpy()

    for tweet_id, rank in pageranks.items():
        content_feature = content_dict.get(tweet_id, missing_content)
        node_attrs = G.nodes[tweet_id]
        user_id = node_attrs['user']
        social_feature = user_info_dict.get(user_id, missing_user)
        # Timestamp scaled by the global maximum computed in the previous cell.
        time_feature = torch.tensor([node_attrs['time']/max_time])
        node_vector = torch.cat((content_feature, social_feature, time_feature))

        # Each node contributes in proportion to its PageRank score.
        graph_vector += node_vector.numpy() * rank

    graph_features[i] = graph_vector
    label[i] = label_dict[root_id]

In [31]:
# Stratified k-fold evaluation of a random forest on the graph-level features.
splits = list(StratifiedKFold(n_splits=args['n_splits'], shuffle=True, random_state=args['seed']).split(graph_features, label))

# Per-fold scores. For 'Twitter' the class lists hold per-class F1 and
# macro_list holds macro F1. For 'PHEME' true_list / non_list / macro_list hold
# precision / recall / F1 of class 0 and the other two lists stay empty.
acc_list = []
unverified_list = []
non_list = []
true_list = []
false_list = []
macro_list = []

for idx, (train_idx, val_idx) in enumerate(splits):
    X_train = graph_features[train_idx]
    X_test = graph_features[val_idx]
    y_train = label[train_idx]
    y_test = label[val_idx]

    clf = RandomForestClassifier(n_estimators=200, max_depth=6, n_jobs=8, random_state=args['seed'])
#     clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    acc_list.append(acc)

    if args['dataset'] == 'Twitter':
        f1 = f1_score(y_test, y_pred, labels=[0, 1, 2, 3], average=None)
        macro_f1 = f1_score(y_test, y_pred, average='macro')

        unverified_list.append(f1[0])
        non_list.append(f1[1])
        true_list.append(f1[2])
        false_list.append(f1[3])
        macro_list.append(macro_f1)
    elif args['dataset'] == 'PHEME':
        f1 = f1_score(y_test, y_pred, pos_label=0)
        precision = precision_score(y_test, y_pred, pos_label=0)
        recall = recall_score(y_test, y_pred, pos_label=0)

        true_list.append(precision)
        non_list.append(recall)
        macro_list.append(f1)

# Report fold averages under names that match what was actually collected.
# Printing the Twitter labels for PHEME would show precision/recall as
# 'True'/'Non' and nan (plus a RuntimeWarning) for the two empty lists.
if args['dataset'] == 'Twitter':
    print('Unverified', np.mean(np.array(unverified_list)))
    print('Non', np.mean(np.array(non_list)))
    print('True', np.mean(np.array(true_list)))
    print('False', np.mean(np.array(false_list)))
    print('Macro F1', np.mean(np.array(macro_list)))
elif args['dataset'] == 'PHEME':
    print('Precision', np.mean(np.array(true_list)))
    print('Recall', np.mean(np.array(non_list)))
    print('F1', np.mean(np.array(macro_list)))

print('Acc', np.mean(np.array(acc_list)))

Unverified 0.46462694047084485
Non 0.733820055563141
True 0.7261458171574096
False 0.6294542666947303
Macro F1 0.6385117699715314
Acc 0.6649289204233859


In [32]:
# Unverified 0.5460368654645429
# Non 0.6849373410122241
# True 0.7702281783475089
# False 0.6253053197891162
# Macro F1 0.6566269261533481
# Acc 0.6715294641186025

In [33]:
# X_train, X_test, y_train, y_test = train_test_split(graph_features, label, test_size=0.3, random_state=args['seed'])
# clf = RandomForestClassifier(n_estimators=200, n_jobs=8, random_state=0)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# accuracy_score(y_test, y_pred)

In [34]:
# Unverified 0.6436432034878614
# Non 0.711834696815937
# True 0.7954398153498852
# False 0.6949399324737426
# Macro F1 0.7114644120318566
# Acc 0.7098709624195918

# Unverified 0.5885150162718416
# Non 0.6810329222431714
# True 0.7913320557485826
# False 0.6376347697709926
# Macro F1 0.6746286910086471
# Acc 0.6855363387287011

# Twitter15+Twitter16
# Pagerank
# Unverified 0.6184844667488563
# Non 0.6877254725833382
# True 0.8014874870187265
# False 0.6757390907424241
# Macro F1 0.6958591292733363
# Acc 0.7044000490634406

# Average
# Unverified 0.46462694047084485
# Non 0.733820055563141
# True 0.7261458171574096
# False 0.6294542666947303
# Macro F1 0.6385117699715314
# Acc 0.6649289204233859

# Only source post
# Unverified 0.6097501701076816
# Non 0.7750018202823165
# True 0.7915742768527358
# False 0.6878029802804813
# Macro F1 0.7160323118808037
# Acc 0.7298821487609796

# Source content embedding
# Unverified 0.5723346120196513
# Non 0.6842519991187694
# True 0.8007586753312985
# False 0.6431618353894495
# Macro F1 0.6751267804647922
# Acc 0.6875330608313014

# PHEME
# Pagerank
# Unverified nan
# Non 0.6985932085932086
# True 0.7886166154804164
# False nan
# Macro F1 0.7408175379459995
# Acc 0.8172784779713277

# Average
# Unverified nan
# Non 0.6373943173943173
# True 0.7704774794451561
# False nan
# Macro F1 0.6976272743850336
# Acc 0.7934665689619967

# Only source post
# Unverified nan
# Non 0.7123319473319473
# True 0.7999817221245831
# False nan
# Macro F1 0.7535466808589432
# Acc 0.8258383089136696