In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
from treelib import Node, Tree
import networkx as nx

import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from IPython.core.debugger import set_trace

In [None]:
# Notebook-wide configuration: dataset locations (relative to `data_dir`),
# the number of cross-validation folds and the random seed.
args = dict(
    data_dir='../rumor_detection_acl2017/twitter15/',
    tweet_content_file='tweet_contents.txt',
    tree_dir='tree',
    label_file='label.txt',
    n_splits=5,
    seed=1234,
)

## Data

In [None]:
def create_text_processor():
    """Build the ekphrasis ``TextPreProcessor`` used to tokenize tweet text.

    Returns:
        A ``TextPreProcessor`` configured with the Twitter segmenter and
        corrector, hashtag/contraction unpacking, elongation correction,
        the emoticon dictionary and a ``\\w+`` regular-expression tokenizer.
    """
    # Entity kinds handed to the `normalize` option.
    normalized_entities = ['url', 'email', 'percent', 'money', 'phone', 'user',
                           'time', 'date', 'number']

    # Tokenizer that keeps runs of word characters only.
    # Alternative kept for reference: SocialTokenizer(lowercase=True).tokenize
    word_tokenizer = RegexpTokenizer(r'\w+')

    options = dict(
        normalize=normalized_entities,
        fix_html=True,
        segmenter="twitter",
        corrector="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=word_tokenizer.tokenize,
        dicts=[emoticons],
    )
    return TextPreProcessor(**options)

def remove_stopword(tokens):
    """Drop English stop words from a token sequence.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A list with the tokens that are not in NLTK's English stop-word list,
        in their original order.
    """
    # A frozenset gives constant-time membership checks; the original list
    # was scanned linearly once per token.
    # (Add 'url' here to also drop the normalized URL placeholder.)
    stop_words = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def stemming(tokens, ps):
    """Return a new list holding ``ps.stem(token)`` for every token.

    Args:
        tokens: iterable of tokens.
        ps: object exposing a ``stem(token)`` method.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(ps.stem(token))
    return stemmed

def lemmatizer(tokens, wn):
    """Return a new list holding ``wn.lemmatize(token)`` for every token.

    Args:
        tokens: iterable of tokens.
        wn: object exposing a ``lemmatize(token)`` method.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(wn.lemmatize(token))
    return lemmas

def remove_last_url(tokens):
    """Strip a trailing ``'url'`` token, if there is one.

    Returns a slice without the last element when the sequence ends with
    ``'url'``; otherwise the input object itself is returned unchanged.
    """
    ends_with_url = len(tokens) > 0 and tokens[-1] == 'url'
    return tokens[:-1] if ends_with_url else tokens
    
def pre_process(s):
    """Turn one tweet row into its set of unigrams and bigrams.

    Args:
        s: row-like object whose ``content`` attribute holds the raw tweet text.

    Returns:
        A set of 1-gram and 2-gram tuples built from the tokens left after
        tokenization, stop-word removal, stemming and lemmatization.
        An empty set is returned when ``content`` is not a string
        (e.g. NaN for a row whose text field is missing).
    """
    text = s.content
    # Missing text has no n-grams; without this guard `.replace` raises
    # AttributeError on a float NaN.
    if not isinstance(text, str):
        return set()

    # Raw string: "\/" is an invalid escape sequence in a normal literal and
    # triggers a SyntaxWarning on recent Python versions. The replaced text
    # (backslash followed by slash) is unchanged.
    text = text.replace(r"\/", '/')
    text = text.lower()

    tokens = text_processor.pre_process_doc(text)
    tokens = remove_stopword(tokens)
    tokens = stemming(tokens, ps)
    tokens = lemmatizer(tokens, wn)
    # tokens = remove_last_url(tokens)
    n_grams = set.union(set(ngrams(tokens, 1)), set(ngrams(tokens, 2)))
    return n_grams

In [None]:
# Module-level NLP helpers that `pre_process` reads as globals:
# a Porter stemmer, a WordNet lemmatizer and the ekphrasis text processor.
ps, wn = nltk.PorterStemmer(), nltk.WordNetLemmatizer()
text_processor = create_text_processor()

def load_tweet_content(tweet_content_file):
    """Read the tab-separated tweet file and pre-process every tweet.

    Args:
        tweet_content_file: path to a headerless TSV with two columns,
            read as ``id`` and ``content``.

    Returns:
        dict mapping each ``id`` to the n-gram set produced by
        ``pre_process`` for that row.
    """
    content_df = pd.read_csv(
        tweet_content_file,
        sep='\t',
        header=None,
        names=['id', 'content'],
    )
    content_df['n_grams'] = content_df.apply(pre_process, axis=1)

    # Pair the two columns directly instead of materializing a row per tweet.
    return dict(zip(content_df['id'], content_df['n_grams']))

In [None]:
def load_rumor_trees(tree_dir_path, content_dict):
    """Build one propagation tree per ``<root_id>.txt`` file in a directory.

    Each tree node stores ``data=(n_gram_set, time)``. The root node is
    created from the file name with time 0.0; every other node comes from
    an edge line of the file.

    Args:
        tree_dir_path: directory holding the tree files.
        content_dict: mapping tweet id -> n-gram set; ids missing from it
            get an empty set.

    Returns:
        dict mapping root tweet id -> ``Tree``.

    Raises:
        ValueError: if a ``.txt`` file name does not start with an integer,
            or an edge line holds non-numeric ids/times.
    """
    trees = {}
    for f in os.listdir(tree_dir_path):
        file_path = os.path.join(tree_dir_path, f)

        # Test the file name's extension. The previous substring test on the
        # full path also matched e.g. 'x.txt.bak' or any file located under a
        # directory whose name contains '.txt'.
        if not (os.path.isfile(file_path) and f.endswith('.txt')):
            continue

        root_id = int(f.split('.')[0])
        tree = Tree()
        tree.create_node(tag=root_id, identifier=root_id,
                         data=(content_dict.get(root_id, set()), 0.0))

        with open(file_path, 'r') as file:
            for line in file:
                # Skip blank lines (they would fail the field lookup below)
                # and the line that introduces the ROOT pseudo-node.
                if not line.strip() or 'ROOT' in line:
                    continue

                # Splitting on the quote character puts the quoted fields at
                # the odd indices; presumably each line looks like
                # ['user', 'tweet', 'time']->['user', 'tweet', 'time']
                # (TODO confirm against the dataset README).
                line_arr = line.split("'")
                parent_tweet = int(line_arr[3])
                child_tweet = int(line_arr[9])
                child_time = float(line_arr[11])

                # Only the first occurrence of a tweet id becomes a node.
                if child_tweet not in tree.nodes:
                    tree.create_node(tag=child_tweet, identifier=child_tweet,
                                     parent=parent_tweet,
                                     data=(content_dict.get(child_tweet, set()), child_time))

        trees[root_id] = tree

    return trees

In [None]:
def load_labels(label_file):
    """Read ``label:id`` lines and return a mapping tweet id -> class index.

    Class indices: unverified=0, non-rumor=1, true=2, false=3.
    A label outside this set maps to NaN (pandas ``Series.map`` behaviour).

    Args:
        label_file: path to a headerless, colon-separated file whose two
            columns are read as ``label`` and ``id``.

    Returns:
        dict mapping each id to its class index.
    """
    label_df = pd.read_csv(label_file, sep=':', header=None, names=['label', 'id'])
    label_codes = label_df['label'].map({'unverified': 0, 'non-rumor': 1, 'true': 2, 'false': 3})

    # Pair the columns directly. Going through `iterrows` builds one Series
    # per row with a single common dtype, so a NaN label turns the ids into
    # float64 and large tweet ids lose precision (distinct ids can collapse
    # into the same key).
    return dict(zip(label_df['id'].tolist(), label_codes.tolist()))

In [None]:
def node_similarity(node1, node2, alpha=0):
    """Similarity of two tree nodes: Jaccard overlap damped by time distance.

    Each node's ``data`` is a ``(content_set, time)`` pair. The result is
    ``exp(-|t1 - t2|) * |c1 & c2| / |c1 | c2|``, with the overlap term taken
    as 0.0 when either content set is empty.

    Args:
        node1, node2: objects exposing ``data = (content_set, time)``.
        alpha: accepted for interface compatibility; not used.
    """
    content1, time1 = node1.data
    content2, time2 = node2.data

    if content1 and content2:
        shared = content1.intersection(content2)
        combined = content1.union(content2)
        overlap = len(shared) / len(combined)
    else:
        overlap = 0.0

    time_gap = abs(time1 - time2)
    return np.exp(-time_gap) * overlap

def most_similarity_nodes(T1, T2):
    """Match every node of ``T1`` with its most similar node in ``T2``.

    Similarity is measured with ``node_similarity``; on ties the first node
    yielded by ``T2.all_nodes_itr()`` wins. When ``T2`` yields no nodes the
    match is a fresh placeholder ``Node()``.

    Returns:
        dict mapping each ``T1`` node object to the chosen ``T2`` node.
    """
    best_matches = {}

    for source in T1.all_nodes_itr():
        best_score, best_node = -1, Node()
        for candidate in T2.all_nodes_itr():
            score = node_similarity(source, candidate)
            if score > best_score:
                best_score, best_node = score, candidate

        best_matches[source] = best_node

    return best_matches

def sub_tree_similarity(subtree1, subtree2, subtree_similarity_dict):
    """Recursive similarity of two subtrees, compared root to root.

    The value is the similarity of the two roots multiplied by
    ``(1 + child_similarity)`` for each pair of children taken in order
    (first with first, second with second, ...), up to the smaller child
    count. If either subtree consists of its root only, the root similarity
    is returned as is.

    Args:
        subtree1, subtree2: trees to compare.
        subtree_similarity_dict: memo keyed by
            ``(child1.identifier, child2.identifier)``; read and updated
            in place.
    """
    root1 = subtree1.get_node(subtree1.root)
    root2 = subtree2.get_node(subtree2.root)
    root_similar = node_similarity(root1, root2)

    # A leaf on either side ends the recursion.
    if subtree1.depth() == 0 or subtree2.depth() == 0:
        return root_similar

    kids1 = subtree1.children(subtree1.root)
    kids2 = subtree2.children(subtree2.root)

    product = 1.0
    # zip stops at the shorter list, i.e. at min(len(kids1), len(kids2)).
    for kid1, kid2 in zip(kids1, kids2):
        key = (kid1.identifier, kid2.identifier)
        if key not in subtree_similarity_dict:
            subtree_similarity_dict[key] = sub_tree_similarity(
                subtree1.subtree(kid1.identifier),
                subtree2.subtree(kid2.identifier),
                subtree_similarity_dict,
            )
        product *= (1 + subtree_similarity_dict[key])

    return root_similar * product

def tree_similarity(T1, T2):
    """Similarity between two trees, accumulated in both directions.

    First every node of ``T1`` is paired with its best match in ``T2`` and
    the similarities of the subtrees rooted at each pair are added up; then
    the same is done starting from the nodes of ``T2``. The two totals are
    summed. One memo dict is shared by all subtree comparisons of the call.
    """
    memo = {}

    # Direction T1 -> T2. A plain accumulation loop is kept on purpose so the
    # floating-point additions happen in the same order as before.
    forward_total = 0.0
    for node1, node2 in most_similarity_nodes(T1, T2).items():
        forward_total += sub_tree_similarity(
            T1.subtree(node1.identifier), T2.subtree(node2.identifier), memo)

    # Direction T2 -> T1: keys are T2 nodes, values their T1 matches; the
    # subtree arguments stay in (T1, T2) order.
    backward_total = 0.0
    for node2, node1 in most_similarity_nodes(T2, T1).items():
        backward_total += sub_tree_similarity(
            T1.subtree(node1.identifier), T2.subtree(node2.identifier), memo)

    return forward_total + backward_total

def similarity_matrix(tree_list1, tree_list2):
    """Pairwise ``tree_similarity`` between two lists of trees.

    Prints ``tree <i>`` before each row is computed.

    Returns:
        ndarray of shape ``(len(tree_list1), len(tree_list2))`` where entry
        ``[i, j]`` is ``tree_similarity(tree_list1[i], tree_list2[j])``.
    """
    n_rows, n_cols = len(tree_list1), len(tree_list2)
    result = np.zeros((n_rows, n_cols))

    for row, left_tree in enumerate(tree_list1):
        print(f'tree {row}')
        for col, right_tree in enumerate(tree_list2):
            result[row, col] = tree_similarity(left_tree, right_tree)

    return result

In [None]:
# Load tweet n-grams, propagation trees and labels from the dataset folder.
content_dict = load_tweet_content(os.path.join(args['data_dir'], args['tweet_content_file']))
trees = load_rumor_trees(os.path.join(args['data_dir'], args['tree_dir']), content_dict)
label_dict = load_labels(os.path.join(args['data_dir'], args['label_file']))

# Keep trees and labels aligned: both follow the iteration order of `trees`.
tree_list = list(trees.values())
label_list = np.array([label_dict[root_id] for root_id in trees])

### Extract kernel matrix

In [None]:
# Persist the label vector next to the dataset.
np.save(file=os.path.join(args['data_dir'], 'label_5.npy'), arr=label_list)

In [None]:
# Compute the full tree-vs-tree kernel (every tree against every tree)
# and store it next to the dataset.
kernel_matrix = similarity_matrix(tree_list1=tree_list, tree_list2=tree_list)
np.save(file=os.path.join(args['data_dir'], 'kernel_matrix_5.npy'), arr=kernel_matrix)

## Train

In [None]:
# Load the kernel matrix written by the extraction step above
# ('kernel_matrix_5.npy'). The previous file name ('kernel_matrix_4.npy') is
# never produced by this notebook, so a fresh Restart & Run All would either
# fail here or train on a matrix from an earlier, unrelated run.
kernel_matrix = np.load(os.path.join(args['data_dir'], 'kernel_matrix_5.npy'))
# label_list = np.load(os.path.join(args['data_dir'], 'label_5.npy'))

In [None]:
# Rescale every column of the kernel matrix to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before any train/validation
# split, and it works column by column, so the scaled matrix is in general no
# longer symmetric - confirm this is intended for a precomputed SVM kernel.
scaler = MinMaxScaler().fit(kernel_matrix)
kernel_matrix = scaler.transform(kernel_matrix)

In [None]:
# One-vs-rest evaluation with stratified K-fold CV on the precomputed kernel.
# For every fold one binary SVM is trained per class (class `label` against
# the sentinel "rest" class 4); the class whose classifier reports the highest
# positive-class probability becomes the prediction.
splits = list(StratifiedKFold(n_splits=args['n_splits'], shuffle=True, random_state=args['seed']).split(tree_list, label_list))
for idx, (train_idx, val_idx) in enumerate(splits):
    # The kernel slices depend only on the fold, not on the class, so they
    # are built once per fold instead of once per binary classifier.
    kernel_train = kernel_matrix[train_idx][:, train_idx]
    kernel_test = kernel_matrix[val_idx][:, train_idx]

    predict_df = pd.DataFrame()
    for label in range(4):
        # probability=True runs an internal, shuffled cross-validation to
        # calibrate probabilities; seeding it makes fold results repeatable.
        clf = svm.SVC(kernel='precomputed', probability=True, random_state=args['seed'])

        # Indexing with an index array returns a copy, so the relabelling
        # below does not modify label_list.
        y_train = label_list[train_idx]
        y_train[y_train != label] = 4
        clf.fit(kernel_train, y_train)

        train_pred = clf.predict(kernel_train)
        print(f'Fold {idx}, train accuracy {accuracy_score(y_train, train_pred)}')

        y_pred = clf.predict_proba(kernel_test)
        # Probability columns follow clf.classes_ (sorted ascending); since
        # label < 4, column 0 is the probability of `label`.
        predict_df[label] = y_pred[:, 0]

    y_test = label_list[val_idx]
    y_pred = predict_df.idxmax(axis=1).values
    print(f'Fold {idx}, test accuracy {accuracy_score(y_test, y_pred)}')

In [None]:
# Single 70/30 hold-out evaluation of a multi-class SVM on the precomputed kernel.
label_list = np.array(label_list)

train_idx, val_idx = train_test_split(range(len(tree_list)), test_size=0.3, random_state=0)

# Rows select the samples being scored; columns are always the training
# samples, as required for a precomputed kernel.
kernel_train = kernel_matrix[train_idx][:, train_idx]
kernel_test = kernel_matrix[val_idx][:, train_idx]
y_train = label_list[train_idx]
y_test = label_list[val_idx]

clf = svm.SVC(kernel='precomputed')
clf.fit(kernel_train, y_train)

train_pred = clf.predict(kernel_train)
print(f'Train accuracy {accuracy_score(y_train, train_pred)}')

y_pred = clf.predict(kernel_test)
print(f'Test accuracy {accuracy_score(y_test, y_pred)}')