In [1]:
import os
import json
import time
# Open the thread-id listings and their label files for both splits.
# Each line of a *.data.txt file is consumed together with the matching
# line of the *.label.txt file by read_ids_labels() later in this cell.
train_ids, train_labels = (
    open("data/train.data.txt", "r"),
    open("data/train.label.txt", "r"),
)
dev_ids, dev_labels = (
    open("data/dev.data.txt", "r"),
    open("data/dev.label.txt", "r"),
)

def read_ids_labels(ids, labels, test = False, tweet_dir = "data/train_tweet/"):
    """Load tweet threads and their binary labels.

    Each line of ``ids`` is a comma-separated list of tweet ids; the first id
    is treated as the thread's source tweet. Each line of ``labels`` is the
    label of the thread on the same line of ``ids``.

    Parameters
    ----------
    ids : file-like
        Open text file with one comma-separated id list per line.
    labels : file-like
        Open text file with one label per line (``"rumour"`` maps to 1,
        anything else to 0).
    test : bool, optional
        Currently unused; kept so existing callers keep working.
    tweet_dir : str, optional
        Directory containing one ``<tweet_id>.json`` file per tweet.
        Defaults to the directory the original code hard-coded.

    Returns
    -------
    (list[list[dict]], list[int])
        The threads (each a list of tweet dicts sorted by ``created_at``) and
        the parallel list of 0/1 labels. A thread whose source tweet file is
        missing is returned as an empty list; missing reply files are skipped.
    """
    # Local import so the block does not depend on a change to the cell's imports.
    from datetime import datetime

    def _created_at(tweet):
        # datetime keeps the fractional seconds that time.strptime/mktime
        # drop, and the comparison does not depend on the local timezone/DST.
        return datetime.strptime(tweet["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ')

    train_set = []
    train_label = []
    for train_ids_str, label in zip(ids.readlines(), labels.readlines()):
        train_ids_list = train_ids_str.strip().split(",")
        temp_json_list = []
        # Only load the thread when its source tweet is available on disk.
        if os.path.exists(os.path.join(tweet_dir, train_ids_list[0] + ".json")):
            for train_id in train_ids_list:
                train_path = os.path.join(tweet_dir, train_id + ".json")
                if os.path.exists(train_path):
                    # Context manager closes each JSON file right after parsing.
                    with open(train_path, "r", encoding="utf-8") as tweet_file:
                        temp_json_list.append(json.load(tweet_file))
        # sort according to time (stable, so equal timestamps keep file order)
        temp_json_list.sort(key=_created_at)
        train_set.append(temp_json_list)
        train_label.append(1 if label.strip() == "rumour" else 0)

    return train_set, train_label

# Parse both splits, then release the four listing files opened above; the
# original left the handles open for the lifetime of the kernel.
# NOTE(review): read_ids_labels looks tweets up under data/train_tweet/ for the
# dev split as well — confirm the dev tweets really live in that directory.
try:
    train_set, train_label = read_ids_labels(train_ids, train_labels)
    dev_set, dev_label = read_ids_labels(dev_ids, dev_labels)
finally:
    for data_file in (train_ids, train_labels, dev_ids, dev_labels):
        data_file.close()
print("Finished!")

Finished!


In [2]:
# Display the first training thread (a list of tweet dicts sorted by
# created_at) to inspect the structure of the loaded JSON.
train_set[0]

[{'entities': {'urls': [{'start': 87,
     'end': 110,
     'url': 'https://t.co/q133xXBiwl',
     'expanded_url': 'https://twitter.com/ucoptempe/status/1250219116993974272/photo/1',
     'display_url': 'pic.twitter.com/q133xXBiwl'}],
   'hashtags': [{'start': 70, 'end': 86, 'tag': 'COVID19Malaysia'}]},
  'id': '1250219116993974272',
  'context_annotations': [{'domain': {'id': '123',
     'name': 'Ongoing News Story',
     'description': "Ongoing News Stories like 'Brexit'"},
    'entity': {'id': '1220701888179359745', 'name': 'COVID-19'}},
   {'domain': {'id': '30',
     'name': 'Entities [Entity Service]',
     'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'},
    'entity': {'id': '825047692124442624',
     'name': 'Food',
     'description': 'Food'}},
   {'domain': {'id': '30',
     'name': 'Entities [Entity Service]',
     'description': 'Entity Service top level domain, every item that is in Entity Service should be i

In [28]:
def _collect_tweets(dataset):
    """Flatten threads into a text list plus a tweet-id -> row-index map.

    ``dataset`` is a list of threads, each a list of tweet dicts with "id" and
    "text" keys. The returned index of a tweet is its position in the returned
    text list; if an id occurs more than once, the last occurrence wins.
    """
    texts = []
    id2index = {}
    for thread in dataset:
        for tweet in thread:
            # Position of the text about to be appended == running tweet count.
            id2index[tweet["id"]] = len(texts)
            texts.append(tweet["text"])
    return texts, id2index


# One shared helper for both splits; this also avoids binding the builtin
# name `id` at notebook scope, which the original loops did.
all_train_tweets, train_id2index = _collect_tweets(train_set)
num_train_tweets = len(all_train_tweets)

all_dev_tweets, dev_id2index = _collect_tweets(dev_set)
num_dev_tweets = len(all_dev_tweets)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np

In [30]:
# Tokenise with NLTK's tweet-aware tokenizer and fit the TF-IDF vocabulary on
# the training tweets only; the dev tweets are projected onto that vocabulary.
tweet_tokenize = TweetTokenizer().tokenize
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tweet_tokenize,
    max_df=0.8,
    min_df=3,
    max_features=6000,
)
tfidf_train_matrix = tfidf_vectorizer.fit_transform(all_train_tweets)
tfidf_dev_matrix = tfidf_vectorizer.transform(all_dev_tweets)

In [31]:
def extract_feature(record, split):
    """Return the mean TF-IDF vector of a thread's reply tweets.

    Parameters
    ----------
    record : list[dict]
        One thread: the source tweet followed by its replies. Only each
        tweet's "id" key is read here.
    split : str
        ``"train"`` selects ``train_id2index`` / ``tfidf_train_matrix``; any
        other value selects ``dev_id2index`` / ``tfidf_dev_matrix``.

    Returns
    -------
    numpy.ndarray
        1-D vector with one entry per TF-IDF feature: the element-wise mean
        over the replies' rows, or all zeros when the thread has no replies.

    Raises
    ------
    KeyError
        If a reply's id is missing from the selected id -> index map.

    Notes
    -----
    The source tweet (``record[0]``) does not contribute to the result. The
    original code computed its vector and then discarded it; the return value
    is kept as the reply mean so downstream feature shapes are unchanged.
    NOTE(review): confirm whether the source tweet was meant to be included.
    """
    # Resolve the split once instead of branching for every tweet.
    if split == "train":
        id2index, tfidf_matrix = train_id2index, tfidf_train_matrix
    else:
        id2index, tfidf_matrix = dev_id2index, tfidf_dev_matrix

    reply_rows = [id2index[tweet["id"]] for tweet in record[1:]]
    if not reply_rows:
        # Size the fallback from the fitted matrix: the vocabulary can hold
        # fewer terms than the vectorizer's max_features cap, in which case a
        # hard-coded width would not stack with the other feature vectors.
        return np.zeros(tfidf_matrix.shape[1])

    # Slice all reply rows at once, densify, and average over the replies.
    return tfidf_matrix[reply_rows].toarray().mean(axis=0)


In [38]:
# Threads whose tweets could not be loaded are empty lists; drop them together
# with their labels, then stack one feature vector per remaining thread.
train_kept = [(thread, y) for thread, y in zip(train_set, train_label) if len(thread) > 0]
train_vec = [extract_feature(thread, "train") for thread, _ in train_kept]
train_y = [y for _, y in train_kept]
train_matrix = np.stack(train_vec)

# Same filtering and stacking for the dev split.
dev_kept = [(thread, y) for thread, y in zip(dev_set, dev_label) if len(thread) > 0]
dev_vec = [extract_feature(thread, "dev") for thread, _ in dev_kept]
dev_y = [y for _, y in dev_kept]
dev_matrix = np.stack(dev_vec)

In [41]:
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.metrics import classification_report

# Fit Complement Naive Bayes on the per-thread features, predict the dev
# threads, and print per-class precision/recall/F1.
train_targets = np.array(train_y)
cls = ComplementNB().fit(train_matrix, train_targets)
pred = cls.predict(dev_matrix)
dev_report = classification_report(dev_y, pred)
print(dev_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        72
           1       0.00      0.00      0.00         3

    accuracy                           0.96        75
   macro avg       0.48      0.50      0.49        75
weighted avg       0.92      0.96      0.94        75



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
