In [1]:
import os
import json
# Open the comma-separated tweet-id listings and the matching label files for
# the train and dev splits. The handles stay open here because they are read
# further down in this cell by read_ids_labels().
train_ids, train_labels = (
    open("data/train.data.txt", "r"),
    open("data/train.label.txt", "r"),
)
dev_ids, dev_labels = (
    open("data/dev.data.txt", "r"),
    open("data/dev.label.txt", "r"),
)



def read_ids_labels(train_ids, train_labels, path = "data/train_tweet/"):
    """Load every labelled tweet thread listed in an ids listing.

    Each line of ``train_ids`` is a comma-separated list of tweet ids; the
    first id is the thread's source tweet. The line at the same position in
    ``train_labels`` is the thread's label. Tweets are read from
    ``path + <id> + ".json"``.

    Args:
        train_ids: Iterable of id lines (an open text file works).
        train_labels: Iterable of label lines, parallel to ``train_ids``.
        path: Prefix (normally a directory ending in "/") of the tweet files.

    Returns:
        ``(threads, labels)``. ``threads[i]`` is the list of tweet dicts of
        one thread, sorted by their ``created_at`` value; ``labels[i]`` is the
        thread's label with surrounding whitespace (including the line
        ending) removed. Threads whose source tweet file does not exist are
        skipped together with their label.
    """
    def load_tweet(tweet_id):
        # Parsed tweet for this id, or None when no file exists for it.
        tweet_path = path + tweet_id + ".json"
        if not os.path.exists(tweet_path):
            return None
        # Context manager so the handle is released right after parsing.
        with open(tweet_path, "r") as tweet_file:
            return json.load(tweet_file)

    train_data = []
    train_y = []
    for ids_line, label in zip(train_ids, train_labels):
        thread_ids = ids_line.strip().split(",")
        # Without its source tweet a thread is unusable: drop it and its label.
        if not os.path.exists(path + thread_ids[0] + ".json"):
            continue

        thread = []
        visited_ids = set()
        for tweet_id in thread_ids:
            # Take the listed tweet, then keep following "referenced_tweets"
            # links. Ids already visited are skipped: their chain has been
            # walked, and this also stops a reference cycle from looping
            # forever.
            next_id = tweet_id
            while next_id is not None and next_id not in visited_ids:
                visited_ids.add(next_id)
                tweet = load_tweet(next_id)
                if tweet is None:
                    break
                if tweet not in thread:
                    thread.append(tweet)
                references = tweet.get("referenced_tweets")
                next_id = str(references[0]["id"]) if references else None

        # sort the list by time
        thread.sort(key=lambda tweet: tweet["created_at"])
        train_data.append(thread)
        # Strip the line ending so "rumour\n" and "rumour" are the same class.
        train_y.append(label.strip())
    return train_data, train_y

# Build the train and dev thread lists, then close the four listing files:
# both loaders have consumed them, and leaving them open leaks the handles for
# the lifetime of the kernel.
train_set, train_label = read_ids_labels(train_ids, train_labels)
dev_set, dev_label = read_ids_labels(dev_ids, dev_labels, path="data/dev_tweet/")
for _listing in (train_ids, train_labels, dev_ids, dev_labels):
    _listing.close()

In [7]:
def _load_test_tweet(tweet_id, tweet_dir="data/tweet-objects/"):
    """Return the parsed tweet object for ``tweet_id``, or None if its file is absent."""
    tweet_path = tweet_dir + tweet_id + ".json"
    if not os.path.exists(tweet_path):
        return None
    # Context manager so the handle is released right after parsing.
    with open(tweet_path, "r") as tweet_file:
        return json.load(tweet_file)


def _collect_test_thread(thread_ids):
    """Load the listed tweets plus every ancestor reachable via in_reply_to_status_id."""
    thread = []
    visited_ids = set()
    for tweet_id in thread_ids:
        # Take the listed tweet, then climb its reply chain. Ids already
        # visited are skipped: their chain has been walked, and this also
        # prevents an endless loop on a reply cycle.
        next_id = tweet_id
        while next_id is not None and next_id not in visited_ids:
            visited_ids.add(next_id)
            tweet = _load_test_tweet(next_id)
            if tweet is None:
                break
            if tweet not in thread:
                thread.append(tweet)
            # .get(): a tweet object without the key is treated as "no parent".
            parent_id = tweet.get("in_reply_to_status_id")
            next_id = None if parent_id is None else str(parent_id)
    # NOTE(review): "created_at" is compared as a plain string. If these tweet
    # objects use the "Wed Oct 10 20:19:24 +0000 2018" format, this order is
    # not chronological — confirm against the data.
    thread.sort(key=lambda tweet: tweet["created_at"])
    return thread


# One (possibly empty) thread per line of the test listing, in file order.
with open("data/test.data.txt", "r") as test_ids:
    test_set = [
        _collect_test_thread(test_ids_str.strip().split(","))
        for test_ids_str in test_ids
    ]

In [10]:
def _collect_tweets(records):
    """Flatten threads into a list of tweet texts and a tweet-id -> row-index map.

    Texts are appended in thread order. A tweet that occurs in several threads
    is appended once per occurrence, and the map keeps its last row index.
    """
    texts = []
    id2index = {}
    for record in records:
        for tweet in record:
            id2index[tweet["id"]] = len(texts)
            texts.append(tweet["text"])
    return texts, id2index


# One helper call per split replaces three copy-pasted loops and avoids
# binding a module-level `id` that would shadow the builtin.
all_train_tweets, train_id2index = _collect_tweets(train_set)
num_train_tweets = len(all_train_tweets)

all_dev_tweets, dev_id2index = _collect_tweets(dev_set)
num_dev_tweets = len(all_dev_tweets)

all_test_tweets, test_id2index = _collect_tweets(test_set)
num_test_tweets = len(all_test_tweets)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np

In [12]:
# Learn the TF-IDF vocabulary and weights from the training tweets only, then
# project the dev and test tweets with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=TweetTokenizer().tokenize,
    max_df=0.8,
    min_df=3,
    max_features=6000,
)
tfidf_train_matrix = tfidf_vectorizer.fit_transform(all_train_tweets)
tfidf_dev_matrix, tfidf_test_matrix = (
    tfidf_vectorizer.transform(split_texts)
    for split_texts in (all_dev_tweets, all_test_tweets)
)

In [13]:
def extract_feature(record, split):
    """Return the mean TF-IDF vector of a thread's reply tweets.

    Args:
        record: List of tweet dicts for one thread. ``record[0]`` is treated
            as the source tweet and ``record[1:]`` as its replies.
        split: ``"train"``, ``"dev"`` or ``"test"``; selects which
            module-level id-to-row map and TF-IDF matrix are used.

    Returns:
        1-D array with one entry per TF-IDF feature: the element-wise mean of
        the reply vectors, or all zeros when the thread has no replies.

    Raises:
        ValueError: If ``split`` is not one of the three known names.
        KeyError: If a reply's id is missing from the selected index.
    """
    if split == "train":
        id2index, matrix = train_id2index, tfidf_train_matrix
    elif split == "dev":
        id2index, matrix = dev_id2index, tfidf_dev_matrix
    elif split == "test":
        id2index, matrix = test_id2index, tfidf_test_matrix
    else:
        raise ValueError("unknown split: {!r}".format(split))

    # NOTE(review): only the replies contribute to the feature. The source
    # tweet's vector used to be computed here but was never part of the
    # returned value, so it is no longer computed — confirm that leaving the
    # source tweet out is intended.
    reply_tweet_vecs = [
        matrix[id2index[tweet["id"]]].toarray() for tweet in record[1:]
    ]
    if not reply_tweet_vecs:
        # Use the matrix's real width: max_features is only an upper bound on
        # the vocabulary size, so a hard-coded 6000 can mismatch.
        return np.zeros(matrix.shape[1])
    # Each entry has shape (1, n_features); average over replies, then flatten.
    return np.stack(reply_tweet_vecs).mean(axis=0).ravel()


In [14]:
def _featurize_labelled(records, labels, split):
    """Return (feature vectors, labels) for the non-empty threads of a labelled split."""
    vectors, kept_labels = [], []
    for record, label in zip(records, labels):
        # A thread with no tweets has nothing to learn from or score; drop it
        # together with its label so the two lists stay aligned.
        if record:
            vectors.append(extract_feature(record, split))
            kept_labels.append(label)
    return vectors, kept_labels


train_vec, train_y = _featurize_labelled(train_set, train_label, "train")
train_matrix = np.stack(train_vec)

dev_vec, dev_y = _featurize_labelled(dev_set, dev_label, "dev")
dev_matrix = np.stack(dev_vec)

# Test threads must NOT be dropped: predictions are later numbered by row
# position, so every thread of the test listing needs exactly one row. An
# empty thread gets the same all-zero vector as a thread without replies.
test_vec = []
for record in test_set:
    if record:
        test_vec.append(extract_feature(record, "test"))
    else:
        test_vec.append(np.zeros(tfidf_test_matrix.shape[1]))
test_matrix = np.stack(test_vec)

In [15]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report

# Fit Complement Naive Bayes on the training features, then print per-class
# precision/recall/F1 for its predictions on the dev split.
cls = ComplementNB().fit(train_matrix, np.array(train_y))
pred = cls.predict(dev_matrix)
print(classification_report(y_true=dev_y, y_pred=pred))

              precision    recall  f1-score   support

  nonrumour
       0.80      0.99      0.89       422
     rumour
       0.81      0.11      0.20       115

    accuracy                           0.80       537
   macro avg       0.81      0.55      0.54       537
weighted avg       0.81      0.80      0.74       537



## Predict on test set and write to file

In [None]:
pred = cls.predict(test_matrix)

# output csv file with index
with open("submission.csv", "w") as f:
    f.write("Id,Predicted\n")
    for i, label in enumerate(pred):
        # Class labels come from lines of the label file and may still carry
        # their line ending; strip it so every prediction is exactly one row.
        f.write("{},{}\n".format(i, str(label).strip()))