In [25]:
import re
from nltk.corpus import stopwords
import json


# English stop-word list from NLTK; clean_text drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded beforehand (nltk.download('stopwords')) -- confirm for fresh environments.
stop_words = stopwords.words('english')

In [26]:
# Built once at definition time instead of on every remove_emoji() call.
# The set of matched code points is the same as before; only an exact duplicate
# of the dingbats range was dropped and misleading comments were corrected.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: this range is far wider than "emoji": it also covers CJK ideographs,
    # kana, Hangul and most other BMP scripts above U+24C2, so such text is
    # stripped as well. Kept as-is to preserve existing behaviour.
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u2640-\u2642"          # female / male signs
    "\u2600-\u2B55"
    "\u200d"                 # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with every run of characters matched by _EMOJI_PATTERN deleted.

    Matched runs are removed outright (replaced by the empty string, not by a
    space), so words that were separated only by an emoji end up joined.

    Args:
        text: the string to filter.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)


def clean_text(text: str) -> str:
    """Normalise one tweet into space-terminated, lower-case, stop-word-free tokens.

    Steps, in order: lower-case, strip emoji (via remove_emoji), drop URLs,
    drop any token containing '@' (mentions / e-mail addresses), drop hashtags,
    replace every remaining non-letter with a space, then remove stop words.

    Args:
        text: raw tweet text.

    Returns:
        The surviving words, each followed by a single space (so results of
        several calls can be concatenated directly), or '' if nothing survives.
    """
    text = text.lower()
    text = remove_emoji(text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r"\S*@\S*\s?", '', text)
    text = re.sub(r"#\S*", "", text)
    # The text is already lower-cased, so anything outside a-z is noise.
    # The previous class "[^a-z^A-Z]" contained a second, literal '^' and
    # therefore let caret characters through.
    text = re.sub(r"[^a-z]", ' ', text)

    # Only letters and spaces remain at this point, so the former second pass of
    # URL / @mention / #hashtag substitutions could never match and was removed.
    # split() already collapses whitespace runs.
    word_list = [w for w in text.split() if w not in stop_words]
    # Keep the trailing space after every word: callers concatenate cleaned
    # tweets and rely on it as the separator.
    return ''.join(w + ' ' for w in word_list)

In [27]:
# Per-event accumulators. Lists for the same split are index-aligned: entry i of
# *_text, *_valid_rate and y_* all describe event i.
x_train_data_text, x_dev_data_text, x_test_data_text = [], [], []
x_train_data_valid_rate, x_dev_data_valid_rate, x_test_data_valid_rate = [],[],[]
y_train_data, y_dev_data = [], []


# Each line of the file is one JSON object (an event) whose values are tweet records.
with open('./project-data/tweet-train-final.txt', 'r', encoding='utf-8') as f:
    for event in f:
        tweets = json.loads(event)
        # Only records that carry a 'data' entry have usable text; the others
        # count against the event's valid rate.
        cleaned = [clean_text(v['data'][0]['text'])
                   for v in tweets.values() if 'data' in v]
        # clean_text output is space-terminated, so plain concatenation is safe.
        x_train_data_text.append(''.join(cleaned))
        # An event with no records would otherwise raise ZeroDivisionError;
        # record 0.0 so the row is kept and stays aligned with its label.
        x_train_data_valid_rate.append(len(cleaned) / len(tweets) if tweets else 0.0)

In [28]:
# Rebind instead of appending to lists created in an earlier cell, so that
# re-running this cell on its own cannot duplicate dev rows.
x_dev_data_text, x_dev_data_valid_rate = [], []

# Each line of the file is one JSON object (an event) whose values are tweet records.
with open('./project-data/tweet-dev-final.txt', 'r', encoding='utf-8') as f:
    for event in f:
        tweets = json.loads(event)
        # Only records that carry a 'data' entry have usable text; the others
        # count against the event's valid rate.
        cleaned = [clean_text(v['data'][0]['text'])
                   for v in tweets.values() if 'data' in v]
        # clean_text output is space-terminated, so plain concatenation is safe.
        x_dev_data_text.append(''.join(cleaned))
        # An event with no records would otherwise raise ZeroDivisionError;
        # record 0.0 so the row is kept and stays aligned with its label.
        x_dev_data_valid_rate.append(len(cleaned) / len(tweets) if tweets else 0.0)

In [29]:
def read_labels(path):
    """Read a label file with one label per line and encode it as 0/1.

    Args:
        path: path of a UTF-8 text file containing one label per line.

    Returns:
        A list with one int per line: 1 where the line is 'rumour', else 0.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # strip() rather than label[:-1]: slicing assumes every line ends in a
        # newline and chops a real character off the last line when the file
        # has no trailing newline, turning a final 'rumour' into label 0.
        return [1 if line.strip() == 'rumour' else 0 for line in f]


# Rebinding (instead of appending) keeps this cell idempotent when re-run.
y_dev_data = read_labels('./project-data/dev.label.txt')
y_train_data = read_labels('./project-data/train.label.txt')

In [30]:
# Sanity check: number of training events whose text was collected.
len(x_train_data_text)

1895

In [31]:
# Sanity check: number of valid-rate entries recorded for the training events.
len(x_train_data_valid_rate)

1895

In [32]:
# Sanity check: number of training labels read from the label file.
len(y_train_data)

1895

In [33]:
import csv

# Output locations and header rows for the exported train / dev tables.
train_tsv_file = './project-data/train.tsv'
train_tsv_columns = ['valid_rate', 'sentence', 'label']

dev_tsv_file = './project-data/dev.tsv'
dev_tsv_columns = ['valid_rate', 'sentence', 'label']

# zip() silently stops at the shortest input, which would drop events or hide a
# feature/label mismatch, so fail loudly if the per-event lists are not aligned.
assert len(x_train_data_valid_rate) == len(x_train_data_text) == len(y_train_data), \
    'train valid rates, texts and labels must have the same length'
assert len(x_dev_data_valid_rate) == len(x_dev_data_text) == len(y_dev_data), \
    'dev valid rates, texts and labels must have the same length'

# Lazy (valid_rate, sentence, label) row iterators; each can be consumed only once.
train_data = zip(x_train_data_valid_rate, x_train_data_text, y_train_data)
dev_data = zip(x_dev_data_valid_rate, x_dev_data_text, y_dev_data)


def transfer_txt_to_tsv(output_tsv_file, output_tsv_columns, data, delimiter=','):
    """Write a header row followed by one row per item of `data`.

    NOTE: despite the "tsv" naming, the default output is comma-separated; that
    default is kept so existing readers of these files keep working. Pass
    delimiter='\\t' to produce real tab-separated output.

    Args:
        output_tsv_file: path of the file to create (overwritten if present).
        output_tsv_columns: sequence of column names written as the first row.
        data: iterable of 3-item rows (valid_rate, sentence, label). It is
            consumed once, so a zip object cannot be reused afterwards.
        delimiter: single-character field separator passed to csv.writer.
    """
    # newline='' lets the csv module control line endings; the explicit UTF-8
    # encoding matches how the input files are read instead of depending on the
    # platform default.
    with open(output_tsv_file, 'w', newline='', encoding='utf-8') as f_output:
        tsv_output = csv.writer(f_output, delimiter=delimiter)
        tsv_output.writerow(output_tsv_columns)
        # Unpacking keeps the requirement that every row has exactly three fields.
        tsv_output.writerows([r, s, label] for r, s, label in data)


# Export both splits; each call writes the header row and then one row per event.
# The zip iterators are exhausted by these calls and cannot be written again.
transfer_txt_to_tsv(train_tsv_file, train_tsv_columns, train_data)
transfer_txt_to_tsv(dev_tsv_file, dev_tsv_columns, dev_data)