In [50]:
import json
import nltk
from nltk.tokenize import TweetTokenizer
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import csv

In [51]:
x_train_data, x_dev_data, x_test_data = [], [], []
y_train_data, y_dev_data = [], []

In [52]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

In [53]:
def clean_text(text: str):
    text = text.lower()
    text = remove_emoji(text)

    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'@(\w+)?', '', text)
    text = re.sub(r'#(\w+)?', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)

#     word_list = [w for w in text.split() if w not in stop_words]
#     text_clean = ''
#     for w in word_list:
#         text_clean += (w + ' ')
#     if text_clean != '':
#         return text_clean
#     return ''
    word_list = [w for w in text.split()]
    text_clean = ''
    for w in word_list:
        text_clean += (w + ' ')
    return text_clean
    

In [54]:
with open('./project-data/tweet-train-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        # print(type(tweets), tweets)
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        x_train_data.append(text_event)    


In [55]:
with open('./project-data/train.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    # print(type(label_all[1][:-1]), label_all[1][:-1])
    for label in label_all:
        if label[:-1] == 'rumour':
            y_train_data.append(1)
        else:
            y_train_data.append(0)

In [56]:
with open('./project-data/tweet-dev-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        tweets = json.loads(event)
        text_event = ''
        for k, v in tweets.items():
            if 'data' in v:
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        x_dev_data.append(text_event)

In [57]:
with open('./project-data/dev.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    for label in label_all:
        if label[:-1] == 'rumour':
            y_dev_data.append(1)
        else:
            y_dev_data.append(0)

In [61]:
# re test
strs = []
str_e = "COVID-19 Fact:\nAre hand dryers effective in killing the new🤣 coronavirus?\n\nShare the information with your loved ones and help keep them safe.\n\nSource : WHO\n#weatherbug #weather #knowbefore #wx #istayhomefor #wecan  #corona #quarantinelife #strongertogether #wewillprevail https://t.co/dsPnQLUpMy"
str_e = str_e.lower()
str_e = remove_emoji(str_e)
str_e = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', str_e, flags=re.MULTILINE)
str_e = re.sub(r'@(\w+)?', '', str_e, flags=re.MULTILINE)
str_e = re.sub(r'#(\w+)?','',str_e,flags=re.MULTILINE)
str_e = re.sub(r'[^\w\s]',' ',str_e)
strs.append(str_e)
strs

['covid 19 fact \nare hand dryers effective in killing the new coronavirus \n\nshare the information with your loved ones and help keep them safe \n\nsource   who\n           ']

In [58]:
# transfer txt to tsv
import csv

train_tsv_file = './project-data/train.tsv'
train_tsv_columns = ['sentence','label']

dev_tsv_file = './project-data/dev.tsv'
dev_tsv_columns = ['sentence','label']


train_data = zip(x_train_data, y_train_data)
dev_data = zip(x_dev_data, y_dev_data)

def ransfer_txt_to_tsv(output_tsv_file, output_tsv_columns, data):
    with open(output_tsv_file, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output)
        tsv_output.writerow(output_tsv_columns)
        for s, label in data:
            tsv_output.writerow([s, label]) 
ransfer_txt_to_tsv(train_tsv_file, train_tsv_columns, train_data)
ransfer_txt_to_tsv(dev_tsv_file, dev_tsv_columns, dev_data)


In [59]:
# bert tokenizer
from transformers import BertTokenizer
bt = BertTokenizer.from_pretrained('bert-base-uncased')

In [103]:
# autoTokenizer
import torch
from transformers import AutoModel, AutoTokenizer
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
# line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"


Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tensor([[    0,  4040,    90,   160,   255, 35006, 26940,  2612,    15,  1456,
             7,   429,  6814,   499, 12952,    10,   156,     5,     3,    22,
             2]])

In [112]:
# 
input_ids = torch.tensor([tokenizer.encode(str_e)])
input_ids

tensor([[    0, 15558,   733,    41,   803,  4691, 19110,  5433,    16,  1863,
             6,   127,  1456,   991,     6,  1731,    30,    44,  1093,   784,
            13,   272,   264,   106,  1309,  3182,    87,     2]])

In [104]:
str_e_token = tokenizer(str_e)
str_e_token

{'input_ids': [0, 15558, 733, 41, 803, 4691, 19110, 5433, 16, 1863, 6, 127, 1456, 991, 6, 1731, 30, 44, 1093, 784, 13, 272, 264, 106, 1309, 3182, 87, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [63]:
# remove stop words
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# stop_words = set(stopwords.words('english'))

# filtered_sentence = []
word_tokens = bt.tokenize(str_e)
 
# for w in word_tokens:
#     if w not in stop_words:
#         filtered_sentence.append(w)
word_tokens

['co',
 '##vid',
 '19',
 'fact',
 'are',
 'hand',
 'dry',
 '##ers',
 'effective',
 'in',
 'killing',
 'the',
 'new',
 'corona',
 '##virus',
 'share',
 'the',
 'information',
 'with',
 'your',
 'loved',
 'ones',
 'and',
 'help',
 'keep',
 'them',
 'safe',
 'source',
 'who']

In [64]:
# add [CLS] and [SEP]
tokens = ['[CLS]'] + word_tokens + ['[SEP]'] 
# add [PAD]
input_size = 512
if len(tokens) < input_size:
    #Padding token
    tokens = tokens + ['[PAD]' for _ in range(input_size - len(tokens))] 
else:
    # if tokens length > input_size, extract the first input_size-1 and add SEP
    tokens = tokens[:input_size-1] + ['[SEP]'] 

# bert tokenizer word embedding
import torch
tokens_ids_tensor = torch.tensor(bt.convert_tokens_to_ids(tokens)) 

# attn_mask = (tokens_ids_tensor != 0).long()
attn_mask = [1 if token != '[PAD]' else 0 for token in tokens]
seg_ids = [0 for _ in range(len(tokens))] 

In [119]:
# tweet tokenizer
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()

filtered_sentence_t = []
tokens_t = tt.tokenize(str_e)
 
for w in tokens_t:
    if w not in stop_words:
        filtered_sentence_t.append(w)
filtered_sentence_t

['covid',
 'fact',
 'hand',
 'dryers',
 'effective',
 'killing',
 'new',
 'coronavirus',
 'share',
 'information',
 'loved',
 'ones',
 'help',
 'keep',
 'safe',
 'source']