In [4]:
import re

import torch.utils.data
from nltk.corpus import stopwords
import json

# Stored as a set: clean_text() performs one membership test per token, which is
# O(1) on a set instead of a linear scan over the list returned by NLTK.
stop_words = set(stopwords.words('english'))

In [5]:
# Compiled once when the cell runs instead of on every call to remove_emoji().
# NOTE: the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very broad: together
# they cover every code point from U+24C2 upwards, so CJK, Hangul and similar
# scripts are stripped as well, not only emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters up to enclosed ideographs (very broad)
    "\U0001f926-\U0001f937"  # gesture emoji
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of characters matched by ``_EMOJI_PATTERN`` deleted.

    Matched runs are removed outright (replaced by the empty string), so the
    characters on either side of an emoji end up adjacent.

    Args:
        text: the string to filter.

    Returns:
        The filtered string; an empty input yields an empty string.
    """
    return _EMOJI_PATTERN.sub('', text)


def clean_text(text: str) -> str:
    """Normalise a raw tweet into space-separated lowercase words.

    Steps, in order: lowercase; strip emoji via ``remove_emoji``; delete URLs
    (``http...``), tokens containing ``@`` (mentions / e-mail-like tokens) and
    hashtags; replace every remaining non-letter with a space; drop the words
    listed in the module-level ``stop_words``.

    Args:
        text: the raw tweet text.

    Returns:
        The kept words, each followed by a single space (so the result ends
        with a trailing space), or ``''`` when no word survives. The trailing
        space is kept on purpose: callers concatenate the results of several
        tweets directly.
    """
    text = text.lower()
    text = remove_emoji(text)
    text = re.sub(r"http\S+", '', text)      # URLs
    text = re.sub(r"\S*@\S*\s?", '', text)   # any token containing '@'
    text = re.sub(r"#\S*", '', text)         # hashtags
    # Keep letters only. The previous class "[^a-z^A-Z]" treated the second '^'
    # as a literal character, so carets survived the filter; the text is already
    # lowercase, so "a-z" is sufficient.
    text = re.sub(r"[^a-z]", ' ', text)

    # split() already collapses runs of spaces, and no ':', '/', '@' or '#'
    # remains after the filter above, so no further URL/mention/hashtag pass is needed.
    words = [w for w in text.split() if w not in stop_words]
    if not words:
        return ''
    return ' '.join(words) + ' '

In [6]:
# One concatenated, cleaned text string per event; filled in by the cell that
# reads tweet-covid-final.txt.
tweet_covid_text = []

In [7]:
# Build one cleaned text string per event; each line of the file is parsed as
# one JSON object.
# The list is (re)created here so that re-running this cell does not append
# duplicates to whatever a previous run left behind.
tweet_covid_text = []
with open('./project-data/tweet-covid-final.txt', 'r', encoding='utf-8') as f:
    for event in f:  # stream the file line by line instead of loading it all
        if not event.strip():
            continue  # tolerate blank lines, which json.loads would reject
        tweets = json.loads(event)
        # Concatenate the cleaned text of every entry that carries a 'data'
        # payload; entries without one are skipped.
        text_event = ''.join(
            clean_text(v['data'][0]['text'])
            for v in tweets.values()
            if 'data' in v
        )
        tweet_covid_text.append(text_event)

In [8]:
# Preview only the first few events instead of dumping the whole list into the output.
tweet_covid_text[:5]

['according new york times warner bros wanted delay christopher nolan months ago due covid pandemic nolan refused listen discussions studio kept original release date keep nolan happy ',
 'hurricane hanna made landfall texas storm hitting section texas coast already reeling thousands coronavirus cases guess cause death hurricane covid good way ramp numbers ',
 'monkeys loose india stolen coronavirus blood samples ',
 'let play blind work fighting islamic religion covid last time washed ass water coronavirus originated clean every use govt educate kenyans like covid pandemic spitting ground spreads covid wonder many litres spit spread covid wuhan italy spain usa ',
 'trump felt comfortable comfortable said one supporter president another said coronavirus risk today day die today day die happy another said coronavirus risk today day die today day die great drink bleach get make choice people infect amp infect jhirmack clusternut check gov dime hey numbskull get covid today day die weeks 

In [9]:
import csv
import os

# Output path for the sentences fed to the BERT test dataset.
# NOTE: despite the .tsv extension, transfer_txt_to_tsv_test writes this file
# with csv.writer's default (comma) delimiter.
test_covid_bert_file = './project-data/tweet_covid_bert.tsv'
test_covid_bert_file_columns = ['sentence']  # header row: a single 'sentence' column
test_data = tweet_covid_text  # one cleaned text string per event

def transfer_txt_to_tsv_test(output_tsv_file, output_tsv_columns, data, delimiter=','):
    """Write ``data`` as a one-column delimited text file with a header row.

    Args:
        output_tsv_file: path of the file to create; an existing file is overwritten.
        output_tsv_columns: list of header cells written as the first row.
        data: iterable of sentences; each becomes one row. Falsy entries
            (``''`` or ``None``) are replaced by ``'not a valid sentence'`` so
            the file keeps one row per input item.
        delimiter: field separator. Defaults to ``','`` because the file is
            read back with ``pd.read_csv(filename)`` using its default
            separator; pass ``'\\t'`` to produce a real TSV.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(output_tsv_file, 'w', newline='', encoding='utf-8') as f_output:
        tsv_output = csv.writer(f_output, delimiter=delimiter)
        tsv_output.writerow(output_tsv_columns)
        tsv_output.writerows(
            [sentence] if sentence else ['not a valid sentence']
            for sentence in data
        )

# transfer_txt_to_tsv_test opens the target in 'w' mode, which truncates any
# existing file, so no separate exists()/remove() step is needed beforehand.
transfer_txt_to_tsv_test(test_covid_bert_file, test_covid_bert_file_columns, test_data)

In [10]:
from transformers import BertTokenizer
import pandas as pd
import torch
from torch.utils.data import DataLoader

class TestDataset(torch.utils.data.Dataset):
    """Unlabelled sentences encoded as fixed-length BERT inputs.

    Args:
        filename: path to a delimited text file readable by ``pd.read_csv``
            with its default settings and containing a ``'sentence'`` column.
        maxlen: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, filename, maxlen):
        self.df = pd.read_csv(filename)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen

    def __len__(self):
        """Return the number of sentences in the file."""
        return len(self.df)

    def __getitem__(self, index):
        """Encode the sentence at ``index``.

        Returns:
            A tuple ``(token_ids, attn_mask)`` of 1-D integer tensors of equal
            length: the token ids, and a mask holding 1 for real tokens
            (including [CLS]/[SEP]) and 0 for padding.
        """
        # Select the sentence at the given index of the data frame
        sentence = self.df.loc[index, 'sentence']

        # Wrap the word pieces in the [CLS] ... [SEP] markers BERT expects
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']
        if len(tokens) < self.maxlen:
            n_pad = self.maxlen - len(tokens)  # short sentence: pad up to maxlen
        else:
            # Long sentence: prune to maxlen while keeping a closing [SEP]
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']
            n_pad = 0

        # Build the mask from the padding count rather than from "id != 0", so it
        # does not depend on which vocabulary id the tokenizer gives to [PAD].
        attn_mask = torch.tensor([1] * len(tokens) + [0] * n_pad, dtype=torch.long)

        # Map tokens (plus padding) to their indices in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens + ['[PAD]'] * n_pad)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        return tokens_ids_tensor, attn_mask

In [11]:
import torch
import torch.nn as nn
from transformers import BertModel


class CovidClassifier(nn.Module):
    """Pretrained BERT encoder followed by a single-logit linear head."""

    def __init__(self):
        super().__init__()
        # Encoder: pretrained 'bert-base-uncased' weights
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Head: maps the 768-dimensional [CLS] embedding to one logit
        # (a single output because the task is binary classification)
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] holding the token ids of each sequence
            -attn_masks : Tensor of shape [B, T] holding the attention masks that
                          keep PAD tokens from contributing

        Returns the output of the classification layer for the first ([CLS])
        position of every sequence.
        '''
        # Contextualised representations for every position
        encoded = self.bert_layer(seq, attention_mask=attn_masks, return_dict=True)

        # Keep only the first position, i.e. the [CLS] token
        cls_rep = encoded.last_hidden_state[:, 0]

        # One logit per sequence from the classification layer
        return self.cls_layer(cls_rep)

In [12]:
gpu = 0  # CUDA device index used when a GPU is available
print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = CovidClassifier()
if torch.cuda.is_available():
    net.cuda(gpu)  # move the model to the selected GPU; otherwise it stays on the CPU
# The model is only used for prediction below, so switch to eval mode: this
# disables dropout and makes the outputs deterministic.
net.eval()
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [13]:
# Read the sentences back from the file written earlier; the path variable is
# reused so the writer and the reader cannot drift apart.
test_set = TestDataset(filename=test_covid_bert_file, maxlen=512)

# Data loader over the unlabelled test sentences, one sentence per batch
test_loader = DataLoader(test_set, batch_size=1, num_workers=0)

print("Done preprocessing testing data.")

Done preprocessing testing data.


In [14]:
def predict(net, test_loader, weight_file, result_file):
    """Load trained weights into ``net``, label every batch and save the labels as CSV.

    Args:
        net: model called as ``net(seq, attn_masks)`` that returns logits.
        test_loader: iterable yielding ``(seq, attn_masks)`` tensor pairs.
        weight_file: path to a state dict loadable with ``torch.load``.
        result_file: path of the CSV to write, with columns ``Id`` (row number)
            and ``Predicted`` (0 or 1; 1 when sigmoid(logit) > 0.5).

    Returns:
        The ``pandas.DataFrame`` that was written to ``result_file``.
    """
    # Run on whatever device the model already lives on instead of relying on a
    # global GPU index; fall back to the CPU for a parameter-less model.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # NOTE(security): torch.load relies on pickle - only load checkpoints from a
    # trusted source. map_location lets a GPU-saved checkpoint load on a CPU-only host.
    net.load_state_dict(torch.load(weight_file, map_location=device))

    # Inference mode: without this, dropout stays active and predictions are
    # noisy and differ between runs.
    net.eval()

    predictions = []
    with torch.no_grad():
        for seq, attn_masks in test_loader:
            seq, attn_masks = seq.to(device), attn_masks.to(device)
            logits = net(seq, attn_masks)
            labels = (torch.sigmoid(logits) > 0.5).long()
            # Flatten so every sample contributes one integer, whatever the batch size
            predictions.extend(labels.reshape(-1).cpu().tolist())

    # Write into csv file
    result = pd.DataFrame({'Predicted': predictions})
    result.to_csv(result_file, index=True, index_label='Id')
    return result

In [15]:
# Checkpoint loaded by predict() and the CSV file it writes the labels to.
weight_file = 'sstcls_7.dat'
result_file = "tweet_covid_result_bert.csv"
prediction = predict(
    net=net,
    test_loader=test_loader,
    weight_file=weight_file,
    result_file=result_file,
)