In [20]:
import re

import torch.utils.data
from nltk.corpus import stopwords
import json

stop_words = stopwords.words('english')

In [21]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)


def clean_text(text: str):
    text = text.lower()
    text = remove_emoji(text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r"\S*@\S*\s?", '', text)
    text = re.sub(r"#\S*", "", text)
    text = re.sub(r"[^a-z^A-Z]", ' ', text)  # remove anything except letters
    text = re.sub(r"\s+", ' ', text)

    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text)
    text = re.sub(r'@(\w+)?', '', text)
    text = re.sub(r'#(\w+)?', '', text)

    word_list = [w for w in text.split() if w not in stop_words]
    text_clean = ''
    for w in word_list:
        text_clean += (w + ' ')
    if text_clean != '':
        return text_clean
    return ''

In [22]:
x_train_data_text, x_dev_data_text, x_test_data_text = [], [], []
x_train_data_valid_rate, x_dev_data_valid_rate, x_test_data_valid_rate = [], [], []
y_train_data, y_dev_data = [], []

with open('./project-data/tweet-train-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        # print(type(tweets), tweets)
        tweets = json.loads(event)
        text_event = ''
        valid_num = 0
        invalid_num = 0
        for k, v in tweets.items():
            if 'data' in v:
                valid_num += 1
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text
        x_train_data_text.append(text_event)
        x_train_data_valid_rate.append(valid_num / len(tweets))

In [23]:
with open('./project-data/tweet-dev-final.txt', 'r', encoding='utf-8') as f:
    tweet_all = f.readlines()
    for event in tweet_all:
        tweets = json.loads(event)
        text_event = ''
        valid_num = 0
        for k, v in tweets.items():
            if 'data' in v:
                valid_num += 1
                text = v['data'][0]['text']
                text = clean_text(text)
                text_event += text

        x_dev_data_text.append(text_event)
        x_dev_data_valid_rate.append(valid_num / len(tweets))

In [24]:
with open('./project-data/test.data.txt', 'r', encoding='utf-8') as f:
    id_all = f.readlines()
    for i in range(len(id_all)):
        ids = id_all[i][:-1].split(',')
        text_event = ''
        for j in range(len(ids)):
            file_path = './project-data/tweet-objects/' + ids[j] + '.json'
            # try:
            with open(file_path, 'r', encoding='utf-8') as f2:
                tweet = json.load(f2)
                text = tweet['text']
                text = clean_text(text)
                text_event += text
            # except Exception:
            #     pass
        x_test_data_text.append(text_event)

In [25]:
x_test_data_text

['covid spread thanks wcco station trust media provide true news pa used work ',
 'hate keep saying capitalism implode without virus crisi believe look changes week humanity tell people protesting trick question life life capitalism never pr state slave thought using historical materialism realiz perhaps may want rethink hate america much herculean task mind already made complicit regardless consequen bartering trading goods services capitalism labor power ',
 'q covid influenza viruses different q covid influenza viruses similar q medical interventions available covid influenza viruses ',
 'una de les q amp coronaviruses de la p gina web de q long incubation period covid aquesta informaci es basa sobre tot en un article publicat al new england journal medicine bassat en una mostr els autors diuen mean incubation period days confidence interval ci th p exercici pels nostres estudiants si tenim una distribuci lognormal amb mitjana esperan igual el percen ',
 'absolutely blame politician

In [26]:
with open('./project-data/dev.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    for label in label_all:
        if label[:-1] == 'rumour':
            y_dev_data.append(1)
        else:
            y_dev_data.append(0)

with open('./project-data/train.label.txt', 'r', encoding='utf-8') as f:
    label_all = f.readlines()
    # print(type(label_all[1][:-1]), label_all[1][:-1])
    for label in label_all:
        if label[:-1] == 'rumour':
            y_train_data.append(1)
        else:
            y_train_data.append(0)

In [27]:
len(x_train_data_text)
x_train_data_text

['regularly rinsing nose saline help prevent infection new coronavirus eating garlic help prevent infection new coronavirus vaccines pneumonia protect new coronavirus spraying alcohol chlorine body kill new coronavirus effective thermal scanners detecting people infected new coronavirus ultraviolet disinfection lamp kill new coronavirus hand dryers effective killing new coronavirus new coronavirus cannot transmitted mosquito bites taking hot bath prevent new coronavirus disease cold weather snow cannot kill new coronavirus covid virus transmitted areas hot humid climates drinking alcohol protect covid dangerous able hold breath seconds without coughing feeling discomfort mean free coronavirus disease covid lung disease recover coronavirus disease covid catching new coronavirus mean life exposing sun temperatures higher c degrees prevent coronavirus disease covid g mobile networks spread covid ',
 'french police chief killed attack devastating rt french police chief killed attack nothin

In [28]:
len(x_train_data_valid_rate)

1895

In [29]:
len(y_train_data)
y_train_data

[0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,


In [137]:
import csv
import os

train_tsv_file = './project-data/train.tsv'
train_tsv_columns = ['valid_rate', 'sentence', 'label']

dev_tsv_file = './project-data/dev.tsv'
dev_tsv_columns = ['valid_rate', 'sentence', 'label']

train_data = zip(x_train_data_valid_rate, x_train_data_text, y_train_data)
dev_data = zip(x_dev_data_valid_rate, x_dev_data_text, y_dev_data)

test_tsv_file = './project-data/test.tsv'
test_tsv_columns = ['sentence']
test_data = x_test_data_text




def transfer_txt_to_tsv(output_tsv_file, output_tsv_columns, data):
    with open(output_tsv_file, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output)
        tsv_output.writerow(output_tsv_columns)
        for r, s, label in data:
            tsv_output.writerow([r, s, label])


def transfer_txt_to_tsv_test(output_tsv_file, output_tsv_columns, data):
    with open(output_tsv_file, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output)
        tsv_output.writerow(output_tsv_columns)
        for sentence in data:
            if sentence:
                tsv_output.writerow([sentence])
            else:
                tsv_output.writerow(['not a valid sentence'])


if os.path.exists(train_tsv_file):
    os.remove(train_tsv_file)
if os.path.exists(dev_tsv_file):
    os.remove(dev_tsv_file)
if os.path.exists(test_tsv_file):
    os.remove(test_tsv_file)

transfer_txt_to_tsv(train_tsv_file, train_tsv_columns, train_data)
transfer_txt_to_tsv(dev_tsv_file, dev_tsv_columns, dev_data)
transfer_txt_to_tsv_test(test_tsv_file, test_tsv_columns, test_data)

In [40]:
from transformers import BertTokenizer
import pandas as pd


class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, filename, maxlen):
        self.df = pd.read_csv(filename).dropna().reset_index(drop=True)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen
        # self.labels = df['label'].astype('category').tolist()
        # self.texts = [tokenizer(text, padding='max_length',
        #                         max_length=512,
        #                         truncation=True,
        #                         return_tensors='pt') for text in df['sentence']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']
        label = self.df.loc[index, 'label']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence)  #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + [
            '[SEP]']  #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]  #Padding sentences
        else:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']  #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(
            tokens)  #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids)  #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

In [41]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = TrainDataset(filename='./project-data/train.tsv', maxlen=512)
dev_set = TrainDataset(filename='./project-data/dev.tsv', maxlen=512)

#Creating intsances of training and development dataloaders
train_loader = DataLoader(train_set, batch_size=4, num_workers=0)
dev_loader = DataLoader(dev_set, batch_size=4, num_workers=0)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [42]:
import torch
import torch.nn as nn
from transformers import BertModel


class SentimentClassifier(nn.Module):

    def __init__(self):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        #output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask=attn_masks, return_dict=True)
        cont_reps = outputs.last_hidden_state

        #Obtaining the representation of [CLS] head (the first token)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [43]:
torch.cuda.is_available()

True

In [44]:
gpu = 0  #gpu ID

print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = SentimentClassifier()
net.cuda(gpu)  #Enable gpu support for the model
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [45]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(net.parameters(), lr=2e-5)

In [None]:
import time


def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):

        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep,
                                                                                                             loss.item(),
                                                                                                             acc, (
                                                                                                                     time.time() - st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc


def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [None]:
num_epoch = 16

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

In [138]:
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, filename, maxlen):
        self.df = pd.read_csv(filename)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen
        # self.labels = df['label'].astype('category').tolist()
        # self.texts = [tokenizer(text, padding='max_length',
        #                         max_length=512,
        #                         truncation=True,
        #                         return_tensors='pt') for text in df['sentence']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        # Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']

        # Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence)  # Tokenize the sentence
        tokens = ['[CLS]'] + tokens + [
            '[SEP]']  # Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]  # Padding sentences
        else:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']  # Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(
            tokens)  # Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids)  # Converting the list to a pytorch tensor

        # Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask

In [139]:
test_set = TestDataset(filename='./project-data/test.tsv', maxlen=512)

# Creating intsances of training and development dataloaders
test_loader = DataLoader(test_set, batch_size=1, num_workers=0)

print("Done preprocessing testing data.")

Done preprocessing testing data.


In [142]:
def predict(net, test_loader, weight_file, result_file):
    # load weight
    net.load_state_dict(torch.load(weight_file))

    predictions = []
    classes = ["non-rumour", "rumour"]

    with torch.no_grad():
        for seq, attn_masks in test_loader:
            seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
            logits = net(seq, attn_masks)
            probs = torch.sigmoid(logits.unsqueeze(-1))
            soft_probs = (probs > 0.5).long()
            predictions.append(soft_probs.cpu().numpy().squeeze())

# Write into csv file
    result = pd.DataFrame()
    result['Predicted'] = predictions
    result.to_csv(result_file,index=True,index_label='Id')

In [147]:
weight_file = 'sstcls_0.dat'
result_file = "bert_result_0.csv"
prediction = predict(net, test_loader, weight_file, result_file)