In [None]:
# Manual runtime reset only. `kill -9 -1` sends SIGKILL to every process this
# user is allowed to signal, which presumably includes the notebook kernel, so
# skip this cell when running the notebook from top to bottom.
! kill -9 -1

In [None]:
import json
import nltk
import string
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder 
import torch
import torch.nn as nn
from collections import defaultdict
import random
from tqdm import tqdm 

In [None]:
import torch

# Release cached GPU memory that an earlier run in this session may still hold.
torch.cuda.empty_cache()

# Pick the compute device once; `device` is reused when the saved model is reloaded.
if not torch.cuda.is_available():
    print("There is no GPU available, using the CPU instead!")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print("There are {} GPUs available.".format(torch.cuda.device_count()))
    print("We will use GPU {}".format(torch.cuda.get_device_name(0)))

读取文件 (Read the data files)

In [None]:
# Read the source and the reply into different lists

def read_file(text_file_name):
    """Split every event of a JSON-lines tweet file into source text and reply text.

    Each non-blank line is parsed as one JSON list of tweet objects (one event).
    A tweet whose ``in_reply_to_user_id_str`` is empty or ``None`` counts as a
    source tweet; every other tweet counts as a reply.

    Args:
        text_file_name: Path of the ``.jsonl`` file to read.

    Returns:
        ``(source_texts, reply_texts, ids)``: three parallel lists with one entry
        per event. Each text entry is the concatenation of the matching tweets'
        ``text`` fields, every one prefixed by a single space (empty string when
        there is no matching tweet). ``ids`` holds the ``id_str`` of the first
        tweet of each event.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError / IndexError: If an event is empty or lacks the expected fields.
    """
    original_s_text = []
    original_r_text = []
    original_ids = []

    # Tweets routinely contain non-ASCII characters, so decode as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(text_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            content = json.loads(line)

            original_ids.append(content[0]['id_str'])
            source_parts = []
            reply_parts = []
            for item in content:
                if not item["in_reply_to_user_id_str"]:
                    source_parts.append(item['text'])
                else:
                    reply_parts.append(item['text'])

            # Each text keeps its leading space, matching the original
            # concatenation format.
            original_s_text.append("".join(" " + text for text in source_parts))
            original_r_text.append("".join(" " + text for text in reply_parts))

    print("There are {} events".format(len(original_ids)))

    return original_s_text, original_r_text, original_ids

train_file_name = 'train.data.jsonl'
train_tweet_s_texts, train_tweet_r_texts, train_ids = read_file(train_file_name)

# read_file appends one entry per event to each list, so the three sizes match.
print(len(train_tweet_s_texts), len(train_tweet_r_texts), len(train_ids), sep="\n")


In [None]:
# Show the raw source text of the first training event.
train_tweet_s_texts[0]

In [None]:
# Show the raw concatenated reply text of the first training event.
train_tweet_r_texts[0]

In [None]:
#remove URL#
def remove_urls(text):
    """Return ``text`` with every substring matched by the URL pattern deleted.

    The pattern matches an optional ``http``/``https`` scheme, ``://``, and a run
    of word characters or any of ``. / ? = & %`` that ends on a word boundary.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    return re.sub(url_pattern, '', text, flags=re.MULTILINE)

#remove @XXX#
def remove_user(text):
    """Return ``text`` with every ``@`` and the non-whitespace run after it deleted.

    Args:
        text: Input string.

    Returns:
        The string without ``@mention`` tokens. Note that anything starting at an
        ``@`` is removed, so the tail of an e-mail address is removed as well.
    """
    # Raw string: a plain '\s' is an invalid escape sequence in a normal literal.
    # `\S*` is equivalent to the former `[^\s]*`.
    return re.sub(r'@\S*', '', text)

# print(remove_user(train_tweet_texts[0]))

In [None]:
# Punctuation tokens that pre_processing keeps; other punctuation tokens are dropped.
puns = list(",.?!")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

def pre_processing(tweet_texts):
    """Clean each text and return the cleaned strings in the same order.

    For every input string: URLs and @mentions are removed, the remainder is
    tokenized with ``tokenizer``, tokens found in ``string.punctuation`` are
    dropped unless listed in ``puns``, whitespace-only tokens are dropped, and the
    surviving tokens are joined with single spaces.
    """
    def _keep(token):
        # Same filter as the token loop: skip non-whitelisted punctuation,
        # then skip whitespace-only tokens.
        if token in string.punctuation and token not in puns:
            return False
        return not token.isspace()

    cleaned = []
    for raw in tweet_texts:
        tokens = tokenizer.tokenize(remove_user(remove_urls(raw)))
        cleaned.append(" ".join(tok for tok in tokens if _keep(tok)).strip())
    return cleaned

train_processed_s_texts = pre_processing(train_tweet_s_texts)
train_processed_r_texts = pre_processing(train_tweet_r_texts)

# One cleaned string per event in each list.
print(len(train_processed_s_texts), len(train_processed_r_texts), sep="\n")

In [None]:
# Show the cleaned source text of the first training event.
train_processed_s_texts[0]

In [None]:
# Show the cleaned reply text of the first training event.
train_processed_r_texts[0]

In [None]:
! pip install transformers

In [None]:
# Loading the Bertweet Tokenizer

from transformers import BertweetTokenizer

print("Loading the Bertweet Tokenizer!")
# Load the 'vinai/bertweet-base' tokenizer; `Bertweet_tokenizer` is used by
# print_max_len and convert_para_to_id.
Bertweet_tokenizer = BertweetTokenizer.from_pretrained('vinai/bertweet-base')

In [None]:
! pip install emoji

In [None]:
def print_max_len(processed_texts):
    """Print the largest Bertweet token count and return the count of every text.

    Args:
        processed_texts: Iterable of strings.

    Returns:
        A list with ``len(Bertweet_tokenizer.tokenize(text))`` for each text, in
        input order.

    Raises:
        ValueError: If ``processed_texts`` is empty (``max`` of an empty list).
    """
    tweet_len = [len(Bertweet_tokenizer.tokenize(text)) for text in processed_texts]
    print('The maximum length is {}'.format(max(tweet_len)))
    return tweet_len

# Token counts per training event, for source texts and for reply texts.
tweets_s_len = print_max_len(train_processed_s_texts)
tweets_r_len = print_max_len(train_processed_r_texts)


In [None]:
import matplotlib.pyplot as plt

# One bar per training event; the height is the token count of its source text.
# A string of colour letters ('rgb') is no longer accepted as a colour sequence
# by matplotlib, so the red/green/blue cycle is passed as an explicit list.
plt.bar(range(len(tweets_s_len)), tweets_s_len, color=['r', 'g', 'b'])
plt.show()

In [None]:
# One bar per training event; the height is the token count of its reply text.
# A string of colour letters ('rgb') is no longer accepted as a colour sequence
# by matplotlib, so the red/green/blue cycle is passed as an explicit list.
plt.bar(range(len(tweets_r_len)), tweets_r_len, color=['r', 'g', 'b'])
plt.show()

In [None]:
# Mean, median and 80th percentile of the source-text token counts.
print('\n'.join([
    'The Average Length of Source Tweet is {}'.format(np.mean(tweets_s_len)),
    'The Median Length of Source Tweet is {}'.format(np.median(tweets_s_len)),
    'The 80 percentile of Length of Source Length is {}'.format(np.quantile(tweets_s_len, 0.8)),
]))

In [None]:
# Mean, median and 80th percentile of the reply-text token counts.
print('\n'.join([
    'The Average Length of Reply Tweet is {}'.format(np.mean(tweets_r_len)),
    'The Median Length of Reply Tweet is {}'.format(np.median(tweets_r_len)),
    'The 80 percentile of Length of Reply Length is {}'.format(np.quantile(tweets_r_len, 0.8)),
]))

In [None]:
# Using Bertweet Tokenizer

MAX_LENGTH = 128

def convert_para_to_id(contents, para_length):
    """Encode each text with ``Bertweet_tokenizer`` to a fixed length.

    Args:
        contents: Iterable of strings to encode.
        para_length: Target sequence length; longer inputs are truncated and
            shorter ones padded to exactly this length.

    Returns:
        ``(input_ids_list, attentions_list)``: two parallel lists holding the
        ``"input_ids"`` and ``"attention_mask"`` entries of every encoding
        (PyTorch tensors, since ``return_tensors='pt'`` is requested).
    """
    input_ids_list = []
    attentions_list = []
    for content in contents:
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encoded_con = Bertweet_tokenizer.encode_plus(content,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=para_length,
                                            padding='max_length',
                                            return_tensors='pt')
        input_ids_list.append(encoded_con["input_ids"])
        attentions_list.append(encoded_con["attention_mask"])
    return input_ids_list, attentions_list

train_s_ids_list, train_s_att_list = convert_para_to_id(train_processed_s_texts, MAX_LENGTH)
train_r_ids_list, train_r_att_list = convert_para_to_id(train_processed_r_texts, MAX_LENGTH)

# One encoding per training event on each side.
print(len(train_s_ids_list), len(train_r_ids_list), sep="\n")

In [None]:
# Read the Validation Data
# read_file returns (source texts, reply texts, event ids) as parallel lists.
dev_file_name = 'dev.data.jsonl'
dev_tweet_s_texts,dev_tweet_r_texts, dev_ids = read_file(dev_file_name)

In [None]:
# Clean the validation source texts, then show their count and the first one.
dev_preprocessed_s_texts = pre_processing(dev_tweet_s_texts)

print(len(dev_preprocessed_s_texts))
print(dev_preprocessed_s_texts[0])

In [None]:
# Clean the validation reply texts, then show their count and the first one.
dev_preprocessed_r_texts = pre_processing(dev_tweet_r_texts)

print(len(dev_preprocessed_r_texts))
print(dev_preprocessed_r_texts[0])

In [None]:
dev_s_ids_list, dev_s_att_list = convert_para_to_id(dev_preprocessed_s_texts, MAX_LENGTH)
dev_r_ids_list, dev_r_att_list = convert_para_to_id(dev_preprocessed_r_texts, MAX_LENGTH)

# One encoding per validation event on each side.
print(len(dev_s_ids_list), len(dev_r_ids_list), sep="\n")

In [None]:
# Now Encode the label

def load_label_file(label_file):
    """Parse the JSON file at ``label_file`` and return the decoded object."""
    with open(label_file, 'r') as file:
        return json.load(file)

def labels_to_vec(labels, train_ids, classes=("non-rumour", "rumour")):
    """Encode the labels of the given event ids as integer class indices.

    The index of a class is its position in ``sorted(classes)``, which is the
    ordering a ``LabelEncoder`` fitted on all classes would produce. Fixing the
    mapping up front keeps the encoding identical across train/dev splits even
    when a split happens to contain only one of the classes; fitting a new
    encoder on each split would map the single class present to 0.

    Args:
        labels: Mapping from event id to label string.
        train_ids: Event ids, in the order the output should follow.
        classes: All label strings that may occur.

    Returns:
        A 1-D ``numpy`` int64 array with one class index per id.

    Raises:
        KeyError: If an id has no entry in ``labels``.
        ValueError: If a label is not listed in ``classes``.
    """
    class_index = {name: idx for idx, name in enumerate(sorted(classes))}

    encoded = []
    for event_id in train_ids:
        label = labels[event_id]
        if label not in class_index:
            raise ValueError(
                "Unknown label {!r} for id {!r}; expected one of {}".format(
                    label, event_id, sorted(class_index)))
        encoded.append(class_index[label])

    return np.array(encoded, dtype=np.int64)

def get_label_vec(label_file, ids):
    """Load the label file and return the encoded labels for ``ids``.

    Thin wrapper: the result of ``load_label_file(label_file)`` is passed with
    ``ids`` to ``labels_to_vec``, whose return value is returned unchanged.
    """
    return labels_to_vec(load_label_file(label_file), ids)

train_labels_vec = get_label_vec("train.label.json", train_ids)
dev_labels_vec = get_label_vec('dev.label.json', dev_ids)

# One encoded label per event in each split.
print(train_labels_vec.shape, dev_labels_vec.shape, sep="\n")

Now Transform All Data into Tensors

In [None]:
def redefine_label_vec(labels_vec):
    """Turn class indices into two-element one-hot lists.

    Args:
        labels_vec: Iterable of integer class indices valid for a list of
            length 2.

    Returns:
        A list with one ``[1, 0]`` or ``[0, 1]`` entry per input index.
    """
    def _one_hot(index):
        row = [0, 0]
        row[index] = 1
        return row

    return [_one_hot(index) for index in labels_vec]

train_labels_re_vec = redefine_label_vec(train_labels_vec)
dev_labels_re_vec = redefine_label_vec(dev_labels_vec)

# One one-hot row per event in each split.
print(len(train_labels_re_vec), len(dev_labels_re_vec), sep="\n")

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Stack the per-event encodings along dim 0 into one tensor per field.
train_s_input_ids = torch.cat(train_s_ids_list, dim=0)
train_s_att_masks = torch.cat(train_s_att_list, dim=0)
train_r_input_ids = torch.cat(train_r_ids_list, dim=0)
train_r_att_masks = torch.cat(train_r_att_list, dim=0)
# One-hot label rows as a float tensor (torch.Tensor builds the default float type).
train_labels = torch.Tensor(train_labels_re_vec)

# Field order here fixes the batch[0]..batch[4] positions used when training.
train_data = TensorDataset(train_s_input_ids, train_s_att_masks, train_r_input_ids, train_r_att_masks, train_labels)

# Also reused as the batch size of the validation loader.
batch_size = 50

# RandomSampler draws the training examples in random order each epoch.
train_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
print(len(train_loader))

In [None]:
# Stack the per-event validation encodings along dim 0 into one tensor per field.
val_s_input_ids = torch.cat(dev_s_ids_list, dim=0)
val_s_att_masks = torch.cat(dev_s_att_list, dim=0)
val_r_input_ids = torch.cat(dev_r_ids_list, dim=0)
val_r_att_masks = torch.cat(dev_r_att_list, dim=0)
# One-hot label rows as a float tensor.
val_labels = torch.Tensor(dev_labels_re_vec)


# Same field order as the training dataset.
val_data = TensorDataset(val_s_input_ids, val_s_att_masks, val_r_input_ids, val_r_att_masks, val_labels)

# Validation batches keep the dataset order (no shuffling).
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
print(len(val_loader))


In [None]:
import torch.nn as nn
from transformers import AutoModel, AdamW


class RumourDetector(nn.Module):
    """Two-input classifier over a source text and its reply text.

    Both inputs pass through the same Bertweet encoder and the same
    Conv1d + max-pool stack. The two pooled vectors are concatenated and fed
    through a linear layer, dropout, and a final linear layer.
    """

    def __init__(self, sent_length, input_dim, kernel_size, hidden_dim, output_dim, dropout):
        """Build the layers.

        Args:
            sent_length: Number of token positions per input. Together with
                ``kernel_size`` it sizes the pooling window so that pooling spans
                the whole convolution output.
            input_dim: Channel count consumed and produced by the convolution;
                presumably must equal the Bertweet hidden size — confirm.
            kernel_size: Width of the 1-D convolution kernel.
            hidden_dim: Half the width of the intermediate linear layer.
            output_dim: Number of output scores per example.
            dropout: Dropout probability applied before the final layer.
        """
        super().__init__()
        self.bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
        self.ConvEncoderLayer_1 = nn.Conv1d(
            in_channels=input_dim,
            out_channels=input_dim,
            kernel_size=kernel_size,
        )

        # The convolution shortens the sequence to sent_length - kernel_size + 1
        # positions; pooling over exactly that many leaves one value per channel.
        pooled_window = sent_length - kernel_size + 1
        self.maxpool = nn.MaxPool1d(kernel_size=pooled_window)
        self.encoder = nn.Linear(2 * input_dim, 2 * hidden_dim)
        self.predictor = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def _conv_pool(self, hidden_states):
        """Convolve over the sequence axis and max-pool to one vector per example."""
        # Conv1d expects channels before positions, hence the permute.
        channels_first = hidden_states.permute(0, 2, 1)
        pooled = self.maxpool(self.ConvEncoderLayer_1(channels_first))
        return pooled.squeeze(2)

    def forward(self, s_input_ids, s_attention_mask, r_input_ids, r_attention_mask):
        """Return the output scores for a batch of (source, reply) pairs."""
        # Element 0 of the encoder output is used as the per-token representation.
        s_hidden = self.bertweet(s_input_ids, attention_mask=s_attention_mask)[0]
        r_hidden = self.bertweet(r_input_ids, attention_mask=r_attention_mask)[0]

        merged = torch.cat((self._conv_pool(s_hidden), self._conv_pool(r_hidden)), 1)
        return self.predictor(self.dropout(self.encoder(merged)))

In [None]:
# Positional arguments: sent_length=128, input_dim=768, kernel_size=3,
# hidden_dim=768, output_dim=2, dropout=0.1.
model = RumourDetector(128, 768, 3, 768, 2, 0.1)

# Use the device chosen in the GPU-detection cell instead of assuming CUDA exists.
model.to(device)

In [None]:
learning_rate = 2e-5

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the default of the deprecated class; torch.optim.AdamW
# would otherwise apply its own default of 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.0)
print("Optimizer Loading Completed!...")

In [None]:
# Binary cross-entropy on raw logits; the targets are the float one-hot label rows.
criterion = nn.BCEWithLogitsLoss()

criterion.cuda()

In [None]:
def calculate_F1_score(preds, y):
    """Compute the F1 score of class index 1 from scores and one-hot targets.

    Both arguments are reduced with ``argmax(dim=1)``; the counting arithmetic
    then treats the resulting indices as 0/1, so it assumes two classes.

    Args:
        preds: Tensor of per-class scores, one row per example.
        y: Tensor of one-hot targets with the same layout.

    Returns:
        A scalar float32 tensor with the F1 score. It is 0 when there are no
        true positives, because a small epsilon keeps the denominators non-zero.
    """
    true_label = y.argmax(dim=1)
    pred_label = preds.argmax(dim=1)

    tp = (true_label * pred_label).sum().to(torch.float32)
    fp = ((1 - true_label) * pred_label).sum().to(torch.float32)
    fn = (true_label * (1 - pred_label)).sum().to(torch.float32)

    epsilon = 1e-7  # avoids division by zero

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    return 2 * (precision * recall) / (precision + recall + epsilon)

In [None]:
# Now training
import random
import numpy as np

epochs = 20
best_valid_score = 0
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

training_stats = [] # one dict per epoch: losses and validation score


for epoch_i in range(epochs):

    print("")
    print("======== Epoch {:} / {:} ========".format(epoch_i + 1, epochs))
    print("Training...")

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_loader):

        if (step + 1) % 10 == 0:
            print("Batch {} of {}".format(step+1, len(train_loader)))

        # Batch fields follow the TensorDataset order:
        # source ids, source mask, reply ids, reply mask, labels.
        batch_s_input_ids = batch[0].to(device)
        batch_s_input_mask = batch[1].to(device)
        batch_r_input_ids = batch[2].to(device)
        batch_r_input_mask = batch[3].to(device)
        batch_labels = batch[4].to(device)

        model.zero_grad()
        preds = model(batch_s_input_ids, batch_s_input_mask, batch_r_input_ids, batch_r_input_mask)
        loss = criterion(preds, batch_labels)
        total_train_loss += loss.item()

        loss.backward()

        # In-place variant; the name without the trailing underscore is deprecated.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()


    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print(" Average Training Loss is {:2f}".format(avg_train_loss))

    # Now perform validation

    print("")
    print("Running Validation...")

    model.eval()

    total_eval_loss = 0
    # Collected over the whole validation set so that F1 is computed once per
    # epoch; averaging per-batch F1 values does not give the set-level F1.
    epoch_val_logits = []
    epoch_val_targets = []

    for batch in val_loader:

        # Loop-local names, so the full validation tensors built for the
        # DataLoader are not overwritten.
        batch_s_input_ids = batch[0].to(device)
        batch_s_input_mask = batch[1].to(device)
        batch_r_input_ids = batch[2].to(device)
        batch_r_input_mask = batch[3].to(device)
        batch_labels = batch[4].to(device)

        with torch.no_grad():
            batch_logits = model(batch_s_input_ids, batch_s_input_mask, batch_r_input_ids, batch_r_input_mask)
            loss = criterion(batch_logits, batch_labels)
        total_eval_loss += loss.item()

        epoch_val_logits.append(batch_logits.cpu())
        epoch_val_targets.append(batch_labels.cpu())

    avg_val_loss = total_eval_loss / len(val_loader)
    avg_val_score = calculate_F1_score(torch.cat(epoch_val_logits, dim=0),
                                       torch.cat(epoch_val_targets, dim=0)).item()

    print("Validation loss :{}".format(avg_val_loss))
    print("The Score is {}".format(avg_val_score))

    training_stats.append({
        'epoch': epoch_i + 1,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'val_f1': avg_val_score,
    })

    # Keep only the weights of the best-scoring epoch on disk.
    if avg_val_score > best_valid_score:
        best_valid_score = avg_val_score
        torch.save(model.state_dict(), 'model.pt')


print("")
print("Training Complete!...")

Now we make prediction on the test dataset!

In [None]:
test_file_name = 'test.data.jsonl'

def read_file(text_file_name):
    """Split every event of a JSON-lines tweet file into source text and reply text.

    Each non-blank line is parsed as one JSON list of tweet objects (one event).
    A tweet whose ``in_reply_to_user_id_str`` is empty or ``None`` counts as a
    source tweet; every other tweet counts as a reply.

    Args:
        text_file_name: Path of the ``.jsonl`` file to read.

    Returns:
        ``(source_texts, reply_texts, ids)``: three parallel lists with one entry
        per event. Each text entry is the concatenation of the matching tweets'
        ``text`` fields, every one prefixed by a single space (empty string when
        there is no matching tweet). ``ids`` holds the ``id_str`` of the first
        tweet of each event.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError / IndexError: If an event is empty or lacks the expected fields.
    """
    original_s_text = []
    original_r_text = []
    original_ids = []

    # Tweets routinely contain non-ASCII characters, so decode as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(text_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            content = json.loads(line)

            original_ids.append(content[0]['id_str'])
            source_parts = []
            reply_parts = []
            for item in content:
                if not item["in_reply_to_user_id_str"]:
                    source_parts.append(item['text'])
                else:
                    reply_parts.append(item['text'])

            # Each text keeps its leading space, matching the original
            # concatenation format.
            original_s_text.append("".join(" " + text for text in source_parts))
            original_r_text.append("".join(" " + text for text in reply_parts))

    print("There are {} events".format(len(original_ids)))

    return original_s_text, original_r_text, original_ids


# Punctuation tokens that pre_processing keeps; other punctuation tokens are dropped.
puns = list(",.?!")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

def pre_processing(tweet_texts):
    """Clean each text and return the cleaned strings in the same order.

    For every input string: URLs and @mentions are removed, the remainder is
    tokenized with ``tokenizer``, tokens found in ``string.punctuation`` are
    dropped unless listed in ``puns``, whitespace-only tokens are dropped, and the
    surviving tokens are joined with single spaces.
    """
    def _keep(token):
        # Same filter as the token loop: skip non-whitelisted punctuation,
        # then skip whitespace-only tokens.
        if token in string.punctuation and token not in puns:
            return False
        return not token.isspace()

    cleaned = []
    for raw in tweet_texts:
        tokens = tokenizer.tokenize(remove_user(remove_urls(raw)))
        cleaned.append(" ".join(tok for tok in tokens if _keep(tok)).strip())
    return cleaned


test_tweet_s_texts, test_tweet_r_texts, test_ids = read_file(test_file_name)

test_preprocessed_s_texts = pre_processing(test_tweet_s_texts)
test_preprocessed_r_texts = pre_processing(test_tweet_r_texts)
# One cleaned string per test event on each side.
print(len(test_preprocessed_s_texts), len(test_preprocessed_r_texts), sep="\n")

In [None]:
def convert_para_to_id(contents, para_length):
    """Encode each text with ``Bertweet_tokenizer`` to a fixed length.

    Args:
        contents: Iterable of strings to encode.
        para_length: Target sequence length; longer inputs are truncated and
            shorter ones padded to exactly this length.

    Returns:
        ``(input_ids_list, attentions_list)``: two parallel lists holding the
        ``"input_ids"`` and ``"attention_mask"`` entries of every encoding
        (PyTorch tensors, since ``return_tensors='pt'`` is requested).
    """
    input_ids_list = []
    attentions_list = []
    for content in contents:
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encoded_con = Bertweet_tokenizer.encode_plus(content,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=para_length,
                                            padding='max_length',
                                            return_tensors='pt')
        input_ids_list.append(encoded_con["input_ids"])
        attentions_list.append(encoded_con["attention_mask"])
    return input_ids_list, attentions_list

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Same sequence length as used for the training and validation encodings.
MAX_LENGTH = 128
test_s_ids_list, test_s_att_list = convert_para_to_id(test_preprocessed_s_texts, MAX_LENGTH)
test_r_ids_list, test_r_att_list = convert_para_to_id(test_preprocessed_r_texts, MAX_LENGTH)

# Stack the per-event encodings along dim 0 into one tensor per field.
test_s_input_ids = torch.cat(test_s_ids_list, dim=0)
test_s_att_masks = torch.cat(test_s_att_list, dim=0)
test_r_input_ids = torch.cat(test_r_ids_list, dim=0)
test_r_att_masks = torch.cat(test_r_att_list, dim=0)

test_batch_size = 83
# No label tensor here: the test dataset holds only the four input fields.
test_data = TensorDataset(test_s_input_ids, test_s_att_masks, test_r_input_ids, test_r_att_masks)

# Keep dataset order so predictions line up with test_ids.
test_loader = DataLoader(test_data, shuffle=False, batch_size=test_batch_size)
print(len(test_loader))


In [None]:
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# from a GPU run can also be restored on a CPU-only machine.
model.load_state_dict(torch.load('model.pt', map_location=device))
model.to(device)

In [None]:
model.eval()

pred_labels = []
with torch.no_grad():

    for batch in test_loader:
        # Send the batch to the same device the model was moved to. Loop-local
        # names keep the full test tensors from being overwritten.
        batch_s_ids, batch_s_mask, batch_r_ids, batch_r_mask = (t.to(device) for t in batch)

        outputs = model(batch_s_ids, batch_s_mask, batch_r_ids, batch_r_mask)
        # Predicted class index = position of the largest score in each row.
        preds = outputs.argmax(dim=1).cpu().numpy()
        pred_labels.extend(preds)


In [None]:
# Number of collected test predictions.
print(len(pred_labels))

In [None]:
# Class index 0 is written as 'non-rumour'; any other index as 'rumour'.
test_labels = ['non-rumour' if label == 0 else 'rumour' for label in pred_labels]

print(len(test_labels))

# Pair every event id with the prediction at the same position.
test_dict = {test_ids[i]: test_labels[i] for i in range(len(test_ids))}

print(len(test_dict))

json_str = json.dumps(test_dict)
with open('test-output.json', 'w') as json_file:
    json_file.write(json_str)

Now we perform analysis on the COVID-19 tweet dataset

In [None]:
covid_file_name = 'covid.data.jsonl'

# Same pipeline as for the other splits: read, clean, encode.
covid_tweet_s_texts, covid_tweet_r_texts, covid_ids = read_file(covid_file_name)

covid_preprocessed_s_texts = pre_processing(covid_tweet_s_texts)
covid_preprocessed_r_texts = pre_processing(covid_tweet_r_texts)

covid_s_ids_list, covid_s_att_list = convert_para_to_id(covid_preprocessed_s_texts, MAX_LENGTH)
covid_r_ids_list, covid_r_att_list = convert_para_to_id(covid_preprocessed_r_texts, MAX_LENGTH)
print(len(covid_preprocessed_s_texts), len(covid_preprocessed_r_texts), sep="\n")


In [None]:
# Stack the per-event encodings along dim 0 into one tensor per field.
covid_s_input_ids = torch.cat(covid_s_ids_list, dim=0)
covid_s_att_masks = torch.cat(covid_s_att_list, dim=0)
covid_r_input_ids = torch.cat(covid_r_ids_list, dim=0)
covid_r_att_masks = torch.cat(covid_r_att_list, dim=0)

# The assignment was left without a value (a syntax error). The batch size only
# affects inference memory use and speed; lower it if the device runs out of memory.
covid_batch_size = 64
covid_data = TensorDataset(covid_s_input_ids, covid_s_att_masks, covid_r_input_ids, covid_r_att_masks)

# Keep dataset order so predictions line up with covid_ids.
covid_loader = DataLoader(covid_data, shuffle=False, batch_size=covid_batch_size)
print(len(covid_loader))

In [None]:
model.eval()

# Predictions are collected across ALL batches; keeping only the last batch's
# output would leave fewer labels than event ids.
covid_preds = []
with torch.no_grad():

    for batch in covid_loader:
        # Inputs must be on the same device as the model.
        batch_s_ids, batch_s_mask, batch_r_ids, batch_r_mask = (t.to(device) for t in batch)

        outputs = model(batch_s_ids, batch_s_mask, batch_r_ids, batch_r_mask)
        # Predicted class index = position of the largest score in each row.
        covid_preds.extend(outputs.argmax(dim=1).cpu().tolist())

# Class index 0 is reported as 'non-rumour'; any other index as 'rumour'.
covid_labels = ['non-rumour' if label == 0 else 'rumour' for label in covid_preds]

print(len(covid_labels))

# The loader is not shuffled, so position i of the labels belongs to covid_ids[i].
assert len(covid_labels) == len(covid_ids), "one prediction is expected per event"
covid_dict = dict(zip(covid_ids, covid_labels))

print(len(covid_dict))

