## Install dependencies

In [1]:
# ! pip install tqdm
# ! pip install tweepy
# ! pip install torch
# ! pip install transformers

## Crawl tweets by API

If you do not have the data yet, uncomment the `crawl_and_save(...)` calls inside `main()` below to crawl the tweets.

In [2]:
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_bearer_token.py
import requests
import json
import time
from tqdm import tqdm

# To set your bearer token, export it before starting the notebook, e.g.
#   export BEARER_TOKEN=<your token>
# SECURITY: a real token used to be hard-coded on this line. It has been
# published in plain text and must be treated as compromised: revoke it in the
# Twitter developer portal and generate a new one.
import os

# Empty string when the variable is unset, so the notebook still imports
# cleanly when no crawling is needed.
bearer_token = os.environ.get("BEARER_TOKEN", "")


def create_url(ids):
    """Build the Twitter API v2 tweet-lookup URL for a batch of tweet ids.

    Args:
        ids: Tweet ids as a single comma-separated string. It may hold one id
            or several (up to 100 per request).

    Returns:
        The lookup URL carrying the ``ids`` and ``tweet.fields`` query
        parameters.
    """
    # Tweet fields are adjustable. Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    fields_query = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld"
    ids_query = "ids=" + ids
    return "https://api.twitter.com/2/tweets?{}&{}".format(ids_query, fields_query)


def bearer_oauth(r):
    """
    Auth hook passed to ``requests``: stamps the outgoing request ``r`` with
    the bearer-token ``Authorization`` header and this script's ``User-Agent``,
    then hands the same request object back.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2TweetLookupPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, timeout=30):
    """Send an authenticated GET request to ``url`` and return the parsed JSON.

    Args:
        url: Fully built endpoint URL (see ``create_url``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled connection
            would hang the whole crawl.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200. The message
            carries the status code and the response body.
    """
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def crawl_and_save(f_in, f_out, batch_size=100):
    """Look up every tweet id listed in ``f_in`` and save each tweet as JSON.

    Args:
        f_in: Open text file (or any iterable of lines) with comma-separated
            tweet ids on each line.
        f_out: Path prefix for the output files. Each tweet is written to
            ``f_out + <tweet id> + ".json"``.
        batch_size: Number of ids sent per request (the lookup endpoint
            accepts at most 100).

    Raises:
        Exception: Propagated from ``connect_to_endpoint`` when a request
            does not return status 200.
    """
    import os

    # Flatten the file into one id list, skipping blank lines and empty fields
    # so that no empty id is ever sent to the API.
    tweet_ids = []
    for line in f_in:
        tweet_ids.extend(tid.strip() for tid in line.split(",") if tid.strip())

    # Create the output directory up front: failing on the first write would
    # waste an already-spent API request.
    out_dir = os.path.dirname(f_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # max `batch_size` tweets per request
    batches = [
        ",".join(tweet_ids[start:start + batch_size])
        for start in range(0, len(tweet_ids), batch_size)
    ]

    for crawl_count, ids in enumerate(tqdm(batches), start=1):
        json_response = connect_to_endpoint(create_url(ids))
        # "data" may be missing from the response (e.g. when none of the
        # requested ids resolves to an available tweet); treat that as an
        # empty batch instead of crashing with KeyError.
        for tweet in json_response.get("data", []):
            with open(f_out + str(tweet["id"]) + ".json", "w") as f_tweet:
                json.dump(tweet, f_tweet)
        # Pause after every 290 requests to stay under the API rate limit.
        if crawl_count % 290 == 0:
            time.sleep(790)

# un-comment the crawl_and_save calls inside main() to crawl tweets
def main():
    """Entry point of the crawl cell.

    As written it only prints the start and finish messages; each crawl is
    enabled by un-commenting the matching ``crawl_and_save`` call.
    """
    print("crawl the train tweets")
    # crawl_and_save(open("data/train.data.txt", "r"), "data/train_tweet/")

    # print("crawl the dev tweets")
    # crawl_and_save(open("data/dev.data.txt", "r"), "data/dev_tweet/")

    # print("crawl the analysis tweets")
    # crawl_and_save(open("data/covid.data.txt", "r"), "data/analysis_tweet/")

    print("Finished!")


if __name__ == "__main__":
    main()


crawl the train tweets
Finished!


## Dataset read-in

Read in the tweets and labels, then sort each source tweet together with its retweets by creation time.

In [1]:
import os
import json
import time
def read_ids_labels(ids, labels, tweet_dir="data/train_tweet/"):
    """Load the crawled tweets and binary labels for one data split.

    Args:
        ids: Open text file (or iterable of lines). Each line holds the
            comma-separated tweet ids of one thread, source tweet first.
        labels: Open text file (or iterable of lines) with one label per
            thread, aligned line by line with ``ids``.
        tweet_dir: Path prefix of the directory holding ``<tweet id>.json``
            files for this split.

    Returns:
        ``(threads, thread_labels)``. ``threads[i]`` is the list of tweet
        dicts found on disk for line ``i``, sorted by ``created_at``; it is
        empty when the source tweet itself is missing. ``thread_labels[i]``
        is 1 for ``"rumour"`` and 0 for anything else.
    """
    threads = []
    thread_labels = []
    for ids_line, label in zip(ids, labels):
        tweet_ids = ids_line.strip().split(",")
        thread = []
        # A thread is only loaded when its source tweet (first id) exists.
        if os.path.exists(tweet_dir + tweet_ids[0] + ".json"):
            for tweet_id in tweet_ids:
                tweet_path = tweet_dir + tweet_id + ".json"
                if os.path.exists(tweet_path):
                    with open(tweet_path, "r") as f_tweet:
                        thread.append(json.load(f_tweet))
        # Sort by time. The parsed (year, month, day, hour, minute, second)
        # tuple is compared directly; going through time.mktime would read
        # the UTC timestamp as local time and depend on timezone/DST.
        thread.sort(
            key=lambda tweet: time.strptime(
                tweet["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ'
            )[:6]
        )
        threads.append(thread)
        thread_labels.append(1 if label.strip() == "rumour" else 0)

    return threads, thread_labels


with open("data/train.data.txt", "r") as train_ids, \
        open("data/train.label.txt", "r") as train_labels:
    train_set, train_label = read_ids_labels(train_ids, train_labels, "data/train_tweet/")

# The crawler cell stores dev tweets under data/dev_tweet/, but this cell used
# to read them from data/train_tweet/. Prefer the dev directory when it exists
# and fall back to the old location so existing data layouts keep working.
dev_tweet_dir = "data/dev_tweet/" if os.path.isdir("data/dev_tweet/") else "data/train_tweet/"
with open("data/dev.data.txt", "r") as dev_ids, \
        open("data/dev.label.txt", "r") as dev_labels:
    dev_set, dev_label = read_ids_labels(dev_ids, dev_labels, dev_tweet_dir)

In [2]:
TEST_TWEET_DIR = "data/tweet-objects/"

# created_at layouts tried in order: the ISO-8601 form used by the training
# data, then the "Wed Oct 10 20:19:24 +0000 2018" form (presumably what these
# tweet objects use, since they carry `in_reply_to_status_id` -- TODO confirm).
_CREATED_AT_FORMATS = ('%Y-%m-%dT%H:%M:%S.%fZ', '%a %b %d %H:%M:%S %z %Y')


def _load_test_tweet(tweet_id):
    """Return the parsed tweet JSON for ``tweet_id``, or None if it is not on disk."""
    tweet_path = TEST_TWEET_DIR + str(tweet_id) + ".json"
    if not os.path.exists(tweet_path):
        return None
    with open(tweet_path, "r") as f_tweet:
        return json.load(f_tweet)


def _created_at_key(tweet):
    """Chronological sort key for a tweet dict.

    Timestamps in a known layout sort by their parsed date. Anything else
    sorts after them by raw string, which is what the plain string sort did
    for every tweet before.
    """
    raw = tweet["created_at"]
    for fmt in _CREATED_AT_FORMATS:
        try:
            return (0, time.strptime(raw, fmt)[:6], raw)
        except ValueError:
            continue
    return (1, (), raw)


test_set = []
with open("data/test.data.txt", "r") as test_ids:
    for test_ids_str in test_ids:
        test_ids_list = test_ids_str.strip().split(",")
        temp_json_list = []
        for test_id in test_ids_list:
            tweet_json = _load_test_tweet(test_id)
            if tweet_json is None:
                continue
            if tweet_json not in temp_json_list:
                temp_json_list.append(tweet_json)
            # While the tweet has in_reply_to_status_id, keep adding the
            # parents to the list. `visited` stops the walk if the chain ever
            # loops back on itself.
            visited = set()
            parent_id = tweet_json.get("in_reply_to_status_id")
            while parent_id is not None and parent_id not in visited:
                visited.add(parent_id)
                tweet_json = _load_test_tweet(parent_id)
                if tweet_json is None:
                    break
                if tweet_json not in temp_json_list:
                    temp_json_list.append(tweet_json)
                parent_id = tweet_json.get("in_reply_to_status_id")
        # Sorting the raw created_at strings is only chronological for
        # ISO-8601; the other layout would be ordered by weekday name.
        temp_json_list.sort(key=_created_at_key)
        test_set.append(temp_json_list)

# BERT model

In [3]:
#load pretrained bert base model
from transformers import BertModel, BertTokenizer

# BERT's WordPiece tokeniser for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pretrained BERT base (uncased) encoder
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Combine a tweet and its retweets into one string

In [4]:
def _mask_users_and_links(text):
    """Replace every @mention with "@" and every http... link with "HTTP"."""
    masked_words = []
    for word in text.split(" "):
        if len(word) > 1 and word[0] == "@":
            masked_words.append("@")
        elif len(word) > 4 and word[0:4] == "http":
            masked_words.append("HTTP")
        else:
            masked_words.append(word)
    return " ".join(masked_words)


def combine_tweet_retweet(train_set):
    """Join each thread into one BERT-style string and tokenize it.

    Args:
        train_set: List of threads; each thread is a list of tweet dicts with
            a ``"text"`` entry.

    Returns:
        A 6-tuple ``(all_tweets, all_tokens, padded_tokens, attn_mask,
        seg_ids, token_ids)``:

        * ``all_tweets``: one ``"[CLS]t1[SEP]t2...[SEP]"`` string per thread.
        * ``all_tokens``: the untruncated token list of every thread.
        * ``padded_tokens``, ``attn_mask``, ``seg_ids``, ``token_ids``:
          features of the LAST thread only, each exactly 512 long.

        NOTE(review): returning single-example features is kept on purpose.
        The embedding demo cell adds one batch dimension to these lists, so
        returning per-thread lists would break it.

        For an empty ``train_set`` the two lists are empty and the four
        feature lists describe an all-padding sequence.
    """
    T = 512  # fixed sequence length; also the padding/truncation target
    all_tweets = []
    all_tokens = []

    for tweets in train_set:
        tweets_list = [_mask_users_and_links(tweet["text"]) for tweet in tweets]
        new_text = "[CLS]" + "[SEP]".join(tweets_list) + "[SEP]"
        all_tweets.append(new_text)
        all_tokens.append(tokenizer.tokenize(new_text))

    # Only the last thread's features are returned, so build them once after
    # the loop instead of recomputing them for every thread.
    last_tokens = all_tokens[-1] if all_tokens else []
    if len(last_tokens) > T:
        # Truncate over-length threads and keep a closing [SEP], the same way
        # SSTDataset does. Previously they were passed through longer than T.
        last_tokens = last_tokens[:T - 1] + ['[SEP]']
    # pad tokens
    padded_tokens = last_tokens + ['[PAD]'] * (T - len(last_tokens))
    # attention mask
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
    # seg id
    seg_ids = [0] * len(padded_tokens)
    # token id
    token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    return all_tweets, all_tokens, padded_tokens, attn_mask, seg_ids, token_ids

# Build the combined text and token features for each split.
(
    train_text,
    train_tokens,
    train_padded_tokens,
    train_attn_mask,
    train_seg_ids,
    train_token_ids,
) = combine_tweet_retweet(train_set)

(
    dev_text,
    dev_tokens,
    dev_padded_tokens,
    dev_attn_mask,
    dev_seg_ids,
    dev_token_ids,
) = combine_tweet_retweet(dev_set)

(
    test_text,
    test_tokens,
    test_padded_tokens,
    test_attn_mask,
    test_seg_ids,
    test_token_ids,
) = combine_tweet_retweet(test_set)

In [5]:
import torch

#Converting all the input vectors to torch tensors and adding a batch dimension
token_ids_t = torch.tensor(train_token_ids).unsqueeze(0) #Shape : [1, 512]
attn_mask_t = torch.tensor(train_attn_mask).unsqueeze(0) #Shape : [1, 512]
seg_ids_t   = torch.tensor(train_seg_ids).unsqueeze(0) #Shape : [1, 512]

#Feed them to bert and get the contextualised embeddings.
#This is inference only, so run it without autograd: otherwise the whole
#forward graph is kept alive for a backward pass that never happens.
with torch.no_grad():
    outputs = bert_model(token_ids_t, attention_mask = attn_mask_t,
                         token_type_ids = seg_ids_t, return_dict=True)
hidden_reps = outputs.last_hidden_state
print(hidden_reps.shape)
print(hidden_reps[0, 0, :10])

torch.Size([1, 512, 768])
tensor([ 0.0340, -0.0099,  0.0684,  0.1292, -0.4194, -0.6827,  0.4952,  0.3319,
         0.0633, -0.5607], grad_fn=<SliceBackward>)


In [6]:
import pandas as pd
# Wrap the combined thread texts in DataFrames; the train and dev frames also
# carry the labels, the test frame has text only.
train_df = pd.DataFrame(dict(text=train_text, label=train_label))
dev_df = pd.DataFrame(dict(text=dev_text, label=dev_label))
test_df = pd.DataFrame(dict(text=test_text))

In [7]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SSTDataset(Dataset):
    """Torch dataset over a DataFrame with ``text`` and ``label`` columns.

    Each item is ``(token_ids, attention_mask, label)``. Both tensors are
    exactly ``maxlen`` long: shorter texts are padded with ``[PAD]``, longer
    ones are cut and closed with ``[SEP]``. No ``[CLS]``/``[SEP]`` markers are
    added here; the text is tokenized as given.
    """

    def __init__(self, df, maxlen):
        """
        Args:
            df: DataFrame holding the ``text`` and ``label`` columns.
            maxlen: Fixed length of every returned token sequence.
        """
        #Store the contents of the file in a pandas dataframe
        self.df = df

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for row position ``index``."""
        #Selecting the sentence and label at the specified position.
        #The DataLoader hands out positions 0..len-1, so look rows up by
        #position (.iloc). A label lookup (.loc) breaks on any frame whose
        #index is not 0..n-1, e.g. after filtering rows.
        sentence = self.df['text'].iloc[index]
        label = self.df['label'].iloc[index]

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens)) #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Pruning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones.
        #Ask the tokenizer for the pad id instead of assuming it is 0.
        attn_mask = (tokens_ids_tensor != self.tokenizer.pad_token_id).long()

        return tokens_ids_tensor, attn_mask, label

In [8]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
#NOTE: these assignments rebind `train_set`/`dev_set` from the raw tweet lists
#to Dataset objects, so re-running the tokenisation cell after this one fails.
train_set = SSTDataset(train_df, maxlen = 512)
dev_set = SSTDataset(dev_df, maxlen = 512)
#Creating instances of training and development dataloaders
#Training batches are reshuffled every epoch so the optimiser does not see the
#threads in file order each time; evaluation order is left untouched.
train_loader = DataLoader(train_set, batch_size = 16, shuffle = True, num_workers = 2)
dev_loader = DataLoader(dev_set, batch_size = 16, num_workers = 2)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [9]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by a single-logit linear head."""

    def __init__(self):
        super().__init__()
        # Encoder: BERT base (uncased), initialised from the pretrained checkpoint
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Head: maps the 768-dimensional [CLS] embedding to one logit, since
        # the task is binary classification
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks,
                          used to keep PAD tokens from contributing
        Returns:
            The output of the linear head applied to the first position of
            every sequence (raw logits, no sigmoid applied).
        '''
        # Contextualised representations for every position
        encoder_out = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)

        # The first position holds the [CLS] representation
        cls_rep = encoder_out.last_hidden_state[:, 0]

        return self.cls_layer(cls_rep)

In [22]:
gpu = 0 #gpu ID

print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
#Build the model and move it onto the selected GPU in one step
net = SentimentClassifier().cuda(gpu)
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [14]:
import torch.nn as nn
import torch.optim as optim

# Adam over every parameter of the model, learning rate 2e-5
opti = optim.Adam(net.parameters(), lr = 2e-5)
# Binary cross-entropy computed directly on the raw logits
criterion = nn.BCEWithLogitsLoss()

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` for ``max_eps`` epochs and checkpoint on dev accuracy.

    Args:
        net: Model called as ``net(seq, attn_masks)`` to produce logits.
        criterion: Loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        opti: Optimizer stepped once per training batch.
        train_loader: Iterable of ``(seq, attn_masks, labels)`` training batches.
        dev_loader: Batches passed to ``evaluate`` after every epoch.
        max_eps: Number of epochs to run.
        gpu: CUDA device id that every batch is moved to.

    Returns:
        The epoch index with the highest development accuracy, or ``None``
        if no epoch's accuracy exceeded the initial value of 0.

    Side effects:
        Prints progress every 100 iterations and after each epoch. Whenever
        the development accuracy strictly improves, saves ``net.state_dict()``
        to ``sstcls_<epoch>.dat``.
    """

    best_acc = 0
    best_ep = None
    # Start of the current timing window; reset after every progress print
    st = time.time()
    for ep in range(max_eps):

        # Back to training mode: evaluate() switches the model to eval mode
        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            # Progress report on the current batch every 100 iterations
            if it % 100 == 0:

                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()


        # End of epoch: score the model on the development set
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        # Checkpoint only on a strict improvement over the best accuracy so far
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))
            best_ep = ep
    return best_ep

def get_accuracy_from_logits(logits, labels):
    """Fraction of examples whose thresholded prediction equals its label.

    Logits go through a sigmoid and count as class 1 when the probability is
    above 0.5. All size-1 dimensions are squeezed away before the comparison
    with ``labels``. The result is a scalar tensor.
    """
    predicted = (torch.sigmoid(logits) > 0.5).long().squeeze()
    return (predicted == labels).float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Score ``net`` on every batch of ``dataloader`` without gradients.

    Puts the model in eval mode, moves each batch to CUDA device ``gpu`` and
    returns ``(mean accuracy, mean loss)``. Both are unweighted averages over
    batches, not over individual examples.
    """
    net.eval()

    total_acc = 0
    total_loss = 0
    n_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            seq, attn_masks, labels = (part.cuda(gpu) for part in batch)
            logits = net(seq, attn_masks)
            total_loss += criterion(logits.squeeze(-1), labels.float()).item()
            total_acc += get_accuracy_from_logits(logits, labels)
            n_batches += 1

    return total_acc / n_batches, total_loss / n_batches

In [None]:
#fine-tune the model for a fixed number of epochs
num_epoch = 5

train(net, criterion, opti, train_loader, dev_loader, max_eps=num_epoch, gpu=gpu)

#### If the best model has already been saved, load it

In [50]:
#NOTE: this cell used to start with a bare `torch.no_grad()` statement. That
#only creates a context-manager object and throws it away; it does not
#disable gradients. It was removed as dead code. To run inference without
#autograd, wrap the inference code itself in `with torch.no_grad():`.
gpu = 0 #gpu ID

net = SentimentClassifier()
net.cuda(gpu) #Enable gpu support for the model

#Restore the fine-tuned weights saved by train(). The checkpoint file name
#carries the epoch that produced it.
net.load_state_dict(torch.load('sstcls_3.dat'))
net.eval() #Inference mode: turns dropout off
print("Load the best model.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load the best model.


In [56]:
def prediction_bert(test_text, maxlen = 512):
    """Predict a 0/1 label for every text with the global model ``net``.

    Args:
        test_text: Iterable of strings. A string may already be wrapped in
            ``[CLS] ... [SEP]`` (as produced by ``combine_tweet_retweet``) or
            be a raw sentence.
        maxlen: Fixed token length; shorter inputs are padded with ``[PAD]``,
            longer ones are cut and closed with ``[SEP]``.

    Returns:
        A list with one int per input text: 1 when the sigmoid of the
        model's logit is above 0.5, otherwise 0.
    """
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Send the inputs to whichever device the model's weights are on,
    # instead of assuming the global `gpu` id still matches.
    device = next(net.parameters()).device

    predictions = []
    # Inference only: skip autograd so no graph is kept for each example.
    with torch.no_grad():
        for sentence in test_text:
            tokens = tokenizer.tokenize(sentence)
            # Add the special tokens only when they are missing. Text from
            # combine_tweet_retweet already carries them; adding them again
            # produced "[CLS] [CLS] ... [SEP] [SEP]", which the model never
            # saw during training.
            if not tokens or tokens[0] != '[CLS]':
                tokens = ['[CLS]'] + tokens
            if tokens[-1] != '[SEP]':
                tokens = tokens + ['[SEP]']

            if len(tokens) < maxlen:
                tokens = tokens + ['[PAD]'] * (maxlen - len(tokens))
            else:
                tokens = tokens[:maxlen-1] + ['[SEP]']

            attn_mask = [1 if token != '[PAD]' else 0 for token in tokens]
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            token_ids_tensor = torch.tensor(token_ids).unsqueeze(0).to(device) #Shape : [1, maxlen]
            attn_mask_tensor = torch.tensor(attn_mask).unsqueeze(0).to(device) #Shape : [1, maxlen]

            logit = net(token_ids_tensor, attn_mask_tensor)
            predictions.append(int(torch.sigmoid(logit).item() > 0.5))

    return predictions

# Label every combined test thread with the loaded model
predictions = prediction_bert(test_text, maxlen=512)

In [54]:
# write submission file: a header row, then one "<index>,<label>" row for each
# prediction in order
with open("submission.csv", "w") as f:
    f.write("Id,Predicted\n")
    f.writelines(
        "{},{}\n".format(i, label) for i, label in enumerate(predictions)
    )