## Crawl tweets by API

In [None]:
# ! pip install tqdm
# ! pip install tweepy
# ! pip install torch
# ! pip install transformers
# ! pip install emoji

In [None]:
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_bearer_token.py
import json
import os
import time

import requests
from tqdm import tqdm

# Bearer token used to authenticate against the Twitter API v2.
# Supply it through the BEARER_TOKEN environment variable where possible.
# SECURITY NOTE(review): the literal fallback below is a credential committed
# to source. It is kept only so existing runs keep working; it should be
# revoked and then removed from the notebook.
bearer_token = os.environ.get(
    "BEARER_TOKEN",
    "AAAAAAAAAAAAAAAAAAAAAGdZbgEAAAAAlXMiIg%2F96Ygnv%2FmvFDMsWb6LuSw%3DPTSIRz5g0G9RaB9pxp8QhdTtHxXnhEZsjLkpNyqQBR8EfRy8WS",
)


def create_url(ids):
    """Build the Tweet-lookup URL for a comma-separated string of tweet IDs.

    ``ids`` may name a single Tweet or up to 100 comma-separated IDs.
    """
    # Tweet fields are adjustable. Options include:
    #   attachments, author_id, context_annotations, conversation_id,
    #   created_at, entities, geo, id, in_reply_to_user_id, lang,
    #   non_public_metrics, organic_metrics, possibly_sensitive,
    #   promoted_metrics, public_metrics, referenced_tweets, source, text,
    #   and withheld
    tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld"
    query = "&".join(("ids=" + ids, tweet_fields))
    return "https://api.twitter.com/2/tweets?" + query


def bearer_oauth(r):
    """
    Auth hook for ``requests``: stamp the bearer token and a user agent
    onto the outgoing request ``r`` and hand it back.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2TweetLookupPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, timeout=30):
    """Send an authenticated GET request to ``url`` and return the decoded JSON.

    Args:
        url: Fully-built endpoint URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Raises:
        Exception: If the response status code is not 200; the message
            carries the status code and the response body.
    """
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()


def crawl_and_save(f_in, f_out):
    """Download every tweet listed in ``f_in`` and save each one as JSON.

    Args:
        f_in: Open text file whose lines hold comma-separated tweet IDs.
        f_out: Path prefix for the output files; each tweet is written to
            ``f_out + <tweet id> + ".json"``, so a directory prefix needs its
            trailing separator.
    """
    batch_size = 100  # at most 100 IDs per lookup request

    # Flatten all lines into one ID list, dropping the empty strings that
    # blank lines or trailing commas would otherwise produce.
    tweet_ids = [
        tweet_id
        for line in f_in
        for tweet_id in line.strip().split(",")
        if tweet_id
    ]
    batches = [
        ",".join(tweet_ids[start:start + batch_size])
        for start in range(0, len(tweet_ids), batch_size)
    ]

    for crawl_count, ids in enumerate(tqdm(batches), start=1):
        json_response = connect_to_endpoint(create_url(ids))
        # NOTE(review): a response may carry no "data" key (presumably when
        # none of the requested tweets can be returned); treat it as empty
        # rather than aborting the whole crawl.
        for tweet in json_response.get("data", []):
            with open(f_out + str(tweet["id"]) + ".json", "w") as out_file:
                json.dump(tweet, out_file)
        # Pause after every 290 requests, presumably to stay under the
        # API rate limit.
        if crawl_count % 290 == 0:
            time.sleep(790)

# Un-comment the crawl_and_save calls inside main() to crawl tweets.
def main():
    """Entry point of the crawl cell; every crawl call is currently disabled."""
    start_message, done_message = "crawl the train tweets", "Finished!"
    print(start_message)
    # crawl_and_save(open("data/train.data.txt", "r"), "data/train_tweet/")
    # print("crawl the dev tweets")
    # crawl_and_save(open("data/dev.data.txt", "r"), "data/dev_tweet/")
    # print("crawl the analysis tweets")
    # crawl_and_save(open("data/covid.data.txt", "r"), "data/analysis_tweet/")
    print(done_message)


if __name__ == "__main__":
    main()


## Dataset read-in

Read in the tweets and labels, then sort each tweet and its retweets by creation time.

In [29]:
import os
import json
import time
def read_ids_labels(ids, labels, test=False, tweet_dir="data/train_tweet/"):
    """Load the crawled tweets and binary labels for one data split.

    Args:
        ids: Open text file; each line holds the comma-separated tweet IDs
            of one event.
        labels: Open text file with one label per line, aligned with ``ids``.
        test: Unused; kept so existing callers keep working.
        tweet_dir: Path prefix of the directory holding ``<tweet id>.json``
            files for this split (trailing separator included).

    Returns:
        A pair ``(events, targets)``: ``events[i]`` is the list of tweet
        dicts of event ``i`` in chronological order (empty when the first
        listed tweet was not crawled) and ``targets[i]`` is 1 for the label
        "rumour" and 0 otherwise.
    """
    events = []
    targets = []
    for ids_line, label in zip(ids, labels):
        tweet_ids = ids_line.strip().split(",")
        tweets = []
        # Only load an event when its first listed tweet (presumably the
        # source tweet) is on disk; other missing tweets are skipped.
        if os.path.exists(tweet_dir + tweet_ids[0] + ".json"):
            for tweet_id in tweet_ids:
                tweet_path = tweet_dir + tweet_id + ".json"
                if os.path.exists(tweet_path):
                    with open(tweet_path, "r") as tweet_file:
                        tweets.append(json.load(tweet_file))
        # Sort by timestamp. The parsed struct_time is compared directly:
        # going through time.mktime would interpret these UTC stamps as local
        # time and can misorder tweets around daylight-saving transitions.
        tweets.sort(
            key=lambda tweet: time.strptime(tweet["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ')
        )
        events.append(tweets)
        targets.append(1 if label.strip() == "rumour" else 0)

    return events, targets


with open("data/train.data.txt", "r") as train_ids, \
        open("data/train.label.txt", "r") as train_labels:
    train_set, train_label = read_ids_labels(train_ids, train_labels)

# The crawler cell writes dev tweets to data/dev_tweet/, so read them from
# there instead of the training directory.
with open("data/dev.data.txt", "r") as dev_ids, \
        open("data/dev.label.txt", "r") as dev_labels:
    dev_set, dev_label = read_ids_labels(dev_ids, dev_labels, tweet_dir="data/dev_tweet/")
print("Finished!")

Finished!


In [33]:
# Inspect the first training event: a list of tweet dicts sorted by creation time.
print(train_set[0])

[{'entities': {'urls': [{'start': 87, 'end': 110, 'url': 'https://t.co/q133xXBiwl', 'expanded_url': 'https://twitter.com/ucoptempe/status/1250219116993974272/photo/1', 'display_url': 'pic.twitter.com/q133xXBiwl'}], 'hashtags': [{'start': 70, 'end': 86, 'tag': 'COVID19Malaysia'}]}, 'id': '1250219116993974272', 'context_annotations': [{'domain': {'id': '123', 'name': 'Ongoing News Story', 'description': "Ongoing News Stories like 'Brexit'"}, 'entity': {'id': '1220701888179359745', 'name': 'COVID-19'}}, {'domain': {'id': '30', 'name': 'Entities [Entity Service]', 'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'}, 'entity': {'id': '825047692124442624', 'name': 'Food', 'description': 'Food'}}, {'domain': {'id': '30', 'name': 'Entities [Entity Service]', 'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'}, 'entity': {'id': '824777229892661248', 'name': 'Generic Food', '

In [34]:
# Read in the test data: each line of the ID file lists the comma-separated
# tweet IDs of one event; tweets missing from disk are skipped.
test_set = []
with open("data/test.data.txt", "r") as test_ids:
    for test_ids_str in test_ids:
        test_ids_list = test_ids_str.strip().split(",")
        temp_json_list = []
        for test_id in test_ids_list:
            test_path = "data/tweet-objects/" + test_id + ".json"
            if os.path.exists(test_path):
                with open(test_path, "r") as tweet_file:
                    temp_json_list.append(json.load(tweet_file))

        # Sort by timestamp. The parsed struct_time is compared directly:
        # time.mktime would treat these "+0000" stamps as local time and can
        # misorder tweets around daylight-saving transitions.
        temp_json_list = sorted(temp_json_list, key=lambda x: time.strptime(x["created_at"], '%a %b %d %H:%M:%S +0000 %Y'))
        # Alternative timestamp format (as used for the train/dev tweets):
        # temp_json_list = sorted(temp_json_list, key=lambda x: time.strptime(x["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ'))
        test_set.append(temp_json_list)

print("Finished!")

Finished!


Try the BERT model from workshop 10

In [23]:
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW
import torch
# Hugging Face checkpoint name shared by the encoder and the tokenizer.
pre_trained_weights = "vinai/bertweet-covid19-base-uncased"
# Bare encoder; the checkpoint's language-model head weights are not used.
bertweet = AutoModel.from_pretrained(pre_trained_weights)
# NOTE(review): normalization=True presumably makes the tokenizer normalise
# raw tweets itself (mentions, URLs, emoji) — confirm against the BERTweet docs.
tokenizer = AutoTokenizer.from_pretrained(pre_trained_weights, normalization=True)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Combine a tweet and its retweets into one string

In [24]:
def combine_tweet_retweet(train_set):
    """Collapse every event (a list of tweet dicts) into one string.

    Within each tweet, user mentions become "@USER" and links become
    "HTTPURL"; the tweets of one event are then joined with the tokenizer's
    separator token. Returns one string per event, in input order.
    """

    def mask(word):
        # A lone "@" or the bare word "http" is left untouched.
        if len(word) > 1 and word.startswith("@"):
            return "@USER"
        if len(word) > 4 and word.startswith("http"):
            return "HTTPURL"
        return word

    return [
        tokenizer.sep_token.join(
            " ".join(mask(word) for word in tweet["text"].split(" "))
            for tweet in tweets
        )
        for tweets in train_set
    ]

train_text = combine_tweet_retweet(train_set)
dev_text = combine_tweet_retweet(dev_set)
test_text = combine_tweet_retweet(test_set)

In [35]:
# Inspect the combined text of the first training event.
train_text[0]

'4. Can eating garlic help prevent infection with the new coronavirus? #COVID19Malaysia HTTPURL</s>5. Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? HTTPURL</s>6. Do vaccines against pneumonia protect you against the new coronavirus? HTTPURL</s>7. Can spraying alcohol or chlorine all over your body kill the new coronavirus? #Chamber HTTPURL</s>8. How effective are thermal scanners in detecting people infected with the new coronavirus? HTTPURL</s>9. Can an ultraviolet disinfection lamp kill the new coronavirus? HTTPURL</s>10. Are hand dryers effective in killing the new coronavirus? HTTPURL</s>11. The new coronavirus CANNOT be transmitted through mosquito bites. HTTPURL</s>12. Taking a hot bath does not prevent the new coronavirus disease HTTPURL</s>13. Cold weather and snow CANNOT kill the new coronavirus. HTTPURL</s>14. COVID-19 virus can be transmitted in areas with hot and humid climates HTTPURL</s>15. Drinking alcohol does not protect y

In [26]:
from torch.utils.data import Dataset
class TweetDataset(Dataset):
    """Labelled dataset of combined tweet strings, tokenized per batch.

    Args:
        tweet_list: One text string per example.
        label_list: One integer label per example, aligned with ``tweet_list``.
        tokenizer: Callable tokenizer applied to each batch in ``collate_fn``.
        max_length: Token limit used when truncating. Defaults to 128 because
            BERTweet checkpoints are trained with sequences of at most 128
            tokens; longer inputs would run past the encoder's position
            embeddings. NOTE(review): confirm against the checkpoint config.
    """

    def __init__(self, tweet_list, label_list, tokenizer, max_length=128):
        self.tweet_list = tweet_list
        self.label_list = label_list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweet_list)

    def __getitem__(self, idx):
        return self.tweet_list[idx], self.label_list[idx]

    def collate_fn(self, data):
        """Tokenize a list of ``(text, label)`` pairs into one padded batch.

        Returns a dict with "input_ids", "attn_mask" and a long tensor "label".
        """
        tweets = [text for text, _ in data]
        labels = [label for _, label in data]
        tokens = self.tokenizer(
            tweets,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": tokens["input_ids"],
            "attn_mask": tokens["attention_mask"],
            "label": torch.tensor(labels).long(),
        }

# Wrap the combined texts and labels in datasets.
# NOTE: this rebinds train_set / dev_set, which until now held the raw tweet
# lists; re-running the combine_tweet_retweet cell after this one would pass
# these datasets instead of the raw tweets.
train_set = TweetDataset(train_text, train_label, tokenizer)
dev_set = TweetDataset(dev_text, dev_label, tokenizer)

In [27]:
class testDataset(Dataset):
    """Unlabelled dataset of combined tweet strings, tokenized per batch.

    Args:
        tweet_list: One text string per example.
        tokenizer: Callable tokenizer applied to each batch in ``collate_fn``.
        max_length: Token limit used when truncating. Defaults to 128 because
            BERTweet checkpoints are trained with sequences of at most 128
            tokens; longer inputs would run past the encoder's position
            embeddings. NOTE(review): confirm against the checkpoint config.
    """

    def __init__(self, tweet_list, tokenizer, max_length=128):
        self.tweet_list = tweet_list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweet_list)

    def __getitem__(self, idx):
        return self.tweet_list[idx]

    def collate_fn(self, data):
        """Tokenize a list of texts into one padded batch.

        Returns a dict with "input_ids" and "attn_mask".
        """
        tweets = list(data)
        tokens = self.tokenizer(
            tweets,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": tokens["input_ids"],
            "attn_mask": tokens["attention_mask"],
        }

# Wrap the combined test texts in a dataset.
# NOTE: this rebinds test_set, which until now held the raw test tweet lists.
test_set = testDataset(test_text, tokenizer)

Create the classification (CLS) model

In [19]:
from torch import nn
from transformers import AutoModel


class RumourCLS(nn.Module):
    """Pre-trained encoder topped with a linear two-way classifier."""

    def __init__(self, pre_encoder):
        """Load the encoder named ``pre_encoder`` and size the head to match it."""
        super().__init__()
        self.encoder = AutoModel.from_pretrained(pre_encoder)
        self.cls = nn.Linear(self.encoder.config.hidden_size, 2)

    def forward(self, reps, masks):
        """Return the logits computed from the first token's hidden state.

        ``reps`` is passed to the encoder as ``input_ids`` and ``masks`` as
        ``attention_mask``.
        """
        encoded = self.encoder(input_ids=reps, attention_mask=masks)
        first_token = encoded.last_hidden_state[:, 0]
        return self.cls(first_token)

# Bind the instance to its own name. Assigning it to `RumourCLS` would replace
# the class with an instance, and the later `RumourCLS(pre_trained_weights)`
# call in RumourDetection.__init__ would then invoke forward() instead of
# constructing a model.
rumour_cls = RumourCLS(pre_trained_weights)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AdamW
from pytorch_lightning import LightningModule
from pathlib import Path
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support


class RumourDetection(LightningModule):
    """Lightning module that fine-tunes ``RumourCLS`` for rumour detection.

    The data loaders read the notebook-level ``train_set``, ``dev_set`` and
    ``test_set`` datasets; test predictions are written to
    ``modelData/submissions.csv``.
    """

    BATCH_SIZE = 16   # batch size of every data loader
    NUM_EPOCHS = 100  # epoch count assumed when sizing the LR schedule
    NUM_WORKERS = 16  # worker processes per data loader

    def __init__(self, pre_trained_weights):
        """Initialize model and tokenizer."""
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_weights)
        self.cls = RumourCLS(pre_trained_weights)
        self._reset_bookkeeping()

    def re_init(self):
        """Reload the tokenizer and reset the bookkeeping attributes.

        Unlike ``__init__`` this reads the module-level ``pre_trained_weights``
        and leaves the classifier untouched.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_weights)
        self._reset_bookkeeping()

    def _reset_bookkeeping(self):
        """Set output paths, the metric store and the per-split example limits."""
        self.output_dir = Path("modelData")
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.metrics = dict()
        self.metrics["val"] = []
        self.val_metric = "f1_score"

        # -1 means use all; stored as None.
        n_observations_per_split = {
            "train": -1,
            "val": -1,
            "test": -1,
        }
        self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    def total_steps(self) -> int:
        """The number of total training steps that will be run. Used for lr scheduler purposes."""
        # len() of the loader already accounts for drop_last, so this is the
        # exact number of optimizer steps per epoch.
        return len(self.train_dataloader()) * self.NUM_EPOCHS

    def configure_optimizers(self):
        """Prepare the AdamW optimizer and a cosine schedule with warm-up."""
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        encoder_parameters = [
            {'params': [p for n, p in self.cls.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 1e-3},
            {'params': [p for n, p in self.cls.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        encoder_optimizer = AdamW(encoder_parameters, lr=5e-5)

        total_steps = int(self.total_steps())
        # The first 10% of the steps are warm-up.
        encoder_scheduler = get_cosine_schedule_with_warmup(
            encoder_optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps,
        )
        encoder_scheduler = {"scheduler": encoder_scheduler, "interval": "step", "frequency": 1}
        return [encoder_optimizer], [encoder_scheduler]

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one batch; log it every 20 batches."""
        logits = self.cls(batch["input_ids"], batch["attn_mask"])
        labels = batch["label"]
        loss = torch.nn.functional.cross_entropy(logits, labels)
        if batch_idx % 20 == 0:
            self.logger.log_metrics({"loss": loss.item()})
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the predicted and gold labels of one validation batch as lists."""
        logits = self.cls(batch["input_ids"], batch["attn_mask"])
        preds = torch.argmax(logits, dim=1).tolist()
        labels = batch["label"]
        return {"preds": preds, "labels": labels.tolist()}

    def test_step(self, batch, batch_idx):
        """Return the predicted labels of one test batch as a list."""
        logits = self.cls(batch["input_ids"], batch["attn_mask"])
        preds = torch.argmax(logits, dim=1).tolist()
        return {"preds": preds}

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Build the data loader for the "train", "val" or "test" split.

        The training loader always shuffles and drops the last incomplete
        batch; the other loaders honour ``shuffle``.

        Raises:
            KeyError: If ``type_path`` is not one of the three split names.
        """
        if type_path == "train":
            dataset = train_set
        elif type_path == "val":
            dataset = dev_set
        elif type_path == "test":
            dataset = test_set
        else:
            raise KeyError(type_path)

        is_train = type_path == "train"
        return DataLoader(
            dataset,
            batch_size=batch_size,
            collate_fn=dataset.collate_fn,
            shuffle=True if is_train else shuffle,
            num_workers=self.NUM_WORKERS,
            drop_last=is_train,
        )

    def train_dataloader(self):
        return self.get_dataloader("train", batch_size=self.BATCH_SIZE, shuffle=True)

    def val_dataloader(self):
        return self.get_dataloader("val", batch_size=self.BATCH_SIZE)

    def test_dataloader(self):
        return self.get_dataloader("test", batch_size=self.BATCH_SIZE)

    def validation_epoch_end(self, outputs):
        """Compute precision/recall/F1 over the whole validation epoch and log them."""
        pairs = [pair for x in outputs for pair in zip(x["preds"], x["labels"])]
        preds = [pred for pred, _ in pairs]
        labels = [label for _, label in pairs]

        p, r, f, _ = precision_recall_fscore_support(labels, preds, pos_label=1, average="binary")
        # Logged under the name that checkpointing / early stopping monitor.
        self.log(self.val_metric, f, logger=False)

        val_metrics = {
            "val_precision": p,
            "val_recall": r,
            "val_f1_score": f,
        }
        self.logger.log_metrics(val_metrics)
        self.metrics["val"].append(val_metrics)

    def test_epoch_end(self, outputs):
        """Write all test predictions to ``modelData/submissions.csv``."""
        preds = [pred for x in outputs for pred in x["preds"]]

        od = Path("modelData")
        # Create the directory first so writing does not fail on a fresh run.
        od.mkdir(parents=True, exist_ok=True)
        # results_file = od / "test.predictions.csv"
        results_file = od / "submissions.csv"

        with open(results_file, "w") as writer:
            writer.write("Id,Predicted\n")
            for xid, x in enumerate(preds):
                writer.write(str(xid) + "," + str(x) + "\n")


In [None]:
#!/usr/bin/env python
import argparse
import random
import torch
from pathlib import Path
import pytorch_lightning as pl
from pl_model import RumourDetection
from utils import pickle_save
from callback import LoggingCallback


def add_model_specific_args(parser):
    """Register the model and training command-line options on ``parser``.

    Returns the same parser so calls can be chained.
    """
    add = parser.add_argument

    # Optimisation and data-loading settings.
    add("--learning_rate", default=5e-5, type=float, help="The initial learning rate.")
    add("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    add("--num_workers", default=8, type=int, help="kwarg passed to DataLoader")
    add("--warmup_ratio", default=0.1, type=float, help="warm up rate steps")
    add("--save_top_k", type=int, default=1, required=False, help="How many checkpoints to save")
    for split in ("train", "eval", "test"):
        add(f"--{split}_batch_size", default=32, type=int)

    add("--seed", type=int, default=42, help="random seed for initialization")
    # Exposed as --gradient_accumulation_steps but stored as accumulate_grad_batches.
    add(
        "--gradient_accumulation_steps",
        dest="accumulate_grad_batches",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )

    # The only mandatory option.
    add(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    add(
        "--max_length",
        default=60,
        type=int,
        help="The maximum sentence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.",
    )

    add(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs. So val_check_interval will effect it.",
    )
    add("--pre_encoder", default="bert-base-uncased", help="pre-trained encoder")
    add("--wandb", action="store_true", default=False, help="Whether to use wandb.")
    add("--fp16", action="store_true", default=False)
    for split in ("train", "val", "test"):
        add(f"--n_{split}", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    return parser


def train_model(model, args, logger):
    """Seed the RNGs, build the callbacks and trainer, fit ``model``.

    Returns the fitted trainer.
    """
    # Seed every random number generator with args.seed.
    pl.seed_everything(args.seed)
    random.seed(args.seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(args.seed)

    # Checkpointing keeps the save_top_k checkpoints with the highest f1_score.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=args.output_dir,
        filename="{f1_score:.4f}",
        monitor="f1_score",
        mode="max",
        save_top_k=args.save_top_k,
    )

    # lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
    extra_callbacks = [LoggingCallback(), checkpoint_callback]

    # Early stopping is only enabled for a positive patience.
    if args.early_stopping_patience > 0:
        extra_callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="f1_score",
                mode="max",
                patience=args.early_stopping_patience,
                verbose=True,
            )
        )

    trainer = pl.Trainer.from_argparse_args(
        args,
        weights_summary="top",
        callbacks=extra_callbacks,
        logger=logger,
        precision=16 if args.fp16 else 32,
    )
    trainer.fit(model)

    return trainer


def main(args):
    """Train a model with the parsed ``args``, test its best checkpoint, return it."""
    odir = Path(args.output_dir)
    # parents=True so a nested output directory can be created in one go.
    odir.mkdir(parents=True, exist_ok=True)

    # init model
    model = RumourDetection(args)

    # init logger; True is handed to the trainer as-is when wandb is off
    if args.wandb:
        from pytorch_lightning.loggers import WandbLogger
        logger = WandbLogger(name="twitter", project="NLP")
    else:
        logger = True

    trainer = train_model(model, args, logger)

    pickle_save(model.hparams, model.hparams_save_path)
    trainer.test(verbose=False, ckpt_path="best")
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = add_model_specific_args(parser)
    args = parser.parse_args()
    main(args)
