In [1]:
!pip install transformers datasets;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 55.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 k

In [2]:
! pip3 install emoji==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 3.1 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49735 sha256=62e47ef6355751ebbae449ed00bdf66dc416de78b607dfdd2ebfeab8392b852d
  Stored in directory: /root/.cache/pip/wheels/43/3d/82/e7baffa5e86346c6178d7750dba6e8ef063282a37fc563f8f8
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


In [3]:
# Mount Google Drive (Colab only). The dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import os
import pandas as pd

import torch
import torch.nn as nn

from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline, FeatureExtractionPipeline

# Set before any tokenizer is created or used in this notebook.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to
# allow parallel tokenization - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [5]:
def processed_tweet_list(lst):
  """Return the user's tweets, or a placeholder list when there are none.

  Args:
    lst: the record's tweets (assumed to be a list of strings - confirm
      against the data files); may be empty or None.

  Returns:
    `lst` itself when it holds at least one element, otherwise a new
    one-element list containing a single newline string, so that callers
    always get a non-empty list.
  """
  # None is treated like "no tweets" instead of crashing in len().
  if lst is None or len(lst) == 0:
    return ['\n']
  return lst

def processed_des(des):
  """Wrap a profile description in a one-element list.

  Args:
    des: the record's description (assumed to be a string - confirm against
      the data files); may be empty or None.

  Returns:
    [des] when the description is non-empty, otherwise a one-element list
    containing a single newline string as a placeholder.
  """
  # None is treated like an empty description instead of crashing in len().
  if des is None or len(des) == 0:
    return ['\n']
  return [des]

def get_dataset(path_to_data):
  """Load one JSON split and convert it into a `datasets.Dataset`.

  The file must contain a JSON array of records, each providing the keys
  'description', 'tweets', 'label' and 'id'.

  Args:
    path_to_data: path of the JSON file to read.

  Returns:
    A Dataset with three columns:
      "text":  the processed description followed by the processed tweets,
               concatenated into one list per record,
      "label": 1 when the record's label equals 'bot', otherwise 0,
      "uid":   the record's 'id' value.
  """
  # Read as UTF-8 explicitly so decoding does not depend on the platform's
  # default encoding.
  with open(path_to_data, encoding='utf-8') as f:
    records = json.load(f)

  rows = [
      [processed_des(rec['description']) + processed_tweet_list(rec['tweets']),
       1 if rec['label'] == 'bot' else 0,
       rec['id']]
      for rec in records
  ]
  # Naming the columns in the constructor (rather than assigning df.columns
  # afterwards) avoids a length-mismatch error when the split is empty.
  df = pd.DataFrame(rows, columns=["text", "label", "uid"])
  return Dataset.from_pandas(df)

In [6]:
# Directory on the mounted Drive that holds the dataset JSON splits.
DATA_DIR = '/content/drive/MyDrive/advNLP/twibot20'

# Paths of the three splits passed to get_dataset().
data_args = {'path_to_train' : f'{DATA_DIR}/train.json',
             'path_to_test' : f'{DATA_DIR}/test.json',
             'path_to_val' : f'{DATA_DIR}/val.json'}

# Hugging Face model id used for both the tokenizer and the encoder.
checkpoint = "vinai/bertweet-base"

In [7]:
from transformers import AutoTokenizer, AutoModel

# Maximum number of tokens kept per text; longer texts are truncated when the
# tokenizer is applied further down.
max_length = 64
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The model is only used to extract embeddings, so evaluation mode is made
# explicit. Chaining onto the assignment keeps the cell from printing the
# model repr.
model = model.to(device).eval()

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Read the three splits (train, then val, then test) and bundle them into a
# single DatasetDict keyed by split name.
train_data, val_data, test_data = (
    get_dataset(data_args[path_key])
    for path_key in ('path_to_train', 'path_to_val', 'path_to_test')
)

data = DatasetDict({'train': train_data, 'test': test_data, 'valid': val_data})

In [9]:
def _tokenize_texts(example):
  """Tokenize every text of one example in a single tokenizer call.

  With batched=False the mapper receives one record at a time, so
  example['text'] is that record's whole list (description first, then
  tweets). Texts are truncated to `max_length` tokens.
  NOTE(review): padding=True presumably pads to the longest text within this
  one call, not across the dataset - confirm against the tokenizer docs.
  """
  return tokenizer(example['text'], max_length=max_length, padding=True, truncation=True)

data = data.map(_tokenize_texts, batched=False)

  0%|          | 0/8278 [00:00<?, ?ex/s]

  0%|          | 0/1183 [00:00<?, ?ex/s]

  0%|          | 0/2365 [00:00<?, ?ex/s]

In [10]:
def process_example(example):
  """Run the encoder on one tokenized example and return its pooled outputs.

  Args:
    example: a record providing 'input_ids', 'attention_mask', 'label' and
      'uid'. NOTE(review): the id/mask fields are presumably 2-D
      (texts x tokens) as produced by the tokenization step - confirm.

  Returns:
    A dict with 'features' (the model's `pooler_output`, moved to the CPU),
    plus the example's 'label' and 'uid' passed through unchanged.
  """
  ids = torch.tensor(example['input_ids']).to(device)
  mask = torch.tensor(example['attention_mask'], dtype=torch.long).to(device)
  # Gradients are not needed for feature extraction.
  with torch.no_grad():
    pooled = model(ids, mask).pooler_output
  return {'features' : pooled.cpu(), 'label' : example['label'], "uid" : example["uid"]}

In [15]:
# Compute the encoder features for every example of every split, one example
# per call (batched=False). This rebinds `data` to the mapped result.
data = data.map(process_example, batched=False)

  0%|          | 0/8278 [00:00<?, ?ex/s]

  0%|          | 0/1183 [00:00<?, ?ex/s]

  0%|          | 0/2365 [00:00<?, ?ex/s]

In [16]:
# Write each split to its own JSON file next to the source data on Drive.
for split in data:
    dataset = data[split]
    out_path = f'/content/drive/MyDrive/advNLP/twibot20/{split}_bertweet_emb.json'
    dataset.to_json(out_path)

Creating json from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]