In [None]:
!pip install transformers datasets;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 56.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.0 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 104.1 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Colle

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os
import pandas as pd

import torch
import torch.nn as nn

from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline, FeatureExtractionPipeline

# Set TOKENIZERS_PARALLELISM for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers library to keep
# multi-threaded tokenization enabled — confirm against its documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
def processed_tweet_list(lst):
  r"""Return the tweet list, or a one-element placeholder when there are no tweets.

  Args:
    lst: Sequence of tweets; may be empty or None (e.g. a JSON null).

  Returns:
    `lst` itself when it contains at least one element, otherwise ['\n'] so the
    caller always receives a non-empty list.
  """
  # A missing field (None) is treated like an empty list instead of letting
  # len(None) raise TypeError.
  if lst is None or len(lst) == 0:
    return ['\n']
  return lst

def processed_des(des):
  r"""Wrap a description in a single-element list, substituting a placeholder when empty.

  Args:
    des: Description value; may be empty or None (e.g. a JSON null).

  Returns:
    [des] when the description is non-empty, otherwise ['\n'].
  """
  # A missing description (None) is handled like an empty one instead of
  # letting len(None) raise TypeError.
  if des is None or len(des) == 0:
    return ['\n']
  return [des]

def get_dataset(path_to_data):
  """Load one split from a JSON file into a `datasets.Dataset`.

  The file must contain a JSON array of objects, each with the keys
  'description', 'tweets', 'label' and 'id'.

  Args:
    path_to_data: Path of the JSON file to read.

  Returns:
    A Dataset with three columns:
      text:  list made of the description entry followed by the tweets
             (each part replaced by a placeholder when empty),
      label: 1 when the record's 'label' equals 'bot', otherwise 0,
      uid:   the record's 'id'.
  """
  # JSON text is UTF-8; do not depend on the platform's default encoding.
  with open(path_to_data, encoding='utf-8') as f:
    records = json.load(f)
  rows = [
      [
          processed_des(record['description']) + processed_tweet_list(record['tweets']),
          1 if record['label'] == 'bot' else 0,
          record['id'],
      ]
      for record in records
  ]
  # Naming the columns in the constructor also works when `rows` is empty,
  # where assigning `df.columns` afterwards would fail on a length mismatch.
  df = pd.DataFrame(rows, columns=["text", "label", "uid"])
  return Dataset.from_pandas(df)

In [None]:
# Locations of the twibot20 train / test / val JSON files on the mounted Drive.
data_args = dict(
    path_to_train='/content/drive/MyDrive/advNLP/twibot20/train.json',
    path_to_test='/content/drive/MyDrive/advNLP/twibot20/test.json',
    path_to_val='/content/drive/MyDrive/advNLP/twibot20/val.json',
)

# Name of the pretrained checkpoint handed to the `from_pretrained` loaders.
checkpoint = 'roberta-base'

In [None]:
from transformers import AutoTokenizer, AutoModel

# Upper bound on tokens per text; passed to the tokenizer together with truncation.
max_length = 128
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Read the three splits (train, then val, then test) from their configured paths.
train_data, val_data, test_data = [
    get_dataset(data_args[path_key])
    for path_key in ('path_to_train', 'path_to_val', 'path_to_test')
]

# Group the splits under the names used by the rest of the notebook.
data = DatasetDict({'train': train_data, 'test': test_data, 'valid': val_data})

In [None]:
# Tokenize each example's list of texts on its own (batched=False), so padding
# is applied across the texts of that single example.
data = data.map(
    lambda example: tokenizer(
        example['text'],
        truncation=True,
        padding=True,
        max_length=max_length,
    ),
    batched=False,
)

  0%|          | 0/8278 [00:00<?, ?ex/s]

  0%|          | 0/1183 [00:00<?, ?ex/s]

  0%|          | 0/2365 [00:00<?, ?ex/s]

In [None]:
def process_example(example):
  """Run the model on one tokenized example and return its pooled output.

  Args:
    example: Mapping providing 'input_ids', 'attention_mask', 'label' and 'uid'.

  Returns:
    Dict with 'features' (the model's `pooler_output`, moved to the CPU) and the
    example's 'label' and 'uid' passed through unchanged.
  """
  ids = torch.tensor(example['input_ids']).to(device)
  mask = torch.tensor(example['attention_mask'], dtype=torch.long).to(device)
  # Forward pass only; gradient tracking is switched off.
  with torch.no_grad():
    pooled = model(ids, mask).pooler_output
  features = pooled.cpu()
  return {'features': features, 'label': example['label'], "uid": example["uid"]}

In [None]:
# Apply process_example to every split, one example per call (batched=False).
data = data.map(
    function=process_example,
    batched=False,
)

  0%|          | 0/8278 [00:00<?, ?ex/s]

  0%|          | 0/1183 [00:00<?, ?ex/s]

  0%|          | 0/2365 [00:00<?, ?ex/s]

In [None]:
# Write each split to its own JSON file next to the source data on Drive.
for split, dataset in data.items():
    output_path = f'/content/drive/MyDrive/advNLP/twibot20/{split}_roberta_emb.json'
    dataset.to_json(output_path)