#### 1. Import libraries

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git@main 
!pip install evaluate

In [None]:
!pip install transformers==4.6.0

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification 
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

from datasets import load_dataset
import evaluate

from tqdm.notebook import tqdm

#### 2. Upload data and model

In [4]:
model_name_or_path = "ai-forever/rugpt3small_based_on_gpt2"

batch_size = 8
num_epochs = 3

torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

device: cuda


In [86]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # padding token

model = GPT2ForSequenceClassification.from_pretrained(model_name_or_path, num_labels = 2)
model.config.pad_token_id = model.config.eos_token_id

Some weights of the model checkpoint at ai-forever/rugpt3small_based_on_gpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ai-forever/rugpt3small_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [127]:
path_data = '/content/drive/MyDrive/ds/exps/'

dataset = load_dataset("csv", data_files={'train': path_data + 'data/balanced/train1000.csv', 
                                          'test': path_data + 'data/balanced/test100.csv',
                                          'val': path_data + 'data/balanced/val100.csv'})



  0%|          | 0/3 [00:00<?, ?it/s]

In [141]:
def lowercase_condition(example):
    return {"text": example["text"].lower()}

def tokenize_function(examples):
    outputs = tokenizer(examples["text"], truncation  = True, max_length = 256)
    return outputs

dataset = dataset.map(lowercase_condition)

tok_dataset = dataset.map(
    tokenize_function,
    remove_columns=["Unnamed: 0", 'text', 'Unnamed: 0.1']
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [170]:
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors="pt", padding = True)

torch.manual_seed(42)
    
train_dataloader = DataLoader(tok_dataset["train"], shuffle = True, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(tok_dataset["val"], collate_fn=collate_fn, batch_size=batch_size)

#### 3. Train model

In [None]:
print(device)
print('\n')
model.to(device)

In [155]:
lr = 3e-4

optimizer = AdamW(params=model.parameters(), lr=lr)
metric = evaluate.load('accuracy')
cross_entropy_loss = nn.CrossEntropyLoss()

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.1 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [163]:
train_losses = []
val_losses = []

for epoch in range(num_epochs):

    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):

        batch.to(device)
        outputs = model(**batch)

        loss = cross_entropy_loss(outputs.logits, batch["labels"])
        train_losses.append(loss)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)

        with torch.no_grad():
            outputs = model(**batch)

        loss = cross_entropy_loss(outputs.logits, batch["labels"])
        val_losses.append(loss)

        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

epoch 0: {'accuracy': 0.78}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

epoch 1: {'accuracy': 0.87}


  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

epoch 2: {'accuracy': 0.88}


#### 4. Test evaluation

In [204]:
def mertics_sklearn(y_true, pred):
    scores = [d[0] for d in pred]
    y_predict = list(map(lambda a: 0 if a >= 0.5 else 1, scores))

    acc = round(accuracy_score(y_true, y_predict), 3)
    f1 = round(f1_score(y_true, y_predict), 3)
    rec = round(recall_score(y_true, y_predict), 3)
    prec = round(precision_score(y_true, y_predict), 3)
    rocauc = round(roc_auc_score(y_true, scores), 3)

    return acc, f1, rec, prec, rocauc

def flatten(y_pred):
    y = [t.tolist() for t in y_pred]
    return [item for sublist in y for item in sublist]

In [190]:
torch.manual_seed(42)
test_dataloader = DataLoader(tok_dataset["test"], shuffle = False, collate_fn=collate_fn, batch_size = None)

In [199]:
y_pred = []

for step, batch in enumerate(tqdm(eval_dataloader)):
  batch.to(device)

  with torch.no_grad():
    outputs = model(**batch)

  predictions = outputs.logits.argmax(dim=-1)
  y_pred.append(outputs.logits)

  0%|          | 0/13 [00:00<?, ?it/s]

In [205]:
acc, f1, rec, prec, rocauc = mertics_sklearn(tok_dataset["test"]['labels'], flatten(y_pred))

In [206]:
acc, f1, rec, prec, rocauc

(0.56, 0.56, 0.56, 0.56, 0.399)

In [181]:
len(tok_dataset["test"]['labels'])

100

In [193]:
y = [t.tolist() for t in y_pred]


In [203]:
flatten(y_pred)

[[3.166597843170166, -2.9838109016418457],
 [-1.9773567914962769, 1.23115873336792],
 [-2.7876968383789062, 2.5185508728027344],
 [3.1653523445129395, -2.8775217533111572],
 [-0.39432293176651, -0.16396695375442505],
 [2.0057809352874756, -1.8706402778625488],
 [-3.693594217300415, 2.836235761642456],
 [-2.267045497894287, 1.8904924392700195],
 [2.2436532974243164, -2.119516611099243],
 [-4.365286827087402, 3.17181658744812],
 [2.213921546936035, -2.0652408599853516],
 [1.886372685432434, -2.0700759887695312],
 [2.9557442665100098, -2.788588523864746],
 [-3.8489115238189697, 3.0875673294067383],
 [-3.2017555236816406, 2.2204692363739014],
 [2.810243844985962, -2.8500473499298096],
 [3.1690196990966797, -3.0951409339904785],
 [-2.495817184448242, 2.0733232498168945],
 [-2.7562155723571777, 2.668018102645874],
 [3.4171786308288574, -3.226719856262207],
 [3.047377586364746, -2.9714159965515137],
 [-0.7775270342826843, 0.2225363850593567],
 [0.9994964003562927, -1.2274689674377441],
 [2.88

In [195]:
y

[[0, 1, 1, 0, 1, 0, 1, 1],
 [0, 1, 0, 0, 0, 1, 1, 0],
 [0, 1, 1, 0, 0, 1, 0, 0],
 [1, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 1, 0, 0, 0],
 [1, 1, 0, 0, 1, 1, 1, 0],
 [0, 0, 0, 0, 1, 1, 0, 0],
 [0, 0, 0, 1, 0, 0, 1, 0],
 [0, 0, 1, 1, 0, 1, 1, 1],
 [1, 1, 1, 0, 0, 1, 0, 0],
 [0, 1, 1, 1, 0, 1, 1, 1],
 [1, 0, 1, 1, 1, 0, 0, 0],
 [1, 0, 1, 1]]

In [207]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)