#### 1. Import libraries

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git@main 
!pip install evaluate

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, WeightedRandomSampler

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification 
from transformers import get_cosine_schedule_with_warmup

import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

from datasets import load_dataset
import evaluate

from tqdm.notebook import tqdm

#### 2. Upload data and model

In [2]:
model_name_or_path = "ai-forever/rugpt3medium_based_on_gpt2"

batch_size = 8
num_epochs = 3

torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

device: cuda


In [3]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # padding token

model = GPT2ForSequenceClassification.from_pretrained(model_name_or_path, num_labels = 2)
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer)) # for medium, large

Some weights of the model checkpoint at ai-forever/rugpt3medium_based_on_gpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ai-forever/rugpt3medium_based_on_gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50258, 1024)

In [4]:
path_data = '/content/drive/MyDrive/ds/exps/LLM/'


dataset = load_dataset("csv", data_files={'train': path_data + 'data/train2000.csv', 
                                           'test': path_data + 'data/test500.csv',
                                           'val': path_data + 'data/val500.csv'})

# dataset = load_dataset("csv", data_files={'train': path_data + 'data/balanced/train1000.csv', 
#                                           'test': path_data + 'data/balanced/test100.csv',
#                                           'val': path_data + 'data/balanced/val100.csv'})




  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def lowercase_condition(example):
    return {"text": example["text"].lower()}

def tokenize_function(examples):
    outputs = tokenizer(examples["text"], truncation  = True, max_length = 256)
    return outputs

dataset = dataset.map(lowercase_condition)

tok_dataset = dataset.map(
    tokenize_function,
    remove_columns=["Unnamed: 0", 'text', 'Unnamed: 0.1']
    #remove_columns=["Unnamed: 0", 'text']
)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [6]:
from torch.utils.data import WeightedRandomSampler

In [7]:
# calculating class weight
class_weights = torch.Tensor([1 - tok_dataset["train"]['labels'].count(0) / len(tok_dataset["train"]['labels']), \
1 - tok_dataset["train"]['labels'].count(1) / len(tok_dataset["train"]['labels'])]).to(device)

example_weights = [class_weights[e] for e in tok_dataset["train"]['labels']]
sampler = WeightedRandomSampler(example_weights, len(tok_dataset["train"]['labels']))

In [8]:
def collate_fn(examples):
    return tokenizer.pad(examples, return_tensors="pt", padding = True)

torch.manual_seed(42)
    
#train_dataloader = DataLoader(tok_dataset["train"], shuffle = True, collate_fn=collate_fn, batch_size=batch_size)
train_dataloader = DataLoader(tok_dataset["train"], sampler = sampler, collate_fn=collate_fn, batch_size=batch_size)
eval_dataloader = DataLoader(tok_dataset["val"], collate_fn=collate_fn, batch_size=batch_size)

#### 3. Train model

In [9]:
def metrics_sklearn(y_true, pred):

    pred = torch.cat(pred).cpu()

    softmax = nn.Softmax(dim=1)
    pred = softmax(pred)
    pred.tolist()
    y_predict = pred.argmax(dim=-1)

    acc = round(accuracy_score(y_true, y_predict), 3)
    f1 = round(f1_score(y_true, y_predict), 3)
    rec = round(recall_score(y_true, y_predict), 3)
    prec = round(precision_score(y_true, y_predict), 3)
    rocauc = round(roc_auc_score(y_true, pred[:,1]), 3)

    return acc, f1, rec, prec, rocauc, y_predict

In [10]:
print(device)
print('\n')
model.to(device)

cuda




GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=1024, out_features=2, bias=False)
)

In [11]:
lr = 3e-4

optimizer = AdamW(params=model.parameters(), lr=lr)
metric = evaluate.load('accuracy')
#cross_entropy_loss = nn.CrossEntropyLoss(weight = class_weights)
cross_entropy_loss = nn.CrossEntropyLoss()

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.1 * (len(train_dataloader) * num_epochs),
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [12]:
train_losses = []
val_losses = []

for epoch in range(num_epochs):

    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):

        batch.to(device)
        outputs = model(**batch)

        loss = cross_entropy_loss(outputs.logits, batch["labels"])
        train_losses.append(loss)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


    model.eval()
    y_pred = []

    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)

        with torch.no_grad():
            outputs = model(**batch)

        loss = cross_entropy_loss(outputs.logits, batch["labels"])
        val_losses.append(loss)

        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )
        y_pred.append(outputs.logits)

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

    acc, f1, rec, prec, rocauc, y_predict = metrics_sklearn(tok_dataset["val"]['labels'], y_pred)
    print(acc, f1, rec, prec, rocauc)

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

epoch 0: {'accuracy': 0.88}
0.88 0.881 0.888 0.874 0.947


  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

epoch 1: {'accuracy': 0.868}
0.868 0.858 0.796 0.93 0.95


  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

epoch 2: {'accuracy': 0.878}
0.878 0.871 0.824 0.924 0.96


#### 4. Test evaluation

In [13]:
torch.manual_seed(42)
test_dataloader = DataLoader(tok_dataset["test"], shuffle = False, collate_fn=collate_fn, batch_size = 8)

In [14]:
y_pred = []

for step, batch in enumerate(tqdm(test_dataloader)):
  batch.to(device)

  with torch.no_grad():
    outputs = model(**batch)

  predictions = outputs.logits.argmax(dim=-1)
  y_pred.append(outputs.logits)

  0%|          | 0/63 [00:00<?, ?it/s]

In [15]:
acc, f1, rec, prec, rocauc, y_predict = metrics_sklearn(tok_dataset["test"]['labels'], y_pred)
param = sum(p.numel() for p in model.parameters())

In [16]:
y_predict.tolist().count(0), y_predict.tolist().count(1)

(272, 228)

In [18]:
comment = 'unb, sampler'
vers = 'medium'
inf = '11c'
ft = '2m50c'

In [19]:
path = 'drive/MyDrive/ds/exps/LLM/'
res = pd.read_csv(path + 'res.csv')
res.loc[len(res)] = ['rugpt', vers, param, comment, num_epochs, inf, ft, acc, f1, rec, prec, rocauc, batch_size, lr]
res

Unnamed: 0,model,version,size,comment,epochs,time_inf,time_ft_1e,accuracy,f1,precision,recall,roc_auc,batch,lr
0,rugpt,small,125233152,"balanced data, 1000tr, 100te",3,1c,25c,0.83,0.828,0.82,0.837,0.895,8,0.0003
1,rugpt,medium,355874816,"balanced data, embed layer + 1",3,3c,1.20m,0.87,0.871,0.88,0.863,0.937,8,0.0003
2,rugpt,large,760304640,out of memory,3,,,,,,,,8,0.0003
3,rugpt,small,125228544,"unbalanced data, 2000tr, 500te, cross-entropy ...",3,2c,53c,0.872,0.864,0.816,0.919,0.94,8,0.0003
4,rugpt,medium,355874816,unbalanced data,3,10c,2m47c,0.88,0.877,0.852,0.903,0.957,8,0.0003
5,rugpt,small,125228544,"unbalanced data, CELoss weighted",3,4c,57c,0.882,0.876,0.836,0.921,0.952,8,0.0003
6,rugpt,medium,355874816,"unbalanced data, CELoss weighted",3,11c,2m54c,0.872,0.866,0.828,0.908,0.953,8,0.0003
7,rugpt,small,125228544,"unb, sampler",3,3c,54c,0.858,0.85,0.804,0.901,0.95,8,0.0003
8,rugpt,medium,355874816,"unb, sampler",3,11c,2m50c,0.884,0.879,0.84,0.921,0.952,8,0.0003


In [20]:
res.to_csv(path + 'res.csv',  index = False)

In [21]:
res.sort_values(by ='version')

Unnamed: 0,model,version,size,comment,epochs,time_inf,time_ft_1e,accuracy,f1,precision,recall,roc_auc,batch,lr
2,rugpt,large,760304640,out of memory,3,,,,,,,,8,0.0003
1,rugpt,medium,355874816,"balanced data, embed layer + 1",3,3c,1.20m,0.87,0.871,0.88,0.863,0.937,8,0.0003
4,rugpt,medium,355874816,unbalanced data,3,10c,2m47c,0.88,0.877,0.852,0.903,0.957,8,0.0003
6,rugpt,medium,355874816,"unbalanced data, CELoss weighted",3,11c,2m54c,0.872,0.866,0.828,0.908,0.953,8,0.0003
8,rugpt,medium,355874816,"unb, sampler",3,11c,2m50c,0.884,0.879,0.84,0.921,0.952,8,0.0003
0,rugpt,small,125233152,"balanced data, 1000tr, 100te",3,1c,25c,0.83,0.828,0.82,0.837,0.895,8,0.0003
3,rugpt,small,125228544,"unbalanced data, 2000tr, 500te, cross-entropy ...",3,2c,53c,0.872,0.864,0.816,0.919,0.94,8,0.0003
5,rugpt,small,125228544,"unbalanced data, CELoss weighted",3,4c,57c,0.882,0.876,0.836,0.921,0.952,8,0.0003
7,rugpt,small,125228544,"unb, sampler",3,3c,54c,0.858,0.85,0.804,0.901,0.95,8,0.0003


#### 5. Conclusion

- Для Largw modekl не хватает ресурсов. 
- Small модель показала наилучший ррезультат для weighted CELoss.
- Medium  - с использованием sampler.

#### 6. Extra calculations

In [29]:
# how batched are built with sampler

arr_batch = []
for step, batch in enumerate(train_dataloader):
  arr_batch += batch['labels'].tolist()  

arr_batch.count(0), arr_batch.count(1)

(971, 1029)