In [None]:
!pip install transformers

In [None]:
!pip install pyparsing

In [3]:
import torch
import argparse
import torch.nn as nn
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import cv2
#import tensorflow as tf
import json
import os
from transformers import BertModel, BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None) # 

In [4]:
# seed = 0
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')
# !unzip '/content/drive/MyDrive/CL CUp Data/Data.zip'

In [14]:
# # # ..  Датасет  .. # # #
def get_data_from_json(path: str) -> pd.DataFrame:
    with open(path, "r") as json_file:
        json_lines = (line for line in json_file)
        rows = (
            {
                "text": data["text"],
                "comment": comment["text"],
                "score": comment["score"]
            }
            for line in json_lines
            for data in [json.loads(line)]
            for comment in data["comments"]
        )
        return pd.DataFrame(rows, columns=["text", "comment", "score"])

path_curr = '/content'
path_to_train = ''.join([path_curr, '/ranking_train.jsonl'])
path_to_test = ''.join([path_curr, '/ranking_test.jsonl'])
testtrue = get_data_from_json(path_to_test)
traintrue = get_data_from_json(path_to_train)

train, temp = train_test_split(traintrue, train_size=0.04, test_size=0.96,random_state=0)
temp0, test = train_test_split(temp, train_size=0.9875, test_size=0.0125,random_state=0)


                            

In [16]:
# # # ..  Bert  .. # # #

# # .. Токенизация .. # #
model_name = 'bert-base-uncased'
torch.backends.cudnn.enabled = False
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 5)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Преобразование текста в Bert-cовместимых токены и добавление
# специальных [CLS] и [SEP] токенов, а также создание макси внимания,
# т.е. какие токены игнорировать
def MyTokenizerFunc(sample):
    input_ids = []
    attention_masks = []
    for text in sample['comment']:
        encoded_dict = tokenizer.encode_plus(text,
                                             add_special_tokens=True,
                                             max_length=128,
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    #Объединяем токены и маски внимания в единый список, создаем лист меток
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(sample['score'].values)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# Params and Data #
epochs = 10
batch_size = 100
lr = 2e-5

In [18]:


train_dataset = MyTokenizerFunc(train)
dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# torch.backends.cudnn.deterministic = True

In [None]:
# # .. Train .. # #
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(epochs):
    print('Train epoch is: ', epoch)
    count = 0
    for batch in dataloader:
        count += 1
        print('batch no. ', count)
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()


In [25]:
# # .. Test .. # #
test_dataset = MyTokenizerFunc(test)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)


In [28]:

model.eval()
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0

for batch in test_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

    logits = outputs[1].detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()
    predicted_classes = np.argmax(logits, axis=1)

    test_accuracy += accuracy_score(predicted_classes, label_ids)
    nb_test_examples += len(label_ids)
    nb_test_steps += 1

print("Test Accuracy: {}".format(test_accuracy/nb_test_steps))

Test Accuracy: 0.2539405768813707
