In [1]:
!pip install transformers
from torchvision import datasets, transforms
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, BertConfig
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import ndcg_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1
cpu


In [2]:
train = pd.read_json('/content/drive/MyDrive/CUP_IT/ranking_train.jsonl', lines=True)

In [3]:
train = train.explode(column='comments')
train['comment'] = train.comments.map(lambda dic: dic['text'])
train['score'] = train.comments.map(lambda dic: dic['score'])
train.drop(labels=['comments'], axis=1, inplace=True)
train['full_text'] = train.text + ' |||| ' + train.comment
train.head()

Unnamed: 0,text,comment,score,full_text
0,How many summer Y Combinator fundees decided n...,Going back to school is not identical with giv...,0,How many summer Y Combinator fundees decided n...
0,How many summer Y Combinator fundees decided n...,There will invariably be those who don't see t...,1,How many summer Y Combinator fundees decided n...
0,How many summer Y Combinator fundees decided n...,For me school is a way to be connected to what...,2,How many summer Y Combinator fundees decided n...
0,How many summer Y Combinator fundees decided n...,I guess it really depends on how hungry you ar...,3,How many summer Y Combinator fundees decided n...
0,How many summer Y Combinator fundees decided n...,I know pollground decided to go back to school...,4,How many summer Y Combinator fundees decided n...


In [4]:
features = torch.load('/content/drive/MyDrive/CUP_IT/NEW_BERT_train_tensors/440535.pt')
len(features)

440535

In [5]:
target = torch.tensor(train['score'].astype(np.int32).values).type(torch.LongTensor)

In [6]:
b = torch.Tensor(len(features), 1, 128)
torch.cat(features, out=b)
train = torch.utils.data.TensorDataset(b, target)

  torch.cat(features, out=b)


In [7]:
# выделим из тренировочного датасета 20% данных для валидации
valid_ratio = 0.2
num_valid =  int(valid_ratio * len(train))
num_train = len(train) - num_valid

train_dataset, valid_dataset = random_split(dataset=train, lengths=[num_train, num_valid])

In [8]:
batch_size = 100
train_loader = DataLoader(train_dataset, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

In [9]:
def train_func(model, loader, loss_function, optimizer, scheduler):
    model.train()

    epoch_loss = 0

    for inputs in loader:
        inputs, targets = inputs[0].float().to(DEVICE), inputs[1].to(DEVICE)
        outputs = model(inputs)

        loss = loss_function(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    scheduler.step(epoch_loss)
    return epoch_loss / len(loader)

In [10]:
def test(model, loader, loss_function):
    with torch.no_grad():
        model.eval()
        N = 0
        total_loss = 0.0
        correct = 0.0
        for i, inputs in enumerate(loader):
            inputs, targets = inputs[0].float().to(DEVICE), inputs[1].to(DEVICE)
            outputs = model(inputs)
            N += inputs.shape[0]
            loss = loss_function(outputs, targets)
            total_loss += inputs.shape[0] * loss.item()
            predicted_targets = outputs.argmax(dim=1)
            correct += (predicted_targets == targets).sum().item()
        return total_loss / N, correct / N

In [11]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.dropout = nn.Dropout()
        self.relu = nn.ReLU()
        self.norm = nn.BatchNorm1d(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.fc3 = nn.LogSoftmax()
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.norm(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out

In [12]:
input_size = 128
hidden_size = 64
num_classes = 5
#model = BertModel.from_pretrained("bert-base-uncased", num_labels = 5, output_attentions = False, output_hidden_states = False).to(DEVICE)

model = NeuralNet(input_size, hidden_size, num_classes).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=.01, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

In [13]:
num_epochs = 100
train_losses = []
val_losses = []
for epoch in range(num_epochs):
    train_loss = train_func(model, loader=train_loader, loss_function=criterion, optimizer=optimizer, scheduler=scheduler)
    val_loss, val_acc = test(model=model, loader=valid_loader, loss_function=criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f'Epoch {epoch}')
    print(f'Train Loss {train_loss:.6f} Val Loss {val_loss:.6f} Val Accuracy {val_acc:.6f}')
    print('---')

#test_loss, test_acc = test(model=model, loader=test_loader, loss_function=criterion)
#print(f'Test Loss {test_loss:.6f} Test Accuracy {test_acc:.6f}')

  out = self.fc3(out)


Epoch 0
Train Loss 1.578087 Val Loss 1.566165 Val Accuracy 0.283576
---
Epoch 1
Train Loss 1.579450 Val Loss 1.569992 Val Accuracy 0.281578
---
Epoch 2
Train Loss 1.579994 Val Loss 1.570094 Val Accuracy 0.281737
---
Epoch 3
Train Loss 1.580415 Val Loss 1.567750 Val Accuracy 0.282112
---
Epoch 4
Train Loss 1.580070 Val Loss 1.573121 Val Accuracy 0.279910
---
Epoch 5
Train Loss 1.580541 Val Loss 1.570481 Val Accuracy 0.281487
---
Epoch 6
Train Loss 1.579837 Val Loss 1.574265 Val Accuracy 0.276278
---
Epoch 7
Train Loss 1.579937 Val Loss 1.572906 Val Accuracy 0.279649
---
Epoch 8
Train Loss 1.580500 Val Loss 1.572729 Val Accuracy 0.281442
---
Epoch 9
Train Loss 1.580640 Val Loss 1.570262 Val Accuracy 0.281975
---
Epoch 10
Train Loss 1.580332 Val Loss 1.571088 Val Accuracy 0.281487
---
Epoch 11
Train Loss 1.581025 Val Loss 1.567444 Val Accuracy 0.282066
---
Epoch 12
Train Loss 1.570645 Val Loss 1.564916 Val Accuracy 0.284699
---
Epoch 13
Train Loss 1.569619 Val Loss 1.563933 Val Accuracy 0

In [14]:
for i in train_loader:
  outputs = model(i[0])
        # the class with the highest energy is what we choose as prediction
  _, predicted = torch.max(outputs.data, 1)
  print(i[1])
  print(round(ndcg_score([np.asarray(i[1])], [np.asarray(predicted)]), 2))
  break

tensor([2, 2, 0, 3, 4, 0, 1, 1, 1, 4, 4, 3, 1, 3, 3, 0, 0, 1, 3, 2, 3, 2, 1, 4,
        1, 1, 2, 1, 4, 4, 1, 1, 3, 3, 4, 4, 2, 4, 0, 0, 0, 2, 3, 2, 0, 3, 3, 0,
        2, 1, 2, 3, 0, 0, 4, 1, 0, 3, 0, 4, 1, 3, 1, 1, 4, 0, 4, 4, 0, 1, 1, 3,
        4, 2, 1, 1, 1, 1, 4, 2, 1, 3, 2, 4, 1, 3, 4, 2, 2, 1, 1, 3, 0, 1, 3, 2,
        0, 3, 4, 4])
0.86


  out = self.fc3(out)


In [26]:
test_embeddings = torch.load('/content/drive/MyDrive/CUP_IT/NEW_BERT_test_tensors/10000.pt')

test_tensor = torch.Tensor(len(test_embeddings), 1, 128)
torch.cat(test_embeddings, out=test_tensor)

  torch.cat(test_embeddings, out=test_tensor)


tensor([[  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        [  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        [  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        ...,
        [  101.,  1045.,  8046.,  ...,     0.,     0.,     0.],
        [  101.,  1045.,  8046.,  ...,     0.,     0.,     0.],
        [  101.,  1045.,  8046.,  ...,  2018.,  2227.,   102.]])

In [34]:
test_paths = ['/content/drive/MyDrive/CUP_IT/NEW_BERT_test_tensors/70000.pt', '/content/drive/MyDrive/CUP_IT/NEW_BERT_test_tensors/70020.pt']

In [35]:
test_all = []

for path in test_paths:
  temp = torch.load(path)
  for i in temp:
    test_all.append(i)

In [37]:
test_tensor = torch.Tensor(len(test_all), 1, 128)
torch.cat(test_all, out=test_tensor)

  torch.cat(test_all, out=test_tensor)


tensor([[  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        [  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        [  101., 16380.,  1022.,  ...,     0.,     0.,     0.],
        ...,
        [  101.,  2339.,  2515.,  ...,     0.,     0.,     0.],
        [  101.,  2339.,  2515.,  ...,     0.,     0.,     0.],
        [  101.,  2339.,  2515.,  ...,     0.,     0.,     0.]])

In [39]:
outputs = model(test_tensor)

_, predicted = torch.max(outputs, 1)

  out = self.fc3(out)


In [40]:
predicted

tensor([4, 0, 4,  ..., 4, 0, 4])

In [44]:
import pandas as pd
filepath_to_json = '/content/drive/MyDrive/CUP_IT/ranking_test.jsonl'
test = pd.read_json(filepath_to_json, lines=True)

In [45]:
test = test.explode(column='comments')
test['comment'] = test.comments.map(lambda dic: dic['text'])
test['score'] = test.comments.map(lambda dic: dic['score'])
test.shape

(70020, 4)

In [67]:
test['score'] = predicted
ungrouped_table = pd.DataFrame(test.groupby(test.index))
ungrouped_table[1] = ungrouped_table[1].apply(lambda ser: {
        'text': ser['text'].unique()[0],
        'comments': [
                {'text': comment, 'score': score}
                for comment, score in zip(ser['comment'], ser['score'])
        ]
    } 
)
grouped_table = pd.DataFrame(ungrouped_table[1].tolist())
with open('/content/drive/MyDrive/CUP_IT/output_grouped.jsonl', "w", encoding='utf-8') as f:
    f.write(grouped_table.to_json(orient='records', lines=True))

In [68]:
grouped_table

Unnamed: 0,text,comments
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",[{'text': 'I&#x27;m still waiting for them to ...
1,Ask HN: How do US HNers get their health insur...,[{'text': 'Get it from your employer. It&#x27;...
2,San Diego Researcher Crowdfunding Patent-Free ...,[{'text': 'What I don&#x27;t understand is why...
3,Rethinking the origins of the universe,[{'text': 'I&#x27;m not a physicist. I imagin...
4,SlackTextViewController: A new growing text in...,[{'text': 'As someone that doesn&#x27;t do iOS...
...,...,...
13999,The cat's miaow,"[{'text': 'Meanwhile in the US, Stubbs has bee..."
14000,Facebook’s Piracy Problem,[{'text': 'A radical idea: Maybe our model of ...
14001,Go GC: Solving the Latency Problem in Go 1.5,[{'text': 'Was the presentation more in-depth ...
14002,Understanding Neural Networks Through Deep Vis...,[{'text': 'Ok now I want to &quot;hear&quot; o...
