In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

In [3]:
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

In [4]:
!python -V

Python 3.9.12


In [5]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [6]:
print(embed_bert_cls('привет мир', model, tokenizer).shape)

(312,)


In [7]:
tz = embed_bert_cls('привет мир', model, tokenizer)

In [8]:
df = pd.read_csv("/Users/vyacheslav/Documents/projects/rubert-tiny/notebooks/yes_no_corpus.v2.txt", sep="\t", names=['YESNO', 'DATA'])

In [9]:
df['TARGET'] = df['YESNO'].map({"NO": 0, "YES":1})

In [10]:
df['VEC'] = df['DATA'].apply(lambda  x : embed_bert_cls(x, model, tokenizer)) 

In [11]:
df.head(1)

Unnamed: 0,YESNO,DATA,TARGET,VEC
0,NO,а нет,0,"[0.0030167643, 0.031472478, -0.01533914, -0.07..."


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 763 entries, 0 to 762
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   YESNO   763 non-null    object
 1   DATA    763 non-null    object
 2   TARGET  763 non-null    int64 
 3   VEC     763 non-null    object
dtypes: int64(1), object(3)
memory usage: 24.0+ KB


In [13]:

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [14]:
class YesNoModel(nn.Module):
    def set_config(self, config):
        self.config = config

    def predict_group(self, group, txt):
        for t in self.config.get(group, []):
            if t in txt:
                return 1
            if txt in t:
                return 1
        return 0

    def __init__(self):
        super().__init__()
        # 2. Create 2 nn.Linear layers capable of handling X and y input and output shapes
        self.layer_1 = nn.Linear(in_features=312, out_features=5)  # takes in 2 features (X), produces 5 features
        self.layer_2 = nn.Linear(in_features=5, out_features=1)  # takes in 5 features, produces 1 feature (y)

    # 3. Define a forward method containing the forward pass computation
    def forward(self, x):
        # Return the output of layer_2, a single feature, the same shape as y
        return self.layer_2(
            self.layer_1(
                x))  # computation goes through layer_1 first then the output of layer_1 goes through layer_2

# 4. Create an instance of the model and send it to target device
model_0 = YesNoModel().to(device)
model_0

YesNoModel(
  (layer_1): Linear(in_features=312, out_features=5, bias=True)
  (layer_2): Linear(in_features=5, out_features=1, bias=True)
)

In [15]:
loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss = sigmoid built-in

# Create an optimizer
optimizer = torch.optim.SGD(params=model_0.parameters(), 
                            lr=0.1)

In [16]:
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

In [17]:
type(df['VEC'].values[0])

numpy.ndarray

In [18]:


X_train = torch.from_numpy(np.stack(df['VEC'].values)).type(torch.float)
y_train= torch.from_numpy(np.stack(df['TARGET'].values)).type(torch.float)
X_test = X_train
y_test = y_train
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

In [19]:
# Build training and evaluation loop
torch.manual_seed(42)
epochs=1000
for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass (model outputs raw logits)
    y_logits = model_0(X_train).squeeze() # squeeze to remove extra `1` dimensions, this won't work unless model and data are on same device 
    y_pred = torch.round(torch.sigmoid(y_logits)) # turn logits -> pred probs -> pred labls
  
    # 2. Calculate loss/accuracy
    # loss = loss_fn(torch.sigmoid(y_logits), # Using nn.BCELoss you need torch.sigmoid()
    #                y_train) 
    loss = loss_fn(y_logits, # Using nn.BCEWithLogitsLoss works with raw logits
                   y_train) 
    acc = accuracy_fn(y_true=y_train, 
                      y_pred=y_pred) 

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test)
        test_acc = accuracy_fn(y_true=y_test,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69635, Accuracy: 36.30% | Test loss: 0.69341, Test acc: 48.49%
Epoch: 10 | Loss: 0.67358, Accuracy: 63.30% | Test loss: 0.67194, Test acc: 63.30%
Epoch: 20 | Loss: 0.66021, Accuracy: 63.30% | Test loss: 0.65917, Test acc: 63.30%
Epoch: 30 | Loss: 0.65123, Accuracy: 63.30% | Test loss: 0.65046, Test acc: 63.30%
Epoch: 40 | Loss: 0.64416, Accuracy: 63.30% | Test loss: 0.64350, Test acc: 63.30%
Epoch: 50 | Loss: 0.63764, Accuracy: 63.30% | Test loss: 0.63699, Test acc: 63.30%
Epoch: 60 | Loss: 0.63089, Accuracy: 63.30% | Test loss: 0.63018, Test acc: 63.30%
Epoch: 70 | Loss: 0.62340, Accuracy: 63.30% | Test loss: 0.62259, Test acc: 63.30%
Epoch: 80 | Loss: 0.61478, Accuracy: 63.30% | Test loss: 0.61384, Test acc: 63.30%
Epoch: 90 | Loss: 0.60472, Accuracy: 63.30% | Test loss: 0.60363, Test acc: 63.30%
Epoch: 100 | Loss: 0.59295, Accuracy: 63.30% | Test loss: 0.59167, Test acc: 63.30%
Epoch: 110 | Loss: 0.57924, Accuracy: 63.30% | Test loss: 0.57775, Test acc: 63.30%
Epo

In [20]:
new_df = pd.DataFrame()

In [21]:
new_df['DATA'] = ["думаю что нет"]

In [22]:
new_df.head()

Unnamed: 0,DATA
0,думаю что нет


In [23]:
new_df['VEC'] = new_df['DATA'].apply(lambda  x : embed_bert_cls(x, model, tokenizer)) 


In [24]:
X_train2 = torch.from_numpy(np.stack(new_df['VEC'].values)).type(torch.float)


In [25]:
test_logits = model_0(X_train2).squeeze() 
torch.round(torch.sigmoid(test_logits))

tensor(0., grad_fn=<RoundBackward0>)

In [26]:
type(model_0)

__main__.YesNoModel

In [27]:
def make_predict(text, model_0):
    new_df = pd.DataFrame()
    new_df['DATA'] = [text]
    new_df['VEC'] = new_df['DATA'].apply(lambda  x : embed_bert_cls(x, model, tokenizer)) 
    X_train2 = torch.from_numpy(np.stack(new_df['VEC'].values)).type(torch.float)
    test_logits = model_0(X_train2).squeeze() 
    return torch.round(torch.sigmoid(test_logits))
    

In [28]:
t1 = make_predict("думаю что да", model_0)

In [29]:
int(t1.tolist())

1

In [30]:
import pickle

In [31]:

file = open('model_0.picle', 'wb')

# dump information to that file
pickle.dump(model_0, file)

# close the file
file.close()

In [32]:
file = open('tokenizer.picle', 'wb')

# dump information to that file
pickle.dump(tokenizer, file)

# close the file
file.close()

In [33]:
file = open('model_rubertyni.picle', 'wb')

# dump information to that file
pickle.dump(model, file)

# close the file
file.close()

In [34]:
torch.save(model_0.state_dict(), "model_0.torch")