In [1]:
# mean pooling попробовать среднее арифметическое для всех векторов слов
# CLS токен
# 2 линейных слоя с ReLu по центру

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

In [3]:
class FurnitureDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]

        inputs = self.tokenizer(text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors='pt')
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'isFurniture': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)


In [84]:
# Base bert + linear head

class BertLinear(nn.Module):
    def __init__(self, bert_model, class_count=2, freez_bert=True):
        super().__init__()
        self.bert = bert_model
        self.bert.requires_grad_(not freez_bert)

        # self.head = nn.Linear(312, 2)
        self.head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(256, 2)
        )
        # self.head = nn.Sequential(
        #     nn.Dropout(0.1),
        #     nn.Linear(312, 2)
        # )

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask)
        cls = output.last_hidden_state[:, 0, :]
        #cls = output.pooler_output
        return self.head(cls)


In [None]:
with torch.no_grad():
    print(base_bert(**t))

In [133]:
df.columns

Index(['Unnamed: 0', 'url_id', 'label', 'isFurniture', 'label_len'], dtype='object')

In [6]:
df = pd.read_csv(r'/content/train_label_final.csv')

X_train, X_test, y_train, y_test = train_test_split(df['label'].to_list(), df['isFurniture'].to_list(), test_size=0.30, random_state=100)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.30, random_state=100)

In [8]:
print(len(X_train), len(X_val), len(X_test))

720 216 93


In [115]:
def test_model(model, test_data_loader, criterium):

    all_labels = []
    all_preds = []
    total_test_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_data_loader):
            isFurniture = batch.pop("isFurniture").to(device)

            logits = model(**{k: v.to(device) for k, v in batch.items()})
            loss = criterium(logits, isFurniture)
            prediction = logits.argmax(dim=-1)

            all_preds.extend(prediction.cpu().numpy())
            all_labels.extend(isFurniture.cpu().numpy())
            total_test_loss += loss.item()

        avg_test_loss = total_test_loss / len(test_data_loader)

        print(f"Final Loss: {avg_test_loss:.4f}")
        report = classification_report(
                all_labels,
                all_preds,
                target_names=['Not Furniture', 'Furniture'],
                digits=4
            )

        print("\nClassification Report:")
        print(report)

In [222]:
def train(model, train_data_loader, val_data_loader, criterium, optimizer, epochs=5, isEval=True):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_pred = 0
        correct_pred = 0

        for batch in tqdm(train_data_loader):
            isFurniture = batch.pop("isFurniture").to(device)

            model.zero_grad()

            logits = model(**{k: v.to(device) for k, v in batch.items()})

            prediction = logits.argmax(dim=-1)
            total_pred += len(prediction)
            correct_pred += (prediction == isFurniture).sum().item()

            loss = criterium(logits, isFurniture)
            total_loss += loss.item()

            loss.backward()

            optimizer.step()

        avg_train_loss = total_loss / len(train_data_loader)
        print(f"Epoch {epoch+1} | Average Train Loss: {avg_train_loss:.4f}")
        accuracy = correct_pred / total_pred
        print(f"Train Accuracy: {accuracy:.2%}")

        if not isEval:
            continue

        model.eval()

        all_labels = []
        all_preds = []

        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_data_loader):
                isFurniture = batch.pop("isFurniture").to(device)

                logits = model(**{k: v.to(device) for k, v in batch.items()})
                loss = criterium(logits, isFurniture)
                prediction = logits.argmax(dim=-1)
                total_val_loss += loss.item()

                all_preds.extend(prediction.cpu().numpy())
                all_labels.extend(isFurniture.cpu().numpy())

            avg_val_loss = total_val_loss / len(val_data_loader)

            print(f"\tValidation Loss: {avg_val_loss:.4f}")

            report = classification_report(
                all_labels,
                all_preds,
                target_names=['Not Furniture', 'Furniture'],
                digits=4
            )

            print("\nClassification Report:")
            print(report)


In [224]:

name = 'cointegrated/rubert-tiny'
tokenizer = AutoTokenizer.from_pretrained(name)
base_bert = AutoModel.from_pretrained(name)

text = ["Replace me by any text you'd like.", "aboba"]

t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
# print(t)
# with torch.no_grad():
#     output = model(**t)
# cls_token = output.last_hidden_state[:, 0, :]


Loading weights:   0%|          | 0/55 [00:00<?, ?it/s]

BertModel LOAD REPORT from: cointegrated/rubert-tiny
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
bert.embeddings.position_ids               | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [219]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weight = torch.tensor(df['isFurniture'].value_counts().to_list(), dtype=torch.float).to(device)

my_model = BertLinear(base_bert, freez_bert=False)
my_model.to(device)

optimizer = torch.optim.AdamW(params=my_model.parameters(), lr=1e-5)
#criterium = nn.CrossEntropyLoss(weight=torch.tensor([1/4.5, 1/6]).to(device))
criterium = nn.CrossEntropyLoss(weight=1 - 1/weight)

train_dataset = FurnitureDataset(X_train, y_train, tokenizer)

val_dataset = FurnitureDataset(X_val, y_val, tokenizer)

test_dataset = FurnitureDataset(X_test, y_test, tokenizer)

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

In [220]:
train(my_model, train_data_loader, val_data_loader, criterium, optimizer, epochs=7)

100%|██████████| 23/23 [00:04<00:00,  5.02it/s]


Epoch 1 | Average Train Loss: 0.7054
Train Accuracy: 46.67%


100%|██████████| 7/7 [00:00<00:00, 12.24it/s]


	Validation Loss: 0.6576

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.7277    0.9329    0.8176       149
    Furniture     0.6000    0.2239    0.3261        67

     accuracy                         0.7130       216
    macro avg     0.6639    0.5784    0.5719       216
 weighted avg     0.6881    0.7130    0.6652       216



100%|██████████| 23/23 [00:04<00:00,  5.09it/s]


Epoch 2 | Average Train Loss: 0.6427
Train Accuracy: 67.50%


100%|██████████| 7/7 [00:00<00:00, 13.69it/s]


	Validation Loss: 0.5956

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.7129    1.0000    0.8324       149
    Furniture     1.0000    0.1045    0.1892        67

     accuracy                         0.7222       216
    macro avg     0.8565    0.5522    0.5108       216
 weighted avg     0.8020    0.7222    0.6329       216



100%|██████████| 23/23 [00:04<00:00,  5.09it/s]


Epoch 3 | Average Train Loss: 0.5906
Train Accuracy: 69.31%


100%|██████████| 7/7 [00:00<00:00, 13.48it/s]


	Validation Loss: 0.5422

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.7268    1.0000    0.8418       149
    Furniture     1.0000    0.1642    0.2821        67

     accuracy                         0.7407       216
    macro avg     0.8634    0.5821    0.5619       216
 weighted avg     0.8116    0.7407    0.6682       216



100%|██████████| 23/23 [00:04<00:00,  4.98it/s]


Epoch 4 | Average Train Loss: 0.5500
Train Accuracy: 73.06%


100%|██████████| 7/7 [00:00<00:00, 13.20it/s]


	Validation Loss: 0.4920

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.7708    0.9933    0.8680       149
    Furniture     0.9583    0.3433    0.5055        67

     accuracy                         0.7917       216
    macro avg     0.8646    0.6683    0.6868       216
 weighted avg     0.8290    0.7917    0.7556       216



100%|██████████| 23/23 [00:04<00:00,  4.91it/s]


Epoch 5 | Average Train Loss: 0.5081
Train Accuracy: 77.08%


100%|██████████| 7/7 [00:00<00:00, 13.41it/s]


	Validation Loss: 0.4388

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.8305    0.9866    0.9018       149
    Furniture     0.9487    0.5522    0.6981        67

     accuracy                         0.8519       216
    macro avg     0.8896    0.7694    0.8000       216
 weighted avg     0.8672    0.8519    0.8386       216



100%|██████████| 23/23 [00:04<00:00,  4.91it/s]


Epoch 6 | Average Train Loss: 0.4589
Train Accuracy: 83.33%


100%|██████████| 7/7 [00:00<00:00, 13.46it/s]


	Validation Loss: 0.3823

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.8951    0.9732    0.9325       149
    Furniture     0.9259    0.7463    0.8264        67

     accuracy                         0.9028       216
    macro avg     0.9105    0.8597    0.8795       216
 weighted avg     0.9046    0.9028    0.8996       216



100%|██████████| 23/23 [00:04<00:00,  5.02it/s]


Epoch 7 | Average Train Loss: 0.3946
Train Accuracy: 87.50%


100%|██████████| 7/7 [00:00<00:00, 13.66it/s]

	Validation Loss: 0.3198

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9236    0.9732    0.9477       149
    Furniture     0.9322    0.8209    0.8730        67

     accuracy                         0.9259       216
    macro avg     0.9279    0.8970    0.9104       216
 weighted avg     0.9262    0.9259    0.9245       216






In [21]:
#model with 2 linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=2e-5; bert grad freez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 12/12 [00:00<00:00, 37.46it/s]

Final Loss: 0.2742
Final Accuracy: 83.87%





In [17]:
#model with 1 linear layer; 5 epoch; lr=1e-3; bert grad freez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 12/12 [00:00<00:00, 38.18it/s]

Final Loss: 0.2987
Final Accuracy: 86.02%





In [25]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=1e-3; bert grad freez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 12/12 [00:00<00:00, 39.09it/s]

Final Loss: 0.2785
Final Accuracy: 90.32%





In [28]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 8 epoch; lr=1e-3; bert grad freez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 12/12 [00:00<00:00, 37.11it/s]

Final Loss: 0.2406
Final Accuracy: 86.02%





In [35]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=1e-3; bert grad unfreez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 11.06it/s]

Final Loss: 0.4599
Final Accuracy: 88.17%





In [61]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=2e-5; bert grad unfreez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 10.95it/s]

Final Loss: 0.2700
Final Accuracy: 89.25%





In [52]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=2e-5; bert grad unfreez; pooling_output
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 15.13it/s]

Final Loss: 0.3115
Final Accuracy: 87.10%





In [70]:
#model with droput, linear 312 2 ; 5 epoch; lr=2e-5; bert grad unfreez; pooling_output
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 14.48it/s]

Final Loss: 0.2782
Final Accuracy: 89.25%





In [83]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer ; 5 epoch; lr=1e-3; bert grad freez; pooling_output
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 11.05it/s]

Final Loss: 0.2870
Final Accuracy: 88.17%





In [95]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer ; 5 epoch; lr=1e-3; bert grad freez;
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 11.06it/s]

Final Loss: 0.3556
Final Accuracy: 88.17%





In [99]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 5 epoch; lr=2e-5; bert grad unfreez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 10.90it/s]

Final Loss: 0.4109
Final Accuracy: 86.02%





In [120]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 7 epoch; lr=1e-5; bert grad unfreez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 11.14it/s]

Final Loss: 0.3640

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9130    0.9403    0.9265        67
    Furniture     0.8333    0.7692    0.8000        26

     accuracy                         0.8925        93
    macro avg     0.8732    0.8548    0.8632        93
 weighted avg     0.8908    0.8925    0.8911        93






In [128]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 8 epoch; lr=1e-5; bert grad unfreez
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 15.61it/s]


Final Loss: 0.3052

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9265    0.9403    0.9333        67
    Furniture     0.8400    0.8077    0.8235        26

     accuracy                         0.9032        93
    macro avg     0.8832    0.8740    0.8784        93
 weighted avg     0.9023    0.9032    0.9026        93



In [221]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 8 epoch; lr=1e-5; bert grad unfreez; weighted loss recall
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 10.55it/s]

Final Loss: 0.3653

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9014    0.9552    0.9275        67
    Furniture     0.8636    0.7308    0.7917        26

     accuracy                         0.8925        93
    macro avg     0.8825    0.8430    0.8596        93
 weighted avg     0.8908    0.8925    0.8896        93






In [217]:
#model with droput, linear 312 256,relu, dropout, 256 2  layer; 8 epoch; lr=1e-5; bert grad unfreez; weighted loss precision
test_model(my_model, test_data_loader, criterium)

100%|██████████| 3/3 [00:00<00:00, 11.07it/s]

Final Loss: 0.3239

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9014    0.9552    0.9275        67
    Furniture     0.8636    0.7308    0.7917        26

     accuracy                         0.8925        93
    macro avg     0.8825    0.8430    0.8596        93
 weighted avg     0.8908    0.8925    0.8896        93






In [225]:
# full training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weight = torch.tensor(df['isFurniture'].value_counts().to_list(), dtype=torch.float).to(device)

my_model = BertLinear(base_bert, freez_bert=False)
my_model.to(device)

optimizer = torch.optim.AdamW(params=my_model.parameters(), lr=1e-5)
criterium = nn.CrossEntropyLoss(weight=1 - 1/weight)

dataset = FurnitureDataset(df['label'].to_list(), df['isFurniture'].to_list(), tokenizer)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32)

train(my_model, data_loader, None, criterium, optimizer, epochs=7, isEval=False)

100%|██████████| 33/33 [00:06<00:00,  4.99it/s]


Epoch 1 | Average Train Loss: 0.6336
Train Accuracy: 67.35%


100%|██████████| 33/33 [00:06<00:00,  4.99it/s]


Epoch 2 | Average Train Loss: 0.5866
Train Accuracy: 70.36%


100%|██████████| 33/33 [00:06<00:00,  4.89it/s]


Epoch 3 | Average Train Loss: 0.5494
Train Accuracy: 72.59%


100%|██████████| 33/33 [00:06<00:00,  4.87it/s]


Epoch 4 | Average Train Loss: 0.5158
Train Accuracy: 76.68%


100%|██████████| 33/33 [00:06<00:00,  4.98it/s]


Epoch 5 | Average Train Loss: 0.4745
Train Accuracy: 80.17%


100%|██████████| 33/33 [00:06<00:00,  5.04it/s]


Epoch 6 | Average Train Loss: 0.4279
Train Accuracy: 83.87%


100%|██████████| 33/33 [00:06<00:00,  5.03it/s]

Epoch 7 | Average Train Loss: 0.3813
Train Accuracy: 86.78%





In [226]:
torch.save(my_model.state_dict(), 'furniture_model.pth')

In [227]:

my_model.bert.save_pretrained('./local_bert')
tokenizer.save_pretrained('./local_bert')

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./local_bert/tokenizer_config.json', './local_bert/tokenizer.json')

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn

In [2]:
class BertLinear(nn.Module):
    def __init__(self, bert_model, class_count=2, freez_bert=True):
        super().__init__()
        self.bert = bert_model
        self.bert.requires_grad_(not freez_bert)

        self.head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(312, 256),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(256, 2)
        )

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask)
        cls = output.last_hidden_state[:, 0, :]
        return self.head(cls)


In [3]:
bert_base = AutoModel.from_pretrained(r'D:\Uni\PythonProject\ParseURL\local_bert') 
tokenizer = AutoTokenizer.from_pretrained(r'D:\Uni\PythonProject\ParseURL\local_bert')

In [4]:
model = BertLinear(bert_base)
model.load_state_dict(torch.load(r'D:\Uni\PythonProject\ParseURL\furniture_model.pth', map_location='cpu'))

<All keys matched successfully>

In [10]:
t = tokenizer(text=["fashineble bed"], padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)

In [87]:
t.keys

<bound method Mapping.keys of {'input_ids': tensor([[    2,  1954, 19464,  6200,   694,  9557,     3]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}>

In [12]:
output = model(**t)

In [14]:
output.argmax(dim=-1).item()

1

In [20]:
output.softmax(dim=-1)

tensor([[0.3552, 0.6448]], grad_fn=<SoftmaxBackward0>)