In [1]:
# mean pooling: try the arithmetic mean of all word (token) vectors
# CLS token
# 2 linear layers with a ReLU in between

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [4]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

In [5]:
class FurnitureDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of class labels, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True, return_tensors='pt')``
            whose result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: length passed to the tokenizer as ``max_length`` (every sample is padded to it).
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is defined by the number of texts.
        return len(self.texts)

    def __getitem__(self, item):
        raw_text, raw_label = self.texts[item], self.labels[item]

        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) drops the leading size-1 batch axis added by return_tensors='pt'.
        sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['isFurniture'] = torch.tensor(raw_label, dtype=torch.long)
        return sample


In [6]:
# Base bert + linear head

class BertLinear(nn.Module):
    """Classifier made of a BERT-style encoder plus one linear layer.

    The linear head is applied to the encoder's hidden state at sequence
    position 0 (the [CLS] position for BERT-style tokenizers).

    Args:
        bert_model: encoder module, called as ``bert_model(input_ids, attention_mask)``;
            its output must expose ``last_hidden_state``. If it has a
            ``config.hidden_size`` attribute, that value sizes the head's input.
        class_count: number of output classes (width of the returned logits).
        freez_bert: when True (default) every encoder parameter gets
            ``requires_grad=False`` so only the head is trained. Note that the
            encoder object passed in is modified in place.
    """

    # Used when the encoder exposes no ``config.hidden_size``; this is the
    # value that was previously hard-coded for the head's input width.
    DEFAULT_HIDDEN_SIZE = 312

    def __init__(self, bert_model, class_count=2, freez_bert=True):
        super().__init__()
        self.bert = bert_model
        self.bert.requires_grad_(not freez_bert)
        # NOTE: freezing only disables gradients. Calling .train() on this module
        # still switches the encoder to train mode, so any dropout inside the
        # encoder stays active during training.

        encoder_config = getattr(bert_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', self.DEFAULT_HIDDEN_SIZE)

        # Head dimensions follow the encoder and the requested class count
        # instead of the former fixed nn.Linear(312, 2).
        self.head = nn.Linear(hidden_size, class_count)
        # Alternative two-layer head kept for reference:
        # self.head = nn.Sequential(
        #     nn.Linear(hidden_size, 256),
        #     nn.ReLU(),
        #     nn.Dropout(p=0.1),
        #     nn.Linear(256, class_count)
        # )

    def forward(self, input_ids, attention_mask):
        """Return logits with ``class_count`` entries per input sequence."""
        output = self.bert(input_ids, attention_mask)
        # Keep only the hidden state at position 0 of every sequence.
        cls = output.last_hidden_state[:, 0, :]
        return self.head(cls)
        

In [7]:

# Load the pretrained checkpoint named below together with its tokenizer.
name = 'cointegrated/rubert-tiny'
tokenizer = AutoTokenizer.from_pretrained(name)
base_bert = AutoModel.from_pretrained(name)

# Two sample sentences used to try out the tokenizer.
text = ["Replace me by any text you'd like.", "aboba"]

# Tokenize the samples as one padded batch of PyTorch tensors, without token_type_ids.
t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
# Leftover scratch code. NOTE(review): it refers to `model`, which is not defined in the visible cells.
# print(t)
# with torch.no_grad():
#     output = model(**t)
# cls_token = output.last_hidden_state[:, 0, :]


In [None]:
# Run the sample batch `t` through the bare encoder with gradient tracking disabled and print the raw output.
with torch.no_grad():
    print(base_bert(**t))

In [13]:
# Show the column names of the loaded table.
# NOTE(review): `df` is only assigned in a cell further down this notebook, so a
# top-to-bottom run on a fresh kernel fails here with a NameError. Consider moving
# this cell below the cell that reads the CSV.
df.columns

Index(['Unnamed: 0', 'url_id', 'label', 'isFurniture', 'label_len'], dtype='object')

In [9]:
# Read the labelled dataset.
# NOTE(review): absolute, machine-specific Windows path; the notebook cannot run on another machine without editing it.
df = pd.read_csv(r'D:\Uni\PythonProject\ParseURL\data\train_label_final.csv')

# First split: 70% train, 30% hold-out. The 'label' column supplies the inputs, 'isFurniture' the targets.
X_train, X_test, y_train, y_test = train_test_split(df['label'].to_list(), df['isFurniture'].to_list(), test_size=0.30, random_state=100)

# Second split: the hold-out is divided again, 70% of it becomes validation and the remaining 30% the final test set.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.30, random_state=100)

In [10]:
# Sizes of the train / validation / test splits, in that order.
print(len(X_train), len(X_val), len(X_test)) 

720 216 93


In [11]:
def test_model(model, test_data_loader, criterium):
    correct_pred = 0
    total_pred = 0
    total_test_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_data_loader):
            isFurniture = batch.pop("isFurniture")
            
            logits = model(**batch)
            loss = criterium(logits, isFurniture)
            prediction = logits.argmax(dim=-1)
            total_pred += len(prediction)
            correct_pred += (prediction == isFurniture).sum().item()
            total_test_loss += loss.item()
            
        avg_test_loss = total_test_loss / len(test_data_loader)
        accuracy = correct_pred / total_pred
        
        print(f"Final Loss: {avg_test_loss:.4f}")
        print(f"Final Accuracy: {accuracy:.2%}")

In [23]:
def train(model, train_data_loader, val_data_loader, criterium, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` epochs, validating and printing metrics after each one.

    Per epoch this prints the mean training loss, the training accuracy, the
    mean validation loss and a scikit-learn classification report for the two
    classes 0 ('Not Furniture') and 1 ('Furniture'). The model is left in eval
    mode when the function returns.

    Args:
        model: ``nn.Module`` called as ``model(**batch)`` after the
            ``"isFurniture"`` entry has been removed from the batch; must
            return per-class logits.
        train_data_loader: re-iterable of dict batches used for optimisation;
            each batch must contain an ``"isFurniture"`` tensor of class
            indices (popped from the batch dict in place).
        val_data_loader: re-iterable of dict batches used for validation, same
            batch layout as the training loader.
        criterium: loss callable invoked as ``criterium(logits, targets)``.
        optimizer: optimizer whose ``step()`` is called once per training batch.
        epochs: number of passes over ``train_data_loader``.

    Raises:
        ValueError: if either loader yields no batches during an epoch.
    """
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0.0
        total_pred = 0
        correct_pred = 0
        train_batches = 0

        for batch in tqdm(train_data_loader):
            isFurniture = batch.pop("isFurniture")

            model.zero_grad()

            logits = model(**batch)

            # Training accuracy is measured on the logits computed before this
            # batch's weight update.
            prediction = logits.argmax(dim=-1)
            total_pred += len(prediction)
            correct_pred += (prediction == isFurniture).sum().item()

            loss = criterium(logits, isFurniture)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            train_batches += 1

        if train_batches == 0 or total_pred == 0:
            raise ValueError("train_data_loader yielded no samples")

        avg_train_loss = total_loss / train_batches
        print(f"Epoch {epoch+1} | Average Train Loss: {avg_train_loss:.4f}")
        accuracy = correct_pred / total_pred
        print(f"Train Accuracy: {accuracy:.2%}")

        # ---- validation pass ----
        model.eval()

        all_labels = []
        all_preds = []
        total_val_loss = 0.0
        val_batches = 0

        with torch.no_grad():
            for batch in tqdm(val_data_loader):
                isFurniture = batch.pop("isFurniture")

                logits = model(**batch)
                loss = criterium(logits, isFurniture)
                prediction = logits.argmax(dim=-1)
                total_val_loss += loss.item()
                val_batches += 1

                # tolist() yields plain Python ints regardless of the tensor's device.
                all_preds.extend(prediction.tolist())
                all_labels.extend(isFurniture.tolist())

        if val_batches == 0:
            raise ValueError("val_data_loader yielded no batches")

        avg_val_loss = total_val_loss / val_batches
        print(f"\tValidation Loss: {avg_val_loss:.4f}")

        # labels=[0, 1] pins the report to both classes; without it the two
        # target_names no longer match the inferred classes (and the call
        # raises) whenever a validation pass contains only one class.
        report = classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=['Not Furniture', 'Furniture'],
            digits=4
        )

        print("\nClassification Report:")
        print(report)


In [25]:
# Wrap the pretrained encoder in the classifier, using BertLinear's default arguments.
my_model = BertLinear(base_bert)

# AdamW over the model's parameters with lr=1e-3, and cross-entropy loss on the logits.
optimizer = torch.optim.AdamW(params=my_model.parameters(), lr=1e-3)
criterium = nn.CrossEntropyLoss()

# One dataset per split; all three share the same tokenizer and the default max_len.
train_dataset = FurnitureDataset(X_train, y_train, tokenizer)

val_dataset = FurnitureDataset(X_val, y_val, tokenizer)

test_dataset = FurnitureDataset(X_test, y_test, tokenizer)

# Batches of 8; only the training loader shuffles.
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

In [27]:
# Train with the default number of epochs (epochs=5); metrics are printed after every epoch.
train(my_model, train_data_loader, val_data_loader, criterium, optimizer)

100%|██████████| 90/90 [02:29<00:00,  1.67s/it]


Epoch 1 | Average Train Loss: 0.5009
Train Accuracy: 76.67%


100%|██████████| 27/27 [00:06<00:00,  4.01it/s]


	Validation Loss: 0.3666

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.8861    0.9396    0.9121       149
    Furniture     0.8448    0.7313    0.7840        67

     accuracy                         0.8750       216
    macro avg     0.8655    0.8355    0.8480       216
 weighted avg     0.8733    0.8750    0.8723       216



100%|██████████| 90/90 [02:29<00:00,  1.66s/it]


Epoch 2 | Average Train Loss: 0.3751
Train Accuracy: 86.94%


100%|██████████| 27/27 [00:06<00:00,  4.39it/s]


	Validation Loss: 0.2960

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9342    0.9530    0.9435       149
    Furniture     0.8906    0.8507    0.8702        67

     accuracy                         0.9213       216
    macro avg     0.9124    0.9019    0.9069       216
 weighted avg     0.9207    0.9213    0.9208       216



100%|██████████| 90/90 [02:33<00:00,  1.70s/it]


Epoch 3 | Average Train Loss: 0.3319
Train Accuracy: 87.50%


100%|██████████| 27/27 [00:06<00:00,  4.39it/s]


	Validation Loss: 0.2621

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9342    0.9530    0.9435       149
    Furniture     0.8906    0.8507    0.8702        67

     accuracy                         0.9213       216
    macro avg     0.9124    0.9019    0.9069       216
 weighted avg     0.9207    0.9213    0.9208       216



100%|██████████| 90/90 [02:33<00:00,  1.71s/it]


Epoch 4 | Average Train Loss: 0.3142
Train Accuracy: 87.78%


100%|██████████| 27/27 [00:08<00:00,  3.25it/s]


	Validation Loss: 0.2443

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9221    0.9530    0.9373       149
    Furniture     0.8871    0.8209    0.8527        67

     accuracy                         0.9120       216
    macro avg     0.9046    0.8870    0.8950       216
 weighted avg     0.9112    0.9120    0.9111       216



100%|██████████| 90/90 [02:32<00:00,  1.69s/it]


Epoch 5 | Average Train Loss: 0.3038
Train Accuracy: 89.17%


100%|██████████| 27/27 [00:06<00:00,  4.33it/s]

	Validation Loss: 0.2392

Classification Report:
               precision    recall  f1-score   support

Not Furniture     0.9658    0.9463    0.9559       149
    Furniture     0.8857    0.9254    0.9051        67

     accuracy                         0.9398       216
    macro avg     0.9257    0.9358    0.9305       216
 weighted avg     0.9409    0.9398    0.9402       216






In [None]:
# Experiment log: two-layer head (Linear 312->256, ReLU, Linear 256->2); 10 epochs; lr=2e-5; BERT gradients frozen.
# NOTE(review): this cell has no saved output, and the head currently defined in BertLinear is a single linear layer.
test_model(my_model, test_data_loader, criterium)

In [29]:
# Experiment log: single linear head; 10 epochs; lr=1e-3; BERT gradients frozen.
# NOTE(review): the training output saved in this notebook covers 5 epochs; confirm the epoch count recorded here.
test_model(my_model, test_data_loader, criterium)

100%|██████████| 12/12 [00:02<00:00,  4.46it/s]

Final Loss: 0.2906
Final Accuracy: 88.17%



