In [1]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
from datetime import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [2]:
run_name = f"BERT_{datetime.now().strftime('%m%d_%H%M%S')}"

In [3]:
matplotlib.rcParams['font.sans-serif'] = ['Noto Sans CJK JP']
matplotlib.rcParams['axes.unicode_minus'] = False

In [4]:
class PerfumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("/root/bert-base-chinese")
model = BertForSequenceClassification.from_pretrained("/root/bert-base-chinese", num_labels=8).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
df = pd.read_csv("data/1976_clean.csv",index_col=0)
texts = df['notes_list'].tolist()
le = LabelEncoder()
labels = le.fit_transform(df['fragrance'])
dataset = PerfumeDataset(texts, labels, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [7]:
def train_one_epoch():
    model.train()
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return loss.item()

In [8]:
def evaluate(epoch):
    model.eval()
    embeddings = []
    with torch.no_grad():
        # 在这里加 tqdm 包裹 texts
        for text in tqdm(texts, desc="Evaluating"):
            encoding = tokenizer(
                text, 
                truncation=True, 
                padding='max_length', 
                max_length=128, 
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
            embeddings.append(cls_embedding)

    embeddings = np.array(embeddings)

    # PCA
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings)
    
    os.makedirs(f"PCA/{run_name}", exist_ok=True)
    plt.figure(figsize=(10,8))
    scatter = plt.scatter(
        embeddings_2d[:,0], 
        embeddings_2d[:,1], 
        c=labels, 
        cmap='tab10', 
        alpha=0.7
    )
    plt.title(f"Epoch {epoch} - PCA Clustering")
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(
        handles=scatter.legend_elements()[0],
        labels=[str(x) for x in le.classes_[:len(scatter.legend_elements()[0])]],
        title='Fragrance',
        bbox_to_anchor=(1.05, 1),
        loc='upper left'
    )
    plt.tight_layout()
    plt.savefig(f"PCA/{run_name}/{epoch}.png")
    plt.close()


In [9]:
epochs = 10
for ep in range(1,epochs+1):
    print(f"====== Epoch {ep} ======")
    loss = train_one_epoch()
    print(f"Loss: {loss}")
    evaluate(ep)



Training: 100%|██████████| 132/132 [00:12<00:00, 10.49it/s]


Loss: 1.5279735326766968


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.69it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.92it/s]


Loss: 1.1394894123077393


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.51it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.89it/s]


Loss: 1.0095570087432861


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.43it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.90it/s]


Loss: 1.1059889793395996


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 125.61it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.88it/s]


Loss: 1.1219775676727295


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 125.65it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.89it/s]


Loss: 0.5136028528213501


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.51it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.88it/s]


Loss: 0.2012988179922104


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.01it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.87it/s]


Loss: 0.5643333792686462


Evaluating: 100%|██████████| 2104/2104 [00:18<00:00, 112.85it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.90it/s]


Loss: 1.3746041059494019


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 127.61it/s]




Training: 100%|██████████| 132/132 [00:12<00:00, 10.88it/s]


Loss: 0.1481604278087616


Evaluating: 100%|██████████| 2104/2104 [00:16<00:00, 126.55it/s]
