# 1. 参数配置

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel
from torch.distributions import MultivariateNormal, Categorical

import matplotlib.pyplot as plt

import time

In [2]:
## wandb login
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("wandb_key")

wandb.login(key = secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
## Config
class config:
    seed = 42
    
    ## train parameters
    BATCH_SIZE = 64
    EPOCHES = 200
    num_warmup_rate=0.05
    n_clusters = 140
    model_name = 'VAE+BN'

# 2. 数据准备

In [4]:
## Load data
df_b = pd.read_csv('/kaggle/input/intent-dataset/data/banking/train.tsv', sep='\t')
df_c = pd.read_csv('/kaggle/input/intent-dataset/data/clinc/train.tsv', sep='\t')
df_b_te = pd.read_csv('/kaggle/input/intent-dataset/data/banking/test.tsv', sep='\t')
df_b_dev = pd.read_csv('/kaggle/input/intent-dataset/data/banking/dev.tsv', sep='\t')

In [5]:
df = df_b
df_te = df_b_te
num_classes = df['label'].nunique()#*2

In [6]:
label_mapping = {v:i for i,v in enumerate(df['label'].unique())}
df['label_num'] = df['label'].map(label_mapping)
df_te['label_num'] = df_te['label'].map(label_mapping)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(Dataset):
    def __init__(self, dataframe, num_classes=20):
        self.data = dataframe
        self.num_classes = num_classes
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        text = self.data.loc[index, 'text']
        encoded_input = tokenizer(text, return_tensors='pt', add_special_tokens=True, max_length=128, padding='max_length')
        label = self.data.loc[index, 'label_num']
        one_hot_label = F.one_hot(torch.tensor(label), num_classes=self.num_classes)
        inputs_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)
        return inputs_ids, attention_mask, one_hot_label

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
dataset = CustomDataset(df, num_classes)
val_dataset = CustomDataset(df_te, num_classes)

In [9]:
batch_size = config.BATCH_SIZE  # 批量大小
shuffle = True  # 打乱数据
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size*2)

# 3.模型定义

In [10]:
# 加载BERT和GPT-2
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

class VAE_DEC(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(VAE_DEC, self).__init__()
        self.bert_encoder = bert_model
        self.latent_dim = bert_model.config.hidden_size
        
        self.fc_mu = nn.Linear(self.latent_dim, self.latent_dim)
        self.fc_logvar = nn.Linear(self.latent_dim, self.latent_dim)
        self.bn_mu = nn.BatchNorm1d(self.latent_dim)  # 添加批量归一化层
        
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(self.latent_dim, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)        
        
    def encode(self, input_ids, attention_mask=None):
        outputs = self.bert_encoder(input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]  # 使用[CLS] token表示
#         mu = self.fc_mu(hidden_state)
        mu = self.bn_mu(self.fc_mu(hidden_state))
#         logvar = self.fc_logvar(hidden_state)
        return mu
    
#     def reparameterize(self, mu, logvar):
#         std = torch.exp(0.5 * logvar)
#         eps = torch.randn_like(std)
#         return mu + eps * std
        
    def forward(self, input_ids, attention_mask):
        mu = self.encode(input_ids, attention_mask)
#         mu = self.dropout(z)
        hidden = self.relu(self.fc1(mu))
        logits = self.softmax(self.fc2(hidden))
        return mu, logits

bert_model = BertModel.from_pretrained('bert-base-uncased')

model = VAE_DEC(bert_model, num_classes = num_classes )

model_dict = model.state_dict()

pretrained_dict = torch.load('/kaggle/input/nlp-intent-vae-dec/model_epoch2.pth',map_location=torch.device('cpu'))
# pretrained_dict = torch.load('/kaggle/input/nlp-intent-vae-dec/model_epoch1.pth',map_location=torch.device('cpu'))
pretrained_dict = {key: value for key, value in pretrained_dict.items() if key in model_dict }
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

VAE_DEC(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

# 3.损失函数和优化器

In [11]:
## wandb.init()
def class2dict(f):
    return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

wandb.init(project='NLP-intent-VAE-supuervised', 
    name=config.model_name,
    config=class2dict(config),
    group=config.model_name,
    job_type="train",
    anonymous="must")

[34m[1mwandb[0m: Currently logged in as: [33mmengvision[0m ([33mnumberist[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
## SupConLoss Define
class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR"""
    def __init__(self, contrast_mode='all'):
        super(SupConLoss, self).__init__()
        self.contrast_mode = contrast_mode

    def forward(self, features, labels=None, mask=None, temperature = 0.07, device = None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
        Returns:
            A loss scalar.
        """

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count).view(-1, 1).to(device),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)

        # loss
        loss = - mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss

In [13]:
## Loss
each_epoch_steps = len(dataloader)
epoches = config.EPOCHES

criterion = nn.CrossEntropyLoss()
contrast_criterion = SupConLoss()

In [14]:
## Optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-6)
l_optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-6)

## Scheduler
from transformers import get_cosine_schedule_with_warmup
num_train_steps = config.EPOCHES*each_epoch_steps
num_warmup_steps = int(num_train_steps*config.num_warmup_rate)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=0.5)

l_scheduler = get_cosine_schedule_with_warmup(
    l_optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=0.5)

# 4.训练模型

In [15]:
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm

feat_dim = 768

In [16]:
def get_outputs(mode, model):    
    if mode == 'test':
        dataloader_ = val_dataloader
    elif mode == 'train':
        dataloader_ = dataloader
    model.eval()

    total_labels = torch.empty(0,dtype=torch.float).to(device)#创建空list
    total_preds = torch.empty(0,dtype=torch.long).to(device)
        
    total_features = torch.empty((0, feat_dim)).to(device)
    total_logits = torch.empty((0, num_classes)).to(device)
    
    for input_ids, attention_mask, labels in tqdm(dataloader_, desc="Iteration"):
        input_ids, attention_mask, labels = input_ids.to(device, dtype=torch.long), attention_mask.to(device, dtype=torch.long), labels.to(device, dtype=torch.float)

        with torch.set_grad_enabled(False):
            feats,logits = model(input_ids, attention_mask)
                
            total_labels = torch.cat((total_labels,labels.argmax(axis=1)))
            total_features = torch.cat((total_features, feats))
            total_logits = torch.cat((total_logits, logits))
        
    feats = total_features.cpu().numpy()
    y_true = total_labels.cpu().numpy()
        
    total_probs = F.softmax(total_logits.detach(), dim=1)
    total_maxprobs, total_preds = total_probs.max(dim = 1)
    y_pred = total_preds.cpu().numpy()
        
    y_logits = total_logits.cpu().numpy()
        
    outputs = {
        'y_true': y_true,
        'y_pred': y_pred,
        'logits': y_logits,
        'feats': feats
    }
    return outputs

In [17]:
# outputs = get_outputs(mode = 'test', model = model)

In [18]:
def clustering(model):
    outputs = get_outputs(mode = 'train', model = model)
    feats = outputs['feats']
    y_true = outputs['y_true']
        
    labeled_pos = list(np.where(y_true != -1)[0])
    labeled_feats = feats[labeled_pos]
    labeled_labels = y_true[labeled_pos]        
    labeled_centers = []
    for idx, label in enumerate(np.unique(labeled_labels)):
        label_feats = labeled_feats[labeled_labels == label]
        labeled_centers.append(np.mean(label_feats, axis = 0))
        
    km = KMeans(n_clusters=num_classes, random_state=config.seed, init='k-means++').fit(feats) 
    km_centroids, assign_labels = km.cluster_centers_, km.labels_
         
    centroids = torch.tensor(km_centroids).to(device)
    pseudo_labels = assign_labels.astype(np.int64)
        
    return outputs, km_centroids, y_true, assign_labels, pseudo_labels

In [19]:
## evalute
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import accuracy_score

# 定义真实标签
true_labels = [0, 0, 1, 1, 1]

# 定义聚类结果
cluster_labels = [0, 0, 1, 1, 1]

def evalute(true_labels, cluster_labels):
    # 计算归一化互信息
    nmi = normalized_mutual_info_score(true_labels, cluster_labels, average_method='arithmetic')
    # 计算调整兰德系数
    ari = adjusted_rand_score(true_labels, cluster_labels)
    # 计算聚类准确率
    acc = accuracy_score(true_labels, cluster_labels)

#     print(f"Normalized Mutual Information (NMI): {nmi:.3f}")
#     print(f"Adjusted Rand Index (ARI): {ari:.3f}")
#     print(f"Clustering Accuracy (ACC): {acc:.3f}")
    return nmi, ari, acc

# nmi, ari, acc = evalute(true_labels, cluster_labels)

# https://arxiv.org/pdf/2304.07699
# ===========================
# | NMI    | ARI    | ACC   |
# | 87.41  | 69.54  | 78.36 |


In [20]:
# ## Traing
# last_preds = None

# for epoch in range(epoches):
#     ## train
#     model.train()
#     st = time.time()
#     losses = []
#     for input_ids, attention_mask, labels in dataloader:
#         input_ids, attention_mask, labels = input_ids.to(device, dtype=torch.long), attention_mask.to(device, dtype=torch.long), labels.to(device, dtype=torch.float)

#         feats_a,logits_a = model(input_ids, attention_mask)
#         feats_b,logits_b = model(input_ids, attention_mask)

#         norm_feats_a = F.normalize(feats_a)
#         norm_feats_b = F.normalize(feats_b)
        
#         constrastive_feats = torch.cat((norm_feats_a.unsqueeze(1), norm_feats_b.unsqueeze(1)), dim = 1)
        
#         ## 计算对比学习Loss，使用的simCLR 的loss https://arxiv.org/pdf/2002.05709.pdf
#         loss_contrast = contrast_criterion(constrastive_feats, labels = labels.argmax(axis=1), temperature = 0.07, device = device)
        
        
#         loss = loss_contrast
        
#         losses.append(loss.item())
        
#         loss.backward()
#         l_optimizer.step()
#         l_optimizer.zero_grad()        
#         l_scheduler.step()
        
#     ed = time.time()
#     print(f'[Epoch {epoch+1}/{epoches}] Train Loss: {np.mean(losses):.2f}, time: {ed-st:.0f}s')
#     ## 更新质心和伪标签
#     outputs, km_centroids, y_true, assign_labels, pseudo_labels = clustering(model)
    
#     current_preds = pseudo_labels
#     evalute(outputs['y_true'], outputs['y_pred'])
#     ## 计算当前 两次伪标签的距离小于某个值时可以用于提前停止，目前暂未使用
# #     delta_label = np.sum(current_preds != last_preds).astype(np.float32)/ current_preds.shape[0]
# #     last_preds = np.copy(current_preds)
    
#     ## 质心引导，对比学习训练（此处输入label为伪标签）
#     losses2 = []
# #     model.train()
#     for i, (input_ids, attention_mask, labels) in enumerate(dataloader):
#         labels_ = torch.tensor(pseudo_labels[batch_size*i:batch_size*(i+1)])
#         labels_ = F.one_hot(labels_, num_classes=num_classes)
#         input_ids, attention_mask, labels_ = input_ids.to(device, dtype=torch.long), attention_mask.to(device, dtype=torch.long), labels_.to(device, dtype=torch.float)
#         # random
#         feats_a,logits_a = model(input_ids, attention_mask)
#         feats_b,logits_b = model(input_ids, attention_mask)
    
#         norm_feats_a = F.normalize(feats_a)
#         norm_feats_b = F.normalize(feats_b)
        
#         ## 计算对比学习
#         constrastive_feats = torch.cat((norm_feats_a.unsqueeze(1), norm_feats_b.unsqueeze(1)), dim = 1)
#         loss_contrast = contrast_criterion(constrastive_feats, labels = labels_.argmax(axis=1), temperature = 0.07, device = device)
        
#         ## 伪标签与预测差异loss
#         loss_ce = 0.5 * (criterion(logits_a, labels_) + criterion(logits_b, labels_)) 
                    
#         loss = loss_contrast + loss_ce

#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         losses2.append(loss.item())
#         scheduler.step()
#     ed = time.time()
#     print(f'[Epoch {epoch+1}/{epoches}] Train Loss: {np.mean(losses2):.2f}, time: {ed-st:.0f}s')
# #     # wandb
# #     wandb.log({
# #         f"Epoch": epoch+1,
# #         f"avg_train_loss": np.mean(losses2),
# #     })

In [None]:
lr_list = []
loss_list = []
t_loss_list = []
acc_list = []
best_loss = 1_000
for epoch in range(epoches):
    losses = []
    ## train
    model.train()
    st = time.time()
    for input_ids, attention_mask, labels in dataloader:
        input_ids, attention_mask, labels = input_ids.to(device, dtype=torch.long), attention_mask.to(device, dtype=torch.long), labels.to(device, dtype=torch.float)

        z, logits = model(input_ids, attention_mask)

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()        
        losses.append(loss.item())
        scheduler.step()
        # 更新学习率
    
    v_losses = []
    v_accs = []
    ## validation
    model.eval()
    prs=[]
    gts=[]
    for input_ids, attention_mask, labels in val_dataloader:
        input_ids, attention_mask, labels = input_ids.to(device, dtype=torch.long), attention_mask.to(device, dtype=torch.long), labels.to(device, dtype=torch.float)
        with torch.no_grad():
            z,logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            v_losses.append(loss.item())
            pr = [p.argmax().cpu().numpy() for p in logits]
            gt = [l.argmax().cpu().numpy() for l in labels]
            prs.extend(pr)
            gts.extend(gt)
            acc_tmp = [p==g for p,g in zip(pr,gt)]
            v_accs.append(sum(acc_tmp)/len(acc_tmp))
    ed = time.time()
    lr_cur = optimizer.param_groups[0]['lr']
    nmi, ari, acc = evalute(gts, prs)
    print(f'[Epoch {epoch+1}/{epoches}] Train Loss: {np.mean(losses):.2f}, Test Loss: {np.mean(v_losses):.2f}, Test acc: {np.mean(v_accs):.2f}, Test evaluate: {nmi}, {ari}, {acc}, lr: {lr_cur}, time: {ed-st:.0f}s')
    # wandb
    wandb.log({
        f"Epoch": epoch+1,
        f"avg_train_loss": np.mean(losses),
        f"avg_test_loss": np.mean(v_losses),
        f"avg_test_acc": np.mean(v_accs),
        f"nmi": nmi,
        f"ari": ari,
        f"acc": acc,
        f"lr": lr_cur
    })
    if best_loss > np.mean(losses):
        best_loss = np.mean(losses)
        model_path = 'model_best_tr.pth'
        torch.save(model.state_dict(), model_path)
    
    loss_list.append(np.mean(losses))
    t_loss_list.append(np.mean(v_losses))
    acc_list.append(np.mean(v_accs))
    lr_list.append(lr_cur)

[Epoch 1/200] Train Loss: 4.34, Test Loss: 4.34, Test acc: 0.02, Test evaluate: 0.23462381038353725, 0.015279024927785295, 0.020454545454545454, lr: 2.0000000000000003e-06, time: 121s
[Epoch 2/200] Train Loss: 4.34, Test Loss: 4.34, Test acc: 0.09, Test evaluate: 0.2799045074713729, 0.028351555228067066, 0.08636363636363636, lr: 4.000000000000001e-06, time: 122s
[Epoch 3/200] Train Loss: 4.34, Test Loss: 4.32, Test acc: 0.29, Test evaluate: 0.5264191273868318, 0.12845929492220845, 0.28733766233766234, lr: 6e-06, time: 122s
[Epoch 4/200] Train Loss: 4.25, Test Loss: 4.17, Test acc: 0.27, Test evaluate: 0.5375180474932924, 0.1477511171289806, 0.2714285714285714, lr: 8.000000000000001e-06, time: 122s
[Epoch 5/200] Train Loss: 4.14, Test Loss: 4.12, Test acc: 0.30, Test evaluate: 0.5571356844119775, 0.16550950396851916, 0.29967532467532465, lr: 1e-05, time: 122s
[Epoch 6/200] Train Loss: 4.08, Test Loss: 4.06, Test acc: 0.33, Test evaluate: 0.5724919297913799, 0.1801452527900451, 0.3181818