# Load packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn
import torch.optim as optim

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel

import matplotlib.pyplot as plt

import time

# Wandb login & Config & Loading Data

In [2]:
#import wandb
#from kaggle_secrets import UserSecretsClient
#user_secrets = UserSecretsClient()
#secret_value_0 = user_secrets.get_secret("wandb_key")

#wandb.login(key = secret_value_0)

In [3]:
class config:
    """Static hyper-parameters and run settings shared by the notebook cells."""

    # Seed handed to the stochastic components that accept one (e.g. k-means).
    seed = 42

    # --- training schedule ---
    BATCH_SIZE = 64          # samples per training mini-batch
    EPOCHES = 200            # number of passes of the outer training loop
    num_warmup_rate = 0.05   # fraction of all scheduler steps spent in LR warm-up

    # --- bookkeeping ---
    # NOTE(review): n_clusters is not read by any cell visible here — confirm it is still needed.
    n_clusters = 140
    model_name = 'BERT-MLP'  # used as the wandb run name and group

In [4]:
# Root folder of the intent datasets; change this one line when the data moves.
DATA_DIR = '/home/zhaojinyue/workspace/intent-cluster/archive/data'

# Banking corpus: train / test / dev splits.
df_b = pd.read_csv(f'{DATA_DIR}/banking/train.tsv', sep='\t')
# The banking test split has to come from the banking folder: the label mapping
# built from `df_b` is applied to it later, and labels from another corpus would
# not be found in that mapping.
# NOTE(review): assumes banking/test.tsv sits next to banking/train.tsv — confirm.
df_b_te = pd.read_csv(f'{DATA_DIR}/banking/test.tsv', sep='\t')
df_b_dev = pd.read_csv(f'{DATA_DIR}/banking/dev.tsv', sep='\t')

# CLINC corpus: training split only.
df_c = pd.read_csv(f'{DATA_DIR}/clinc/train.tsv', sep='\t')

In [5]:
# Preview the first rows of the banking training frame (columns: text, label).
df_b.head()

Unnamed: 0,text,label
0,Could you help my figure out the exchange fee?,exchange_charge
1,I made a cash deposit to my account but i don'...,balance_not_updated_after_cheque_or_cash_deposit
2,Hello - I'm on the app and trying to purchase ...,beneficiary_not_allowed
3,Why is it saying I have a pending payment?,pending_card_payment
4,Is there an extra charge to exchange different...,exchange_charge


# class mapping dict

In [6]:
# Pick the frames used by the rest of the notebook. These are aliases, not
# copies: columns added to `df` / `df_te` later also appear on `df_b` / `df_b_te`.
df = df_b
df_te = df_b_te
# Width of the classifier head and number of k-means clusters: twice the number
# of distinct training labels.
# NOTE(review): the factor 2 looks like deliberate over-clustering — confirm.
num_classes = df['label'].nunique()*2

In [7]:
# Assign each distinct training label an integer id, in order of first appearance.
unique_labels = df['label'].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))

# Attach the integer ids to both frames. A label missing from `label_mapping`
# is turned into NaN by `Series.map`, so test labels unseen in training end up NaN.
for frame in (df, df_te):
    frame['label_num'] = frame['label'].map(label_mapping)

# Dataset

In [8]:
# Module-level tokenizer loaded from the local BERT directory; CustomDataset
# below reads this global when encoding a sample.
tokenizer = BertTokenizer.from_pretrained('/home/zhaojinyue/workspace/intent-cluster/bert')

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one utterance per item.

    Args:
        dataframe: frame with a 'text' column (raw utterance) and a 'label_num'
            column (integer class id).
        num_classes: width of the one-hot label vector returned for each item.

    Each item is a tuple ``(input_ids, attention_mask, one_hot_label)``; the
    first two come from the module-level ``tokenizer`` with the batch axis removed.
    """

    def __init__(self, dataframe, num_classes=20):
        self.data = dataframe
        self.num_classes = num_classes

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Samplers hand out positions 0..len-1, so look rows up by position;
        # label-based `.loc` breaks as soon as the frame was filtered or re-indexed.
        text = self.data['text'].iloc[index]
        label = self.data['label_num'].iloc[index]

        # `truncation=True` caps long utterances at `max_length`; without it an
        # over-long text yields a longer tensor and the default collate cannot
        # stack the batch.
        encoded_input = tokenizer(
            text,
            return_tensors='pt',
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
        )

        one_hot_label = F.one_hot(torch.tensor(int(label)), num_classes=self.num_classes)
        # The tokenizer returns tensors with a leading batch axis of size 1; drop it.
        inputs_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)
        return inputs_ids, attention_mask, one_hot_label

In [9]:
# Wrap the training and test frames; both share the same one-hot width.
dataset = CustomDataset(df, num_classes)
val_dataset = CustomDataset(df_te, num_classes)

In [10]:
batch_size = config.BATCH_SIZE  # mini-batch size
shuffle = True  # shuffle the training data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
# The evaluation loader is not shuffled and uses twice the training batch size.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size*2)

# Build Model

In [11]:
class BERTsemi(nn.Module):
    """BERT encoder followed by a two-layer MLP classification head.

    Args:
        num_classes: number of output units of the final linear layer.
    """

    def __init__(self, num_classes):
        super(BERTsemi, self).__init__()
        self.bert = BertModel.from_pretrained('/home/zhaojinyue/workspace/intent-cluster/bert')
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        # Kept as an attribute for callers that want probabilities; it is no
        # longer applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and classify it.

        Returns:
            (pooled_output, logits): the BERT pooler output after dropout, and
            the raw (un-normalised) class scores from the MLP head.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        hidden = self.relu(self.fc1(pooled_output))
        # Return raw scores: nn.CrossEntropyLoss and F.softmax both normalise
        # their input themselves, so applying Softmax here squashed the scores
        # twice and flattened the gradients.
        logits = self.fc3(hidden)
        return pooled_output,logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BERTsemi(num_classes=num_classes)

# Start from the freshly initialised weights and overwrite only the entries the
# checkpoint also provides; checkpoint keys unknown to this model are dropped.
model_dict = model.state_dict()
pretrained_dict = {
    name: tensor
    for name, tensor in torch.load(
        '/home/zhaojinyue/workspace/intent-cluster/model_best_tr.pth',
        map_location=torch.device('cpu'),
    ).items()
    if name in model_dict
}
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

# Last expression of the cell: moves the model and shows its structure.
model.to(device)

Some weights of the model checkpoint at /home/zhaojinyue/workspace/intent-cluster/bert were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTsemi(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

# wandb.init() & loss & optim & scheduler

In [12]:
def class2dict(f):
    """Return the attributes of `f` as a dict, skipping double-underscore names."""
    public_names = [attr for attr in dir(f) if not attr.startswith('__')]
    return {attr: getattr(f, attr) for attr in public_names}
import wandb

# Gather the run settings first, then start the run with a single call.
wandb_run_settings = dict(
    project='NLP-intent-BERT-semi-supuervised',
    name=config.model_name,
    config=class2dict(config),
    group=config.model_name,
    job_type="train",
    anonymous="must",
)
wandb.init(**wandb_run_settings)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: - Waiting for wandb.init()...

CommError: Run initialization has timed out after 90.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

## Define Loss

In [None]:
class SupConLoss(nn.Module):
    """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
    It also supports the unsupervised contrastive loss in SimCLR"""
    def __init__(self, contrast_mode='all'):
        super(SupConLoss, self).__init__()
        self.contrast_mode = contrast_mode

    def forward(self, features, labels=None, mask=None, temperature = 0.07, device = None):
        """Compute loss for model. If both `labels` and `mask` are None,
        it degenerates to SimCLR unsupervised loss:
        https://arxiv.org/pdf/2002.05709.pdf
        Args:
            features: hidden vector of shape [bsz, n_views, ...].
            labels: ground truth of shape [bsz].
            mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j
                has the same class as sample i. Can be asymmetric.
            temperature: divisor applied to the pairwise dot products.
            device: device for the masks; defaults to the device of `features`.
        Returns:
            A loss scalar.
        Raises:
            ValueError: if `features` has fewer than 3 dimensions, if both
                `labels` and `mask` are given, if the number of labels differs
                from the batch size, or if `contrast_mode` is unknown.
        """
        # Masks must live next to the features; fall back to their device
        # instead of relying on the caller to pass one.
        if device is None:
            device = features.device

        if len(features.shape) < 3:
            raise ValueError('`features` needs to be [bsz, n_views, ...],'
                             'at least 3 dimensions are required')
        if len(features.shape) > 3:
            features = features.view(features.shape[0], features.shape[1], -1)

        batch_size = features.shape[0]
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        elif labels is None and mask is None:
            mask = torch.eye(batch_size, dtype=torch.float32).to(device)
        elif labels is not None:
            labels = labels.contiguous().view(-1, 1)
            if labels.shape[0] != batch_size:
                raise ValueError('Num of labels does not match num of features')
            mask = torch.eq(labels, labels.T).float().to(device)
        else:
            mask = mask.float().to(device)

        contrast_count = features.shape[1]
        # Stack the views along the batch axis: [bsz * n_views, dim].
        contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0)
        if self.contrast_mode == 'one':
            anchor_feature = features[:, 0]
            anchor_count = 1
        elif self.contrast_mode == 'all':
            anchor_feature = contrast_feature
            anchor_count = contrast_count
        else:
            raise ValueError('Unknown mode: {}'.format(self.contrast_mode))

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(anchor_feature, contrast_feature.T),
            temperature)
        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        # tile mask
        mask = mask.repeat(anchor_count, contrast_count)
        # mask-out self-contrast cases
        logits_mask = torch.scatter(
            torch.ones_like(mask),
            1,
            torch.arange(batch_size * anchor_count, device=device).view(-1, 1),
            0
        )
        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        # An anchor without any positive (e.g. a class that occurs once with a
        # single view) has a zero numerator; divide by 1 there so it contributes
        # 0 instead of turning the whole loss into NaN.
        positives_per_anchor = mask.sum(1)
        positives_per_anchor = torch.where(
            positives_per_anchor < 1e-6,
            torch.ones_like(positives_per_anchor),
            positives_per_anchor)
        mean_log_prob_pos = (mask * log_prob).sum(1) / positives_per_anchor

        # loss
        loss = - mean_log_prob_pos
        loss = loss.view(anchor_count, batch_size).mean()

        return loss

In [None]:
from transformers import get_cosine_schedule_with_warmup

# Step budget: one scheduler step per training batch, for every epoch.
each_epoch_steps = len(dataloader)
epoches = config.EPOCHES
num_train_steps = config.EPOCHES*each_epoch_steps
num_warmup_steps = int(num_train_steps*config.num_warmup_rate)

# Losses: cross-entropy on the classifier output, supervised-contrastive on the features.
criterion = nn.CrossEntropyLoss()
contrast_criterion = SupConLoss()

# Two independent AdamW optimizers over the same parameters: `optimizer` drives
# the pseudo-label phase, `l_optimizer` the labelled contrastive phase.
optimizer, l_optimizer = (
    optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-6)
    for _ in range(2)
)

## scheduler
# One cosine schedule with warm-up per optimizer, sharing the same step budget.
scheduler, l_scheduler = (
    get_cosine_schedule_with_warmup(
        opt,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps,
        num_cycles=0.5,
    )
    for opt in (optimizer, l_optimizer)
)



# Train

In [None]:
from sklearn.cluster import KMeans

# Width of the feature vectors returned by the model (first element of its
# output); matches the 768 input features of BERTsemi.fc1.
feat_dim = 768

In [None]:
# from tqdm import tqdm
from tqdm.notebook import tqdm

def get_outputs(mode, model):
    """Run `model` in eval mode over one of the notebook's loaders and collect its outputs.

    Args:
        mode: 'train' to iterate the global `dataloader`, 'test' to iterate
            the global `val_dataloader`.
        model: module returning ``(features, logits)`` for
            ``(input_ids, attention_mask)``.

    Returns:
        dict of numpy arrays with keys 'y_true' (argmax of the one-hot labels,
        as float), 'y_pred' (argmax of the model scores), 'logits' and 'feats'.
        Rows follow the iteration order of the chosen loader, so for a
        shuffling loader they are NOT in dataset order.

    Raises:
        ValueError: if `mode` is neither 'train' nor 'test'.
    """
    if mode == 'test':
        dataloader_ = val_dataloader
    elif mode == 'train':
        dataloader_ = dataloader
    else:
        # Previously fell through and failed later with an unbound local.
        raise ValueError(f"Unknown mode: {mode!r} (expected 'train' or 'test')")
    model.eval()

    # Collect per-batch results on the CPU and concatenate once at the end;
    # growing a tensor with torch.cat every step re-copies everything each time.
    label_chunks, feature_chunks, logit_chunks = [], [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(dataloader_, desc="Iteration"):
            input_ids = input_ids.to(device, dtype=torch.long)
            attention_mask = attention_mask.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)

            feats, logits = model(input_ids, attention_mask)

            label_chunks.append(labels.argmax(dim=1).cpu())
            feature_chunks.append(feats.cpu())
            logit_chunks.append(logits.cpu())

    if feature_chunks:
        # Labels are reported as float to keep the dtype callers have been receiving.
        total_labels = torch.cat(label_chunks).to(torch.float)
        total_features = torch.cat(feature_chunks)
        total_logits = torch.cat(logit_chunks)
    else:
        # Empty loader: return correctly shaped empty arrays.
        total_labels = torch.empty(0, dtype=torch.float)
        total_features = torch.empty((0, feat_dim))
        total_logits = torch.empty((0, num_classes))

    # Softmax is monotonic, so the predicted class is simply the argmax of the scores.
    total_preds = total_logits.argmax(dim=1)

    outputs = {
        'y_true': total_labels.numpy(),
        'y_pred': total_preds.numpy(),
        'logits': total_logits.numpy(),
        'feats': total_features.numpy()
    }
    return outputs

In [None]:
## dui
def clustering(model, n_clusters=None):
    """Cluster the training features with k-means and derive pseudo labels.

    Args:
        model: forwarded to `get_outputs(mode='train', ...)`.
        n_clusters: number of k-means clusters; defaults to the global
            `num_classes` when omitted.

    Returns:
        (outputs, km_centroids, y_true, assign_labels, pseudo_labels) where
        `outputs` is the dict from `get_outputs`, `km_centroids` are the fitted
        cluster centres, `y_true` the collected labels, `assign_labels` the raw
        k-means assignments and `pseudo_labels` the same assignments as int64.
        All per-sample arrays are in the order `get_outputs` produced them.
    """
    if n_clusters is None:
        n_clusters = num_classes

    outputs = get_outputs(mode = 'train', model = model)
    feats = outputs['feats']
    y_true = outputs['y_true']

    # The per-class centres of labelled samples and a GPU copy of the centroids
    # used to be computed here but were never read, so both were dropped.
    km = KMeans(n_clusters = n_clusters, random_state=config.seed, init = 'k-means++').fit(feats)
    km_centroids, assign_labels = km.cluster_centers_, km.labels_

    pseudo_labels = assign_labels.astype(np.int64)

    return outputs, km_centroids, y_true, assign_labels, pseudo_labels

In [None]:
last_preds = None  # reserved for the (currently disabled) early-stopping check below

# The pseudo labels returned by clustering() are ordered like the batches the
# training loader produced while the features were extracted. That loader
# shuffles, so the second pass below has to replay exactly the same permutation;
# otherwise every pseudo label is attached to an unrelated sample. Give the
# loader's sampler its own generator and re-seed it before both passes.
shuffle_generator = torch.Generator()
shuffle_generator.manual_seed(config.seed)
dataloader.sampler.generator = shuffle_generator

for epoch in range(epoches):
    st = time.time()

    ## Phase 1: supervised contrastive training on the given labels.
    model.train()
    for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.float)

        # Two forward passes of the same batch; dropout makes them two different views.
        feats_a, logits_a = model(input_ids, attention_mask)
        feats_b, logits_b = model(input_ids, attention_mask)

        norm_feats_a = F.normalize(feats_a)
        norm_feats_b = F.normalize(feats_b)
        constrastive_feats = torch.stack((norm_feats_a, norm_feats_b), dim=1)

        # Contrastive loss; labels are passed, so this is the supervised form
        # (https://arxiv.org/pdf/2004.11362.pdf), not the label-free SimCLR one.
        loss_contrast = contrast_criterion(constrastive_feats, labels = labels.argmax(dim=1), temperature = 0.07, device = device)

        loss = loss_contrast

        loss.backward()
        l_optimizer.step()
        l_optimizer.zero_grad()
        l_scheduler.step()

    ## Phase 2: refresh centroids and pseudo labels.
    epoch_shuffle_seed = config.seed + epoch
    shuffle_generator.manual_seed(epoch_shuffle_seed)
    outputs, km_centroids, y_true, assign_labels, pseudo_labels = clustering(model)

    current_preds = pseudo_labels

    ## The share of pseudo labels that changed between two rounds could drive
    ## early stopping; not used at the moment.
#     delta_label = np.sum(current_preds != last_preds).astype(np.float32)/ current_preds.shape[0]
#     last_preds = np.copy(current_preds)

    ## Centroid-guided contrastive training (the labels fed in are pseudo labels).
    losses2 = []
    model.train()
    # Same seed as for the clustering pass => same batch order => slice i of
    # `pseudo_labels` belongs to batch i.
    shuffle_generator.manual_seed(epoch_shuffle_seed)
    for i, (input_ids, attention_mask, labels) in enumerate(dataloader):
        labels_ = torch.tensor(pseudo_labels[batch_size*i:batch_size*(i+1)]).to(device)
        input_ids = input_ids.to(device, dtype=torch.long)
        attention_mask = attention_mask.to(device, dtype=torch.long)

        # Two stochastic views of the batch, as in phase 1.
        feats_a, logits_a = model(input_ids, attention_mask)
        feats_b, logits_b = model(input_ids, attention_mask)

        norm_feats_a = F.normalize(feats_a)
        norm_feats_b = F.normalize(feats_b)

        ## Contrastive loss against the pseudo labels.
        constrastive_feats = torch.stack((norm_feats_a, norm_feats_b), dim=1)
        loss_contrast = contrast_criterion(constrastive_feats, labels = labels_, temperature = 0.07, device = device)

        ## Cross-entropy between the classifier output and the pseudo labels
        ## (class-index targets; same value as the one-hot form).
        loss_ce = 0.5 * (criterion(logits_a, labels_) + criterion(logits_b, labels_))

        loss = loss_contrast + loss_ce

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses2.append(loss.item())
        scheduler.step()
    ed = time.time()
    print(f'[Epoch {epoch+1}/{epoches}] Train Loss: {np.mean(losses2):.2f}, time: {ed-st:.0f}s')
    # wandb: only log when a run is active, so a failed wandb.init() does not
    # abort training at the end of the first epoch.
    if wandb.run is not None:
        wandb.log({
            "Epoch": epoch+1,
            "avg_train_loss": np.mean(losses2),
        })

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 91.75 MiB is free. Process 11160 has 5.08 GiB memory in use. Including non-PyTorch memory, this process has 18.49 GiB memory in use. Of the allocated memory 17.96 GiB is allocated by PyTorch, and 228.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF