In [1]:
import sys
import os
from model import BertEmbeddings
from model import BertAttention
from model import BertLayer
from model import BertEncoder
from model import BertModel
from model import BertConfig
import torch
from utils.log_helper import logger_init

## Data Preprocessing Principles
In single-sentence classification, data preprocessing transforms raw text into model-ready tensor inputs. Key principles include:

1. **Tokenization**: Use BERT’s WordPiece tokenizer to split sentences into subword tokens; out-of-vocabulary words are marked as `[UNK]` to maintain a fixed vocabulary size.

In [2]:
from transformers import BertTokenizer
from Tasks.TaskForSingleSentenceClassification import ModelConfig

model_config = ModelConfig()
tokenizer = BertTokenizer.from_pretrained(model_config.pretrained_model_dir).tokenize
print(tokenizer("青山不改，绿水长流，我们月来客栈见！"))
print(tokenizer("10年前的今天，纪念5.12汶川大地震10周年"))

['青', '山', '不', '改', '，', '绿', '水', '长', '流', '，', '我', '们', '月', '来', '客', '栈', '见', '！']
['10', '年', '前', '的', '今', '天', '，', '纪', '念', '5', '.', '12', '汶', '川', '大', '地', '震', '10', '周', '年']



**I used bert-base-chinese model from HuggingFace,This model has been pre-trained for Chinese, training and random input masking has been applied independently to word pieces (as in the original BERT paper).Here is a glimpse at the dictionnary:{word:index}**


In [3]:
from utils.data_helpers import Vocab,build_vocab
import itertools
vocab=build_vocab("bert_base_chinese/vocab.txt")
def get_first_100_items(dictionary):
    return dict(itertools.islice(dictionary.items(), 1000))

example_dict = {i: f"value_{i}" for i in range(200)}
first_1000_items = get_first_100_items(vocab.stoi)
print(first_1000_items)

{'[PAD]': 0, '[unused1]': 1, '[unused2]': 2, '[unused3]': 3, '[unused4]': 4, '[unused5]': 5, '[unused6]': 6, '[unused7]': 7, '[unused8]': 8, '[unused9]': 9, '[unused10]': 10, '[unused11]': 11, '[unused12]': 12, '[unused13]': 13, '[unused14]': 14, '[unused15]': 15, '[unused16]': 16, '[unused17]': 17, '[unused18]': 18, '[unused19]': 19, '[unused20]': 20, '[unused21]': 21, '[unused22]': 22, '[unused23]': 23, '[unused24]': 24, '[unused25]': 25, '[unused26]': 26, '[unused27]': 27, '[unused28]': 28, '[unused29]': 29, '[unused30]': 30, '[unused31]': 31, '[unused32]': 32, '[unused33]': 33, '[unused34]': 34, '[unused35]': 35, '[unused36]': 36, '[unused37]': 37, '[unused38]': 38, '[unused39]': 39, '[unused40]': 40, '[unused41]': 41, '[unused42]': 42, '[unused43]': 43, '[unused44]': 44, '[unused45]': 45, '[unused46]': 46, '[unused47]': 47, '[unused48]': 48, '[unused49]': 49, '[unused50]': 50, '[unused51]': 51, '[unused52]': 52, '[unused53]': 53, '[unused54]': 54, '[unused55]': 55, '[unused56]': 5

2. **Special Tokens**: Prepend `[CLS]`(with index 101) at the beginning of each sequence to aggregate sentence-level features, and append `[SEP]`(with index 102) at the end to signal input boundaries.
3. **Token-to-ID Mapping**: Convert each token into its corresponding integer `token_id` using the vocabulary (`vocab.txt`), forming the input ID sequence.

In [4]:
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertTokenizer
from Tasks.TaskForSingleSentenceClassification import BertConfig

class Load1SingleSentenceClassificationDataset:
    def __init__(self,
                 vocab_path='bert_base_chinese/vocab.txt',  #
                 tokenizer=None,
                 batch_size=32,
                 max_sen_len=None,
                 split_sep='_!_',
                 max_position_embeddings=512,
                 pad_index=0,
                 is_sample_shuffle=True
                 ):
        model_config = ModelConfig()
        self.tokenizer = BertTokenizer.from_pretrained(model_config.pretrained_model_dir).tokenize
        self.vocab = build_vocab(vocab_path)
        self.PAD_IDX = pad_index
        self.SEP_IDX = self.vocab['[SEP]']
        self.CLS_IDX = self.vocab['[CLS]']
        self.batch_size = batch_size
        self.split_sep = split_sep
        self.max_position_embeddings = max_position_embeddings
        if isinstance(max_sen_len, int) and max_sen_len > max_position_embeddings:
            max_sen_len = max_position_embeddings
        self.max_sen_len = max_sen_len
        self.is_sample_shuffle = is_sample_shuffle
    def data_process(self, file_path=None):
        """
        """
        raw_iter = open(file_path, encoding="utf8").readlines()
        data = []
        max_len = 0
        for raw in tqdm(raw_iter, ncols=80):
            line = raw.rstrip("\n").split(self.split_sep)
            print(line)
            s, l = line[0], line[1]
            tmp = [self.CLS_IDX] + [self.vocab[token] for token in self.tokenizer(s)]
            if len(tmp) > self.max_position_embeddings - 1:
                tmp = tmp[:self.max_position_embeddings - 1]  # BERT预训练模型只取前512个字符
            tmp += [self.SEP_IDX]
            tensor_ = torch.tensor(tmp, dtype=torch.long)
            l = torch.tensor(int(l), dtype=torch.long)
            max_len = max(max_len, tensor_.size(0))
            data.append((tensor_))
        return data

In [5]:
sentence_class=Load1SingleSentenceClassificationDataset()
data=sentence_class.data_process("data/SingleSentenceClassification/jupyter_test.txt")
data

100%|███████████████████████████████████████████| 6/6 [00:00<00:00, 2997.00it/s]

['张艺兴黄金瞳片场，导演能给个合适的帽子不？', '2']
['故宫如何修文物？文物医院下月向公众开放', '1']
['深圳房价是沈阳6倍就是因为经济？错！', '5']
['不负春光，樱花树下；温暖你我，温暖龙岩', '10']
['二胡，如何对？', '2']
['轻松一刻：带你看全球最噩梦监狱，每天进几百人，审讯时已过几年', '11']





[tensor([ 101, 2476, 5686, 1069, 7942, 7032, 4749, 4275, 1767, 8024, 2193, 4028,
         5543, 5314,  702, 1394, 6844, 4638, 2384, 2094,  679, 8043,  102]),
 tensor([ 101, 3125, 2151, 1963,  862,  934, 3152, 4289, 8043, 3152, 4289, 1278,
         7368,  678, 3299, 1403, 1062,  830, 2458, 3123,  102]),
 tensor([ 101, 3918, 1766, 2791,  817, 3221, 3755, 7345,  127,  945, 2218, 3221,
         1728,  711, 5307, 3845, 8043, 7231, 8013,  102]),
 tensor([ 101,  679, 6566, 3217, 1045, 8024, 3569, 5709, 3409,  678, 8039, 3946,
         3265,  872, 2769, 8024, 3946, 3265, 7987, 2272,  102]),
 tensor([ 101,  753, 5529, 8024, 1963,  862, 2190, 8043,  102]),
 tensor([ 101, 6768, 3351,  671, 1174, 8038, 2372,  872, 4692, 1059, 4413, 3297,
         1691, 3457, 4664, 4328, 8024, 3680, 1921, 6822, 1126, 4636,  782, 8024,
         2144, 6380, 3198, 2347, 6814, 1126, 2399,  102])]

4. **Truncation & Padding & Padding Mask**:
   - **Truncation**: If the sequence length exceeds `max_position_embeddings - 1`, truncate to fit the position embedding limit.
   - **Padding**: If the sequence is shorter, pad with `pad_index` tokens so that all sequences in a batch share the same length.
   - **Padding Mask**:After obtaining `input_ids`, generate a binary mask `attention_mask = (input_ids == loader.PAD_IDX)` where `1` indicates padded positions (to be masked out) and `0` indicates valid tokens. This mask ensures the model ignores padding during attention computations.

In [6]:
def pad_sequence(sequences, batch_first=False, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max([s.size(0) for s in sequences])
    out_tensors = []
    for tensor in sequences:
        if tensor.size(0) < max_len:
            tensor = torch.cat([tensor, torch.tensor([padding_value] * (max_len - tensor.size(0)))], dim=0)
        else:
            tensor = tensor[:max_len]
        out_tensors.append(tensor)
    out_tensors = torch.stack(out_tensors, dim=1)
    if batch_first:
        return out_tensors.transpose(0, 1)
    return out_tensors

In [7]:
pad=pad_sequence(data, max_len=None).transpose(1,0)
pad

tensor([[ 101, 2476, 5686, 1069, 7942, 7032, 4749, 4275, 1767, 8024, 2193, 4028,
         5543, 5314,  702, 1394, 6844, 4638, 2384, 2094,  679, 8043,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 3125, 2151, 1963,  862,  934, 3152, 4289, 8043, 3152, 4289, 1278,
         7368,  678, 3299, 1403, 1062,  830, 2458, 3123,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 3918, 1766, 2791,  817, 3221, 3755, 7345,  127,  945, 2218, 3221,
         1728,  711, 5307, 3845, 8043, 7231, 8013,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101,  679, 6566, 3217, 1045, 8024, 3569, 5709, 3409,  678, 8039, 3946,
         3265,  872, 2769, 8024, 3946, 3265, 7987, 2272,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101,  753, 5529, 8024, 1963,  862, 2190, 8043,  102,    0,    0,    0,
            0,    0,    0,    0,    0, 

In [8]:
pad_list=[]
for sample in pad_sequence(data, max_len=None).transpose(1,0):
    print(sample.shape)  # [seq_len,batch_size]
    padding_mask = (sample == sentence_class.PAD_IDX)
    pad_list.append(padding_mask)
mask = torch.stack(pad_list, dim=0)
mask

torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])


tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False,  True,  True,  True,  True,  True,  True,  True,
          True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False,  True,  True,  True,  True,  True,  True,  

## Embedding
### TokenEmbedding
This cell validates the **TokenEmbedding** layer, which is the first step in BERT’s embedding pipeline. Key points:

1. **Embedding Matrix**: BERT’s token embedding is a learnable matrix of shape `[src_len, hidden_size]`,where hidden_size is 768 here. Each row corresponds to a token’s embedding vector, initialized randomly and refined through pre-training.
2. **Subword Inputs**: Inputs to this layer are token IDs generated by WordPiece tokenization, where rare or unknown words are split into subword units or mapped to `[UNK]`.
3. **Lookup Operation**: Given `src` tensor of shape `[src_len, batch_size]`, the layer performs a lookup for each ID, producing a dense representation of shape `[src_len, batch_size, hidden_size]`.

In [9]:
from model import TokenEmbedding

json_file = 'bert_base_chinese/config.json'
config = BertConfig.from_json_file(json_file)
src = pad
src = src.transpose(0, 1)  # [src_len, batch_size]

token_embedding = TokenEmbedding(vocab_size=config.vocab_size, hidden_size=config.hidden_size)
t_embedding = token_embedding(input_ids=src)
print("***** --------- test TokenEmbedding ------------")
print("input_token shape [src_len,batch_size]: ", src.shape)
print(f"input_token embedding shape [src_len,batch_size,hidden_size]: {t_embedding.shape}\n")
print(t_embedding)

***** --------- test TokenEmbedding ------------
input_token shape [src_len,batch_size]:  torch.Size([32, 6])
input_token embedding shape [src_len,batch_size,hidden_size]: torch.Size([32, 6, 768])

tensor([[[ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066],
         [ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066],
         [ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066],
         [ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066],
         [ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066],
         [ 0.0075,  0.0029, -0.0010,  ...,  0.0143, -0.0421, -0.0066]],

        [[-0.0256, -0.0010, -0.0138,  ...,  0.0517, -0.0053, -0.0330],
         [-0.0130,  0.0355, -0.0006,  ...,  0.0020,  0.0105, -0.0442],
         [ 0.0239, -0.0121,  0.0064,  ..., -0.0078,  0.0319,  0.0045],
         [ 0.0158,  0.0094,  0.0117,  ...,  0.0343, -0.0256, -0.0114],
         [ 0.0127, -0.0028,  0.0413,  ..., -0.0143, -0.0518, -0.0062],
         [-0.0022, 

### Positional Embedding

BERT uses **learned positional embeddings** to encode token order, differing from fixed sinusoidal methods. The core concepts are:

1. **Embedding Matrix**
   - Shape: `[max_position_embeddings, hidden_size]`.
   - Rows correspond to position indices from `0` to `max_position_embeddings-1`, each initialized randomly and fine-tuned during pre-training.

2. **Position ID Tensor**
   - Generated by `torch.arange(src_len).unsqueeze(0)`, producing a tensor of shape `[1, src_len]`.
   - Assigns each token an absolute position index for lookup.

3. **Lookup Operation**
   - Feeding `position_ids` into the `PositionalEmbedding` layer returns a tensor of shape `[src_len, 1, hidden_size]`.
   - Each vector aligns with its corresponding token across the sequence.

In [10]:
from model import PositionalEmbedding

position_ids = torch.arange(pad[0].size()[0]).expand((1, -1))
print(position_ids)
pos_embedding = PositionalEmbedding(max_position_embeddings=32,
                                        hidden_size=8)
p_embedding = pos_embedding(position_ids=position_ids)
    # print(pos_embedding.embedding.weight)  # embedding 矩阵
    # print(p_embedding)  # positional embedding 结果,
print("***** --------- test PositionalEmbedding ------------")
print("position_ids shape [1,src_len]: ", position_ids.shape)
print(f"pos embedding shape [src_len, 1, hidden_size]: {p_embedding.shape}\n")
print(p_embedding)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]])
***** --------- test PositionalEmbedding ------------
position_ids shape [1,src_len]:  torch.Size([1, 32])
pos embedding shape [src_len, 1, hidden_size]: torch.Size([32, 1, 8])

tensor([[[ 3.5126e-02,  1.9238e-02, -9.9565e-03, -1.0535e-02,  1.3769e-02,
           5.3469e-03,  2.4950e-03,  1.0606e-02]],

        [[ 4.8951e-03,  2.1725e-02,  3.0443e-02, -6.9817e-03, -2.5306e-02,
           2.9762e-03, -2.7192e-02,  1.2896e-02]],

        [[-3.0778e-02, -1.6687e-02,  2.6079e-03,  1.5348e-02, -2.4240e-02,
           8.0362e-03,  4.1460e-02, -7.2073e-03]],

        [[ 1.0647e-03,  2.9670e-02, -2.1668e-02, -2.0692e-04, -4.9818e-03,
          -1.6159e-03, -4.8928e-03, -1.6830e-02]],

        [[-2.2703e-02, -5.7539e-03, -1.1352e-02, -2.1815e-02,  2.0396e-03,
           1.6219e-02, -1.2897e-02,  8.0958e-03]],

        [[-5.4971e-03,  9.7179e-03,  1.

### Embeddings Intergration
The `BertEmbeddings` class integrates three learnable embedding modules to produce the input representations for BERT’s Transformer encoder:

1. **Token Embedding**
   - Learns a mapping from token IDs to dense vectors of shape `[src_len, hidden_size]`.
   - Captures semantic information of individual subword tokens from pre-training.

2. **Positional Embedding**
   - A learnable matrix of shape `[src_len, hidden_size]`.
   - Encodes absolute token positions, enabling the model to distinguish order without recurrence.

3. **Segment (Token Type) Embedding**
   - Differentiates sentence segments (e.g., Sentence A vs. Sentence B) in tasks requiring pair inputs.
   - Due to the single sentence task,I set this to all 0.

**Embedding Fusion**
- For each token position `i` in a batch, the final embedding is computed as:
  ```text
  E[i] = TokenEmb(src_id[i])
       + PosEmb(position_id[i])
       + SegEmb(token_type_id[i])

In [11]:
json_file = 'bert_base_chinese/config.json'
config = BertConfig.from_json_file(json_file)
config.__dict__['use_torch_multi_head'] = True
config.max_position_embeddings = 518
src = pad
src = src.transpose(0, 1)  # [src_len, batch_size]
print(f"input shape [src_len,batch_size]: ", src.shape)
token_type_ids = torch.where(pad == 0, torch.zeros_like(pad), torch.zeros_like(pad)).transpose(0,1)
attention_mask = mask
#
# # ------ BertEmbedding -------
bert_embedding = BertEmbeddings(config)
bert_embedding_result = bert_embedding(src, token_type_ids=token_type_ids)
bert_embedding_result

input shape [src_len,batch_size]:  torch.Size([32, 6])


tensor([[[ 0.4861, -1.6001, -0.5305,  ..., -1.5493,  0.3996,  0.6168],
         [ 0.0000, -1.6001, -0.5305,  ..., -1.5493,  0.0000,  0.0000],
         [ 0.4861, -1.6001, -0.5305,  ..., -1.5493,  0.3996,  0.6168],
         [ 0.4861, -1.6001, -0.5305,  ..., -0.0000,  0.3996,  0.6168],
         [ 0.4861, -1.6001, -0.5305,  ..., -1.5493,  0.3996,  0.6168],
         [ 0.4861, -1.6001, -0.5305,  ..., -1.5493,  0.3996,  0.6168]],

        [[ 0.0363,  0.5027, -0.4244,  ...,  0.8577,  1.0641,  0.1094],
         [-0.4226, -1.8004,  0.2319,  ...,  0.2035,  0.5910, -0.3899],
         [-0.3888, -1.3964,  0.2576,  ...,  1.4695,  2.3333,  0.3745],
         [ 0.8243, -0.9381,  0.7733,  ...,  0.6079,  1.6117,  0.1390],
         [ 0.9745, -0.1452,  1.0488,  ..., -0.3485,  3.7819,  0.0000],
         [ 0.9964,  0.1232,  1.9226,  ...,  0.2989,  1.4622, -0.0000]],

        [[ 0.7146, -0.2857, -1.1186,  ..., -0.5450,  1.4823, -0.8822],
         [ 0.3286, -1.5011, -0.8225,  ...,  0.0000,  1.6215,  1.0085],
  

# BERT Components
This section validates BERT’s core Transformer blocks by testing self-attention, single-layer, and full encoder stack. Key BERT-specific insights:

1. **Multi-Head Self-Attention (`BertAttention`)**
   - **Query/Key/Value Projections**: Inputs of shape `[src_len, batch_size, hidden_size]` are linearly projected into Q, K, V tensors for each of the `num_attention_heads` heads.
   - **Scaled Dot-Product**: Each head computes attention weights via `softmax((Q·K^T) / sqrt(d_k))`, focusing on context tokens.
   - **Head Concatenation**: Outputs from all heads are concatenated and passed through a final linear layer, then added back to the input via a residual connection.
   - **LayerNorm & Dropout**: Normalizes post-attention sums and applies dropout for regularization.

2. **Transformer Block (`BertLayer`)**
   - **Attention Sub-Layer**: As above, produces contextualized embeddings.
   - **Feed-Forward Sub-Layer**: Two-layer MLP (`BertIntermediate` + `BertOutput`) with activation (GELU) projects to `intermediate_size` and back to `hidden_size`.
   - **Residual & LayerNorm**: Each sub-layer uses a residual connection followed by LayerNorm to preserve gradients and stabilize training.

3. **Encoder Stack (`BertEncoder`)**
   - **Layer Stacking**: BERT stacks `num_hidden_layers` identical `BertLayer` modules (e.g., 12 layers in base configuration).
   - **Hidden States Output**: Returns outputs of each layer, enabling introspection of hidden representations at various depths.

**Practical Implications**
- Multi-head attention enables BERT to jointly attend to information from different representation subspaces at different positions.
- Deep stacking captures hierarchical linguistic features: lower layers model local syntax, higher layers abstract semantics.
- Residual connections and LayerNorm ensure stable gradient flow and effective pre-training over long sequences.

In [12]:
bert_attention = BertAttention(config)
bert_attention_output = bert_attention(bert_embedding_result, attention_mask=attention_mask)
print(f"BertAttention output shape [src_len, batch_size, hidden_size]: ", bert_attention_output.shape)

bert_layer = BertLayer(config)
bert_layer_output = bert_layer(bert_embedding_result, attention_mask)

bert_encoder = BertEncoder(config)
bert_encoder_outputs = bert_encoder(bert_embedding_result, attention_mask)
print(f"num of BertEncoder [config.num_hidden_layers]: ", len(bert_encoder_outputs))
print(f"each output shape in BertEncoder [src_len, batch_size, hidden_size]: ", bert_encoder_outputs[0].shape)
print(bert_encoder_outputs)

BertAttention output shape [src_len, batch_size, hidden_size]:  torch.Size([32, 6, 768])
num of BertEncoder [config.num_hidden_layers]:  12
each output shape in BertEncoder [src_len, batch_size, hidden_size]:  torch.Size([32, 6, 768])
[tensor([[[ 4.7121e-01, -1.3970e+00, -6.5509e-01,  ..., -1.3581e+00,
           1.1114e-01,  8.1716e-01],
         [ 5.8348e-02, -1.2759e+00, -4.4585e-01,  ..., -1.2719e+00,
          -2.0997e-01,  1.4221e-01],
         [ 2.2763e-01, -1.2297e+00, -5.5650e-01,  ..., -1.1705e+00,
           1.6773e-01,  6.3508e-01],
         [ 5.2786e-02, -1.3453e+00, -6.5582e-01,  ...,  3.0606e-01,
           1.3006e-01,  7.3074e-01],
         [ 4.6505e-01, -9.9038e-01, -4.2832e-01,  ..., -1.4094e+00,
           7.8336e-02,  7.2582e-01],
         [ 2.5764e-01, -1.3428e+00, -6.3295e-01,  ..., -1.0395e+00,
          -1.0489e-02,  3.8393e-01]],

        [[-2.0534e-01,  4.5511e-01, -6.9092e-02,  ...,  1.1451e+00,
           8.4206e-01,  4.7748e-01],
         [-7.5937e-01, -1.6

**This snippet demonstrates how BERT’s final pooled representations are fed into the classification head, and how to visualize the model graph with TensorBoard:**

In [13]:
from torch.utils.tensorboard import SummaryWriter
from model import BertForSentenceClassification
config.__dict__['num_labels'] = 16
config.__dict__['num_hidden_layers'] = 3
model = BertForSentenceClassification(config)

input_ids = src
attention_mask =mask  # [batch_size,src_len]
logits = model(input_ids=input_ids,
                   attention_mask=attention_mask)
print(logits.shape)
writer = SummaryWriter('./runs')
writer.add_graph(model, input_ids)

torch.Size([6, 16])


In [14]:
%load_ext tensorboard

In [15]:
%tensorboard --logdir runs --port 6006

Launching TensorBoard...

## Pretrained model loading
### My BERT Model Parameters Inspection
This code snippet loads the BERT configuration and instantiates the `BertModel` to inspect its learnable parameters:

1. **Configuration Loading**
   - Reads hyperparameters such as `hidden_size`, `num_hidden_layers`, and `vocab_size` from `config.json`.
2. **Model Initialization**
   - Calls `BertModel(config)` to build the full model architecture, including embeddings, encoder layers, and pooler.
3. **Parameter Enumeration**
   - `state_dict()` returns a mapping of parameter names to tensors (weights and biases).
   - `len(state_dict)` gives the total count of parameter tensors in the model.
   - Iterating through `state_dict.items()` prints each parameter’s name and its shape, providing insight into layer dimensions and verifying alignment with BERT’s base configuration.

This inspection is useful for debugging custom implementations and ensuring that parameter counts and shapes match expected values.

In [16]:
json_file = 'bert_base_chinese/config.json'
config = BertConfig.from_json_file(json_file)
bert_model = BertModel(config)
print("\n  =======  MyBert Parameters: ========")
print(len(bert_model.state_dict()))
for param_tensor in bert_model.state_dict():
    print(param_tensor, "\t", bert_model.state_dict()[param_tensor].size())


200
bert_embeddings.position_ids 	 torch.Size([1, 512])
bert_embeddings.word_embeddings.embedding.weight 	 torch.Size([21128, 768])
bert_embeddings.position_embeddings.embedding.weight 	 torch.Size([512, 768])
bert_embeddings.token_type_embeddings.embedding.weight 	 torch.Size([2, 768])
bert_embeddings.LayerNorm.weight 	 torch.Size([768])
bert_embeddings.LayerNorm.bias 	 torch.Size([768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.q_proj.weight 	 torch.Size([768, 768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.q_proj.bias 	 torch.Size([768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.k_proj.weight 	 torch.Size([768, 768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.k_proj.bias 	 torch.Size([768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.v_proj.weight 	 torch.Size([768, 768])
bert_encoder.bert_layers.0.bert_attention.self.multi_head_attention.v_proj.bias 	 torc

### Pretrained BERT Model Parameters Inspection
This code snippet loads the bert-base chinese BERT configuration to inspect its learnable parameters:

1. **Configuration Loading**
   - Reads hyperparameters such as `hidden_size`, `num_hidden_layers`, and `vocab_size` from `/pytorch_model.bin`.
   - The parameter pytorch_model.bin is loaded as an ordered dictionary OrderedDict, and there are 207 parameters whose names are the elements of the list.
2. **Comparison**
   - My Bert has a total of 200 parameters, while bert-base-chinese has a total of 207 parameters.It should be noted here that the parameter position_ids in my bert model is not a parameter that needs to be trained in the model, it is just a default initial value.Finally, after analysing (comparing the two one by one), we found that the 199 parameters in bert-base-chinese are the same as the 199 parameters in my bert model and in the same order, except for the last 8 parameters.


In [17]:
loaded_paras = torch.load('bert_base_chinese/pytorch_model.bin')
print(type(loaded_paras))
print(len(list(loaded_paras.keys())))
print(list(loaded_paras.keys()))

<class 'collections.OrderedDict'>
207
['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.gamma', 'bert.embeddings.LayerNorm.beta', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.gamma', 'bert.encoder.layer.0.attention.output.LayerNorm.beta', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.gamma', 'ber

## Pretrained Parameter Loading (`from_pretrained`)
This code illustrates how BERT’s pretrained weights are loaded into a custom model implementation while handling differences in attention mechanics and position embedding sizes:

1. **Model Instantiation**
   - `cls(config)` creates a fresh `BertModel` with randomly initialized parameters based on the given `config`.
2. **Checkpoint Path Resolution**
   - Constructs `pytorch_model.bin` path under `pretrained_model_dir`. Raises an error if the file is missing.
3. **Parameter Extraction**
   - Loads the binary checkpoint into `loaded_paras`, containing state dictionaries from the original BERT pretraining.
   - Separates out the last pooler parameters (skipped via `[:-8]`).
4. **Torch Multi-Head Handling**
   - If `config.use_torch_multi_head` is `True`, merges separate query/key/value weight tensors into one concatenated tensor via `format_paras_for_torch` to match `nn.MultiheadAttention` expectations.
5. **Position Embedding Extension**
   - Checks for `position_embeddings` parameters. If `max_position_embeddings > 512`, applies `replace_512_position` to extend the embedding table beyond the original 512 tokens, preserving pretrained values.
6. **State Dict Assignment**
   - Iterates through the custom model’s parameter names, assigning each with the corresponding tensor from `loaded_paras` or the processed `torch_paras`.
7. **Finalization**
   - Loads the assembled `state_dict` into the model via `load_state_dict`, completing the injection of pretrained weights.

This procedure ensures compatibility between pretrained BERT checkpoints and custom implementations, even when extending sequence length or switching attention backends.

In [18]:
from copy import deepcopy
import os
from model.BasicBert.Bert import format_paras_for_torch, replace_512_position


def from_pretrained(cls, config, pretrained_model_dir=None):
    model = cls(config)  # 初始化模型，cls为未实例化的对象，即一个未实例化的BertModel对象
    pretrained_model_path = os.path.join(pretrained_model_dir, "pytorch_model.bin")
    if not os.path.exists(pretrained_model_path):
        raise ValueError()
    loaded_paras = torch.load(pretrained_model_path,weights_only=True)
    state_dict = deepcopy(model.state_dict())
    loaded_paras_names = list(loaded_paras.keys())[:-8]
    model_paras_names = list(state_dict.keys())[1:]
    if 'use_torch_multi_head' in config.__dict__ and config.use_torch_multi_head:
        torch_paras = format_paras_for_torch(loaded_paras_names, loaded_paras)
        for i in range(len(model_paras_names)):
            if "position_embeddings" in model_paras_names[i]:
                if config.max_position_embeddings > 512:
                    new_embedding = replace_512_position(state_dict[model_paras_names[i]],
                                                             loaded_paras[loaded_paras_names[i]])
                    state_dict[model_paras_names[i]] = new_embedding
                    continue
            state_dict[model_paras_names[i]] = torch_paras[i]
    else:
        for i in range(len(loaded_paras_names)):
            if "position_embeddings" in model_paras_names[i]:
                if config.max_position_embeddings > 512:
                    new_embedding = replace_512_position(state_dict[model_paras_names[i]],
                                                             loaded_paras[loaded_paras_names[i]])
                    state_dict[model_paras_names[i]] = new_embedding
                    continue
            state_dict[model_paras_names[i]] = loaded_paras[loaded_paras_names[i]]

    model.load_state_dict(state_dict)
    return model
print(f"\n  =======  test BertModel pretrained： ========")
model = BertModel.from_pretrained(config, pretrained_model_dir="bert_base_chinese")
model




BertModel(
  (bert_embeddings): BertEmbeddings(
    (word_embeddings): TokenEmbedding(
      (embedding): Embedding(21128, 768)
    )
    (position_embeddings): PositionalEmbedding(
      (embedding): Embedding(512, 768)
    )
    (token_type_embeddings): SegmentEmbedding(
      (embedding): Embedding(2, 768)
    )
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (bert_encoder): BertEncoder(
    (bert_layers): ModuleList(
      (0-11): 12 x BertLayer(
        (bert_attention): BertAttention(
          (self): BertSelfAttention(
            (multi_head_attention): MyMultiheadAttention(
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
          )

## Semantic loss
This function computes an additional loss to guide the model's learning of word semantics,using external knowledge from HowNet (a Chinese lexical resource). The goal is to encourage words with similar meanings (synonyms) to have similar representations, and words with opposite meanings (antonyms) to have dissimilar representations.
The semantic loss consists of two parts:
   1. **Synonym Loss**: Uses Mean Squared Error (MSE) to minimize the difference between the
       embeddings of synonyms, encouraging them to have similar representations.
   2. **Antonym Loss**: Uses margin-based loss to increase the distance between the embeddings
       of antonyms, encouraging them to have distinct representations.

In [19]:
import torch
import torch.nn.functional as F
def compute_semantic_loss(input_ids, hidden_states, vocab, lexicon, lamda=0.1, margin=1.0):

    # Get the shape of hidden_states: [seq_len, batch_size, hidden_size]
    seq_len, batch_size, H = hidden_states.shape
    loss_syn, loss_ant = 0.0, 0.0  # Initialize the synonym and antonym loss
    count_syn, count_ant = 0, 0  # Initialize the counters for synonyms and antonyms

    # Transpose hidden_states to [batch_size, seq_len, hidden_size] for easy indexing
    hs = hidden_states.transpose(0, 1)

    # Iterate through the batch (each example)
    for b in range(batch_size):
        tokens = [vocab.itos[i] for i in input_ids[:, b].tolist()]  # Get the tokens for this example

        # Iterate through each token to find synonyms and antonyms
        for i, tok in enumerate(tokens):
            # Get the list of synonyms and antonyms for the current token from the lexicon
            syns = lexicon.get_synonyms(tok)
            ants = lexicon.get_antonyms(tok)

            if syns:
                # For each synonym, calculate the Mean Squared Error (MSE) loss between the token and the synonym's embeddings
                for j, tok2 in enumerate(tokens):
                    if tok2 in syns:  # If the token is a synonym
                        v1, v2 = hs[b, i], hs[b, j]  # Get the embeddings for the token and its synonym
                        loss_syn += F.mse_loss(v1, v2)  # Add the MSE loss to the total synonym loss
                        count_syn += 1  # Count the number of synonym pairs
                        break  # Only consider the first matching synonym in the sentence

            if ants:
                # For each antonym, calculate the margin-based loss to increase the distance between antonyms' embeddings
                for j, tok2 in enumerate(tokens):
                    if tok2 in ants:  # If the token is an antonym
                        v1, v2 = hs[b, i], hs[b, j]  # Get the embeddings for the token and its antonym
                        dist = F.pairwise_distance(v1.unsqueeze(0), v2.unsqueeze(0))
                        loss_ant += torch.clamp(margin - dist, min=0.0).mean()  # Add the margin loss to the antonym loss
                        count_ant += 1  # Count the number of antonym pairs
                        break  # Only consider the first matching antonym in the sentence

    # Normalize the loss by the number of synonym and antonym pairs found
    if count_syn > 0:
        loss_syn = loss_syn / count_syn
    if count_ant > 0:
        loss_ant = loss_ant / count_ant

    # Return the weighted sum of synonym and antonym losses
    return lamda * (loss_syn + loss_ant)

## Training and Evaluation Routines
This section defines two core functions for model training and validation in the single-sentence classification task:

1. **Evaluation Function** (`evaluate`):
   - Switches model to inference mode with `model.eval()`, disabling dropout and gradient computation.
   - Iterates over `data_iter`, moves inputs (`x`) and labels (`y`) to the specified `device`.
   - Constructs a **padding mask** via `(x == PAD_IDX).transpose(0, 1)`, marking padded positions so they do not affect attention.
   - Computes `logits = model(x, attention_mask=padding_mask)`, selects the highest-scoring classes, and accumulates correct predictions.
   - Returns overall accuracy as `acc_sum / n`.

2. **Training Function** (`train`):
   - Instantiates `BertForSentenceClassification` with `config` and optionally loads an existing checkpoint from `model_save_path`.
   - Moves the model to `device` and sets it to training mode (`model.train()`).
   - Initializes an `Adam` optimizer on model parameters with a learning rate of 5e-5.
   - Creates a `LoadSingleSentenceClassificationDataset` loader and obtains `train_iter`, `val_iter`, and `test_iter`.
   - Loops over epochs:
     - For each batch, moves data to `device`, builds the padding mask, and calls `model(..., labels=label)`, which returns `(loss, logits)`.
     - Performs backpropagation (`loss.backward()`) and optimization step (`optimizer.step()`).
     - Accumulates batch losses to compute the epoch’s average loss.
   - Every `config.model_val_per_epoch` epochs, calls `evaluate(val_iter, ...)` to compute validation accuracy; if improved, saves the model state.

These routines encapsulate the full training lifecycle, from data loading and forward/backward passes to checkpointing based on validation performance.

#### PS:The training time is too long to here,please go to Task/TaskForSingleSentenceClassification.py,.pt model has saved to /cache

In [20]:
from model.HowNet import HowNetLexicon
import logging
import time
from utils import LoadSingleSentenceClassificationDataset
from model import BertForSentenceClassification
lexicon = HowNetLexicon()
λ_sem = 0.2
def evaluate(data_iter, model, device, PAD_IDX):
    model.eval()
    with torch.no_grad():
        acc_sum, n = 0.0, 0
        for x, y in data_iter:
            x, y = x.to(device), y.to(device)
            padding_mask = (x == PAD_IDX).transpose(0, 1)
            logits = model(x, attention_mask=padding_mask)
            acc_sum += (logits.argmax(1) == y).float().sum().item()
            n += len(y)
        model.train()
        return acc_sum / n
def train(config,λ_sem=0.1):
    model = BertForSentenceClassification(config,bert_pretrained_model_dir=None)
    model_save_path = os.path.join(config.model_save_dir, 'model.pt')
    if os.path.exists(model_save_path):
        loaded_paras = torch.load(model_save_path,map_location=torch.device('cpu'))
        model.load_state_dict(loaded_paras)
        logging.info("## load model......")
    model = model.to(config.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    model.train()
    bert_tokenize = BertTokenizer.from_pretrained(config.pretrained_model_dir).tokenize
    data_loader = LoadSingleSentenceClassificationDataset(vocab_path=config.vocab_path,
                                                          tokenizer=bert_tokenize,
                                                          batch_size=config.batch_size,
                                                          max_sen_len=config.max_sen_len,
                                                          split_sep=config.split_sep,
                                                          max_position_embeddings=config.max_position_embeddings,
                                                          pad_index=config.pad_token_id,
                                                          is_sample_shuffle=config.is_sample_shuffle)
    train_iter, test_iter, val_iter = data_loader.load_train_val_test_data(config.train_file_path,
                                                                           config.val_file_path,
                                                                           config.test_file_path)
    max_acc = 0
    for epoch in range(config.epochs):
        losses = 0
        start_time = time.time()
        for idx, (sample, label) in enumerate(train_iter):
            sample = sample.to(config.device)  # [src_len, batch_size]
            label = label.to(config.device)
            padding_mask = (sample == data_loader.PAD_IDX).transpose(0, 1)
            loss, logits = model(
                input_ids=sample,
                attention_mask=padding_mask,
                token_type_ids=None,
                position_ids=None,
                labels=label)
            sem_loss = compute_semantic_loss(
                input_ids=sample,
                hidden_states=torch.zeros([512,200,768]),  # Last hidden layer [seq_len, batch_size, hidden_size]
                vocab=data_loader.vocab,
                lexicon=lexicon,
                lamda=λ_sem
            )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses += loss.item()+sem_loss*λ_sem
            acc = (logits.argmax(1) == label).float().mean()
            if idx % 10 == 0:
                logging.info(f"Epoch: {epoch}, Batch[{idx}/{len(train_iter)}], "
                             f"Train loss :{loss.item():.3f}, Train acc: {acc:.3f}")
        end_time = time.time()
        train_loss = losses / len(train_iter)
        logging.info(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")
        if (epoch + 1) % config.model_val_per_epoch == 0:
            acc = evaluate(val_iter, model, config.device, data_loader.PAD_IDX)
            logging.info(f"Accuracy on val {acc:.3f}")
            if acc > max_acc:
                max_acc = acc
                torch.save(model.state_dict(), model_save_path)

## Prediction

In [21]:
from transformers import BertTokenizer,pipeline
_mapping_txt = """\
100 民生 故事 news_story
101 文化 文化 news_culture
102 娱乐 娱乐 news_entertainment
103 体育 体育 news_sports
104 财经 财经 news_finance
106 房产 房产 news_house
107 汽车 汽车 news_car
108 教育 教育 news_edu
109 科技 科技 news_tech
110 军事 军事 news_military
112 旅游 旅游 news_travel
113 国际 国际 news_world
114 证券 股票 stock
115 农业 三农 news_agriculture
116 电竞 游戏 news_game
"""
_category_map = []
for line in _mapping_txt.splitlines():
    code, zh1, zh2, en = line.strip().split()
    _category_map.append({
        "code": code,
        "zh"  : zh1 + zh2,
        "en"  : en
    })
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

def predict_and_translate(sentence: str):
    config = ModelConfig()
    device = config.device

    tokenizer = BertTokenizer.from_pretrained(config.pretrained_model_dir)
    loader = LoadSingleSentenceClassificationDataset(
        vocab_path=config.vocab_path,
        tokenizer=tokenizer.tokenize,
        batch_size=1,
        max_sen_len=config.max_sen_len,
        split_sep=config.split_sep,
        max_position_embeddings=config.max_position_embeddings,
        pad_index=config.pad_token_id,
        is_sample_shuffle=False
    )

    tokens = tokenizer.tokenize(sentence)
    ids = [loader.CLS_IDX] + [
        loader.vocab.stoi.get(t, loader.vocab.stoi[loader.vocab.UNK])
        for t in tokens
    ] + [loader.SEP_IDX]
    max_len = loader.max_position_embeddings - 1
    if len(ids) > max_len + 1:
        ids = ids[:max_len] + [loader.SEP_IDX]

    tensor = torch.tensor(ids, dtype=torch.long)
    batch_sent, _ = loader.generate_batch([(tensor, 0)])
    batch_sent = batch_sent.to(device)
    attention_mask = (batch_sent == loader.PAD_IDX).transpose(0, 1)
    model = BertForSentenceClassification(config, config.pretrained_model_dir)
    state_dict = torch.load(
        os.path.join(config.model_save_dir, 'model.pt'),
        map_location=device
    )
    model.load_state_dict(state_dict)
    model.to(device).eval()

    with torch.no_grad():
        logits = model(batch_sent, attention_mask=attention_mask)
        idx = logits.argmax(dim=1).item()
    item = _category_map[idx]
    class_result = f"{item['zh']} （{item['en']}，code={item['code']}）"

    translation = translator(sentence, max_length=256)[0]["translation_text"]

    return class_result, translation

Device set to use cpu


In [22]:
s = input("article title: \n")
print("Title is",s)
print("Prediction and translation", predict_and_translate(s))

Title is 
Prediction and translation ('文化文化 （news_culture，code=101）', "I don't think so.")
