In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import torch
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle

from transformers import (
    AutoModelForTokenClassification, 
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments, 
    Trainer,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import evaluate

In [None]:
def load_and_prepare_data():
    """Load the NER dataset and build the label vocabularies.

    Returns:
        tuple: ``(ds, tag_list, entity_index)`` where

        * ``ds`` is whatever ``load_dataset("doushabao4766/msra_ner_k_V3")``
          returns,
        * ``tag_list`` is the BIO tag list (hard-coded here, not read from
          the dataset),
        * ``entity_index`` maps each coarse entity type to its position,
          with ``'O'`` first and the remaining types in alphabetical order.
    """
    dataset = load_dataset("doushabao4766/msra_ner_k_V3")

    # BIO tag inventory for the PER / ORG / LOC entity types.
    bio_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

    # Strip the B-/I- prefix to get the distinct entity types, then put 'O' in front.
    distinct_types = {tag.split('-')[-1] for tag in bio_tags if tag != 'O'}
    ordered_types = ['O', *sorted(distinct_types)]

    entity_index = dict(zip(ordered_types, range(len(ordered_types))))

    return dataset, bio_tags, entity_index


In [None]:
def process_entity_tags(ds, tag_list):
    """Add an integer ``labels`` column derived from each example's ``ner_tags``.

    Args:
        ds: Object exposing ``map(fn)`` that applies ``fn`` to every example
            (a dict containing a ``ner_tags`` sequence).
        tag_list: Ordered iterable of tag names; a tag's position is its id.

    Returns:
        The result of ``ds.map(...)``, where every example has gained a
        ``labels`` list parallel to ``ner_tags``.

    Raises:
        KeyError: If a string tag is not in ``tag_list`` or an integer tag is
            outside ``range(len(tag_list))``.
    """
    tag2id = {tag: i for i, tag in enumerate(tag_list)}
    num_tags = len(tag2id)

    def to_id(tag):
        # String tags are looked up exactly as before (KeyError if unknown).
        if isinstance(tag, str):
            return tag2id[tag]
        # Some NER datasets store ner_tags as integer class ids already;
        # accept those as-is, but only when they fit the tag inventory so a
        # mismatched tag list still fails loudly.
        # NOTE(review): assumes such ids follow the order of tag_list - confirm
        # against the dataset's feature definition.
        if isinstance(tag, int) and not isinstance(tag, bool) and 0 <= tag < num_tags:
            return tag
        raise KeyError(tag)

    def convert_tags(example):
        # Convert every tag of the example to its numeric id.
        example['labels'] = [to_id(tag) for tag in example['ner_tags']]
        return example

    return ds.map(convert_tags)


In [None]:
def tokenize_data(ds, tokenizer):
    """Tokenize each example's ``tokens`` and align its ``labels`` to the output.

    Args:
        ds: Object exposing ``map(fn)``; each example must contain ``tokens``
            and ``labels``.
        tokenizer: Callable whose result supports ``word_ids()`` and item
            assignment.

    Returns:
        The result of ``ds.map(...)``. Each mapped example is the tokenizer
        output with a ``labels`` entry that holds one value per produced
        token: ``-100`` where ``word_ids()`` reports ``None``, otherwise the
        label of the source word.
    """
    IGNORE_INDEX = -100  # marks positions to be ignored by the loss

    def encode_example(example):
        # Input is already split into tokens; truncate / pad to a fixed 512.
        encoding = tokenizer(
            example["tokens"],
            is_split_into_words=True,
            truncation=True,
            max_length=512,
            padding='max_length'
        )

        source_word_of = encoding.word_ids()
        word_labels = example["labels"]

        encoding["labels"] = [
            IGNORE_INDEX if word_idx is None else word_labels[word_idx]
            for word_idx in source_word_of
        ]
        return encoding

    return ds.map(encode_example)



In [None]:
def create_model_and_optimizer(tags, learning_rates=None):
    """Build the token-classification model and an AdamW optimizer for it.

    Args:
        tags: Ordered tag names; the position of a tag is its label id.
        learning_rates: Optional dict with keys ``'bert'`` and
            ``'classifier'``. Defaults to ``{'bert': 1e-5, 'classifier': 1e-3}``.

    Returns:
        tuple: ``(model, optimizer)``.
    """
    lrs = {'bert': 1e-5, 'classifier': 1e-3} if learning_rates is None else learning_rates

    # Label maps in both directions.
    id2lbl = dict(enumerate(tags))
    lbl2id = {tag: idx for idx, tag in id2lbl.items()}

    model = AutoModelForTokenClassification.from_pretrained(
        'google-bert/bert-base-chinese',
        num_labels=len(tags),
        id2label=id2lbl,
        label2id=lbl2id
    )

    # Split parameters by name: anything containing 'bert' goes to the first
    # group, everything else to the second.
    encoder_params, head_params = [], []
    for name, param in model.named_parameters():
        bucket = encoder_params if 'bert' in name else head_params
        bucket.append(param)

    # Each group gets its own learning rate; only the second group sets an
    # explicit weight decay.
    optimizer = optim.AdamW([
        {'params': encoder_params, 'lr': lrs['bert']},
        {'params': head_params, 'weight_decay': 0.1, 'lr': lrs['classifier']},
    ])
    return model, optimizer

In [None]:
def train_manual(model, optimizer, train_dl, epochs, use_amp=False,
                 tags=None, save_dir="ner_manual_model"):
    """Run a manual training loop and save the model when it finishes.

    Args:
        model: Model whose forward pass returns an object with a ``loss``
            attribute when called with a batch.
        optimizer: Optimizer over the model's parameters.
        train_dl: Sized iterable of batches (dicts of tensors).
        epochs: Number of passes over ``train_dl``.
        use_amp: Use CUDA mixed precision. Ignored when CUDA is unavailable.
        tags: Ordered tag names stored next to the weights. When ``None`` they
            are read from ``model.config.id2label`` in id order, so the
            function no longer relies on a notebook-global ``tags`` variable.
        save_dir: Directory handed to ``save_model``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Mixed precision here uses the CUDA autocast / GradScaler pair, so only
    # enable it when training actually runs on CUDA.
    use_amp = bool(use_amp) and device == 'cuda'

    # Linear warmup followed by linear decay over all optimizer steps.
    train_steps = len(train_dl) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=train_steps
    )

    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    for epoch in range(epochs):
        model.train()
        pbar = tqdm(train_dl, desc=f'Epoch {epoch+1}/{epochs}')

        for items in pbar:
            items = {k: v.to(device) for k, v in items.items()}
            optimizer.zero_grad()

            if use_amp:
                with torch.cuda.amp.autocast():
                    outputs = model(**items)
                loss = outputs.loss
                # Scale the loss before backward; the scaler then steps the
                # optimizer and updates its scale factor.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(**items)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

            scheduler.step()
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    if tags is None:
        # Recover the tag list from the model configuration, ordered by id.
        id2label = model.config.id2label
        tags = [id2label[i] for i in sorted(id2label)]
    save_model(model, tags, save_dir)

In [None]:
def save_model(model, tags, save_dir):
    """Write the model weights and the label configuration into ``save_dir``.

    Two files are produced: ``model_weights.pth`` (the model's ``state_dict``
    saved with ``torch.save``) and ``label_config.pkl`` (a pickled dict with
    the keys ``tags``, ``id2label`` and ``label2id``).

    Args:
        model: Object exposing ``state_dict()``.
        tags: Ordered tag names; the position of a tag is its id.
        save_dir: Target directory, created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    weights_path = os.path.join(save_dir, "model_weights.pth")
    config_path = os.path.join(save_dir, "label_config.pkl")

    # Weights first ...
    torch.save(model.state_dict(), weights_path)

    # ... then the label maps in both directions.
    label_config = {
        'tags': tags,
        'id2label': dict(enumerate(tags)),
        'label2id': {tag: idx for idx, tag in enumerate(tags)},
    }
    with open(config_path, "wb") as handle:
        pickle.dump(label_config, handle)

    print(f"模型已保存到: {save_dir}")





In [None]:
def load_model(save_dir):
    """Rebuild a saved model together with its tokenizer and tag list.

    Reads ``label_config.pkl`` and ``model_weights.pth`` from ``save_dir``,
    instantiates the base architecture with the stored label maps and loads
    the saved weights into it.

    Args:
        save_dir: Directory containing ``label_config.pkl`` and
            ``model_weights.pth``.

    Returns:
        tuple: ``(model, tokenizer, tags)``.

    Raises:
        FileNotFoundError: If either file is missing from ``save_dir``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load label
    # configs that this notebook produced itself.
    with open(os.path.join(save_dir, "label_config.pkl"), "rb") as f:
        label_config = pickle.load(f)

    tags = label_config['tags']
    id2label = label_config['id2label']
    label2id = label_config['label2id']

    # Recreate the architecture with the stored label maps.
    model = AutoModelForTokenClassification.from_pretrained(
        'google-bert/bert-base-chinese',
        num_labels=len(tags),
        id2label=id2label,
        label2id=label2id
    )

    # map_location="cpu" lets weights saved from a GPU run load on a
    # CPU-only machine; callers can move the model to another device later.
    state_dict = torch.load(
        os.path.join(save_dir, "model_weights.pth"),
        map_location="cpu"
    )
    model.load_state_dict(state_dict)

    tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

    return model, tokenizer, tags



In [None]:
def predict_entities(model, tokenizer, tags, text):
    """Predict named entities in ``text`` and return them as character spans.

    The text is split into single characters and tokenized as pre-split
    words. Token-level predictions are mapped back to characters through the
    encoding's ``word_ids()``, so special tokens and extra sub-tokens do not
    shift the labels. A character with no token of its own (dropped by the
    tokenizer or cut off by truncation) is labelled ``'O'``.

    Args:
        model: Token-classification model; its output must expose ``logits``.
        tokenizer: Tokenizer whose output supports ``word_ids()``.
        tags: Sequence mapping a label id to its tag name.
        text: Input string.

    Returns:
        list[dict]: One dict per entity with keys ``text``, ``label``,
        ``start`` and ``end`` (character offsets, ``end`` exclusive). Empty
        for empty input.
    """
    model.eval()
    device = next(model.parameters()).device

    chars = list(text)
    if not chars:
        # Nothing to tokenize or label.
        return []

    encoding = tokenizer(
        chars,
        return_tensors='pt',
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512
    )
    # Grab the token -> character map before turning the encoding into a
    # plain dict of tensors (which no longer offers word_ids()).
    word_ids = encoding.word_ids(0)

    inputs = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
    token_label_ids = predictions[0].tolist()

    # One label per character, taken from the first token of that character.
    char_labels = ['O'] * len(chars)
    seen_chars = set()
    for token_pos, word_id in enumerate(word_ids):
        if word_id is None or word_id in seen_chars:
            continue
        seen_chars.add(word_id)
        char_labels[word_id] = tags[token_label_ids[token_pos]]

    # Merge B-/I- runs into entity spans.
    entities = []
    current_entity = None

    for i, (char, label) in enumerate(zip(chars, char_labels)):
        if label.startswith('B-'):
            if current_entity:
                entities.append(current_entity)
            current_entity = {
                'text': char,
                'label': label[2:],
                'start': i,
                'end': i + 1
            }
        elif label.startswith('I-') and current_entity and label[2:] == current_entity['label']:
            current_entity['text'] += char
            current_entity['end'] = i + 1
        else:
            # 'O', or an I- tag that does not continue the open entity.
            if current_entity:
                entities.append(current_entity)
                current_entity = None

    if current_entity:
        entities.append(current_entity)

    return entities

In [None]:
def compute_metrics(eval_pred, tags=None):
    """Compute seqeval metrics for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` is reduced
            with ``argmax`` over axis 2, and positions whose label is ``-100``
            are ignored.
        tags: Optional sequence mapping a label id to its tag name. When
            omitted, the tag list is taken from ``load_and_prepare_data()``
            once and reused on later calls, instead of reloading the dataset
            on every evaluation.

    Returns:
        Whatever ``seqeval.compute`` returns for the decoded sequences.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    # Load the metric only once; it is cached on the function object.
    seqeval = getattr(compute_metrics, "_seqeval", None)
    if seqeval is None:
        seqeval = evaluate.load('seqeval')
        compute_metrics._seqeval = seqeval

    if tags is None:
        tags = getattr(compute_metrics, "_tags", None)
        if tags is None:
            # Fallback kept for existing callers: fetch the tag list once.
            _, tags, _ = load_and_prepare_data()
            compute_metrics._tags = tags

    # Decode ids to tag names, dropping positions marked with -100.
    true_predictions = [
        [tags[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tags[l] for l in label if l != -100]
        for label in labels
    ]

    return seqeval.compute(predictions=true_predictions, references=true_labels)

In [None]:
def inference_demo():
    """Load the model saved under ``ner_manual_model`` and print its entities.

    Runs ``predict_entities`` on three fixed sample sentences and prints each
    sentence followed by the entities found. If a ``FileNotFoundError`` is
    raised along the way, a hint to train the model first is printed instead.
    """
    sample_sentences = (
        "李四在北京大学学习人工智能",
        "张三在腾讯公司工作",
        "王二喜欢看动漫《剑来》",
    )

    try:
        model, tokenizer, tags = load_model("ner_manual_model")
        model.eval()

        for sentence in sample_sentences:
            print(f"\n输入文本: {sentence}")
            found = predict_entities(model, tokenizer, tags, sentence)
            print("识别的实体:")
            for ent in found:
                span = f"[{ent['start']}:{ent['end']}]"
                print(f"  - {ent['text']} ({ent['label']}) {span}")

    except FileNotFoundError:
        print("未找到保存的模型，请先进行训练！")

In [None]:
# Build the dataset, tokenizer, model and optimizer.
ds, tags, entity_index = load_and_prepare_data()
# Encode labels with the full BIO tag list: the model is created with
# len(tags) labels, so the ids must come from `tags`, not from the coarse
# entity-type index.
ds1 = process_entity_tags(ds, tags)
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
ds2 = tokenize_data(ds1, tokenizer)
model, optimizer = create_model_and_optimizer(tags)

In [None]:
# Training: expose only the model input columns as torch tensors, batch the
# train split and run one epoch with mixed precision requested.
model_input_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
ds2.set_format('torch', columns=model_input_columns)
train_dl = DataLoader(ds2['train'], batch_size=16, shuffle=True)
train_manual(model, optimizer, train_dl, epochs=1, use_amp=True)


In [None]:
# Model inference demo: load the saved model and print entities for the sample sentences.
inference_demo()