<a href="https://colab.research.google.com/github/whoc666/WNUT17-NER-Transformers/blob/main/WNUT17_NER_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 步骤 1: 设置Google Colab环境

# 安装必要的库，避免依赖冲突
!pip install transformers datasets evaluate seqeval torch --no-deps

# 安装缺失的核心依赖
!pip install requests filelock huggingface-hub xxhash dill==0.3.8 multiprocess==0.70.16

# 导入所需的模块
import torch  # PyTorch框架，用于模型训练
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments  # Hugging Face的核心工具
from datasets import load_dataset  # 用于加载数据集
import evaluate  # 用于评估指标
import numpy as np  # 用于数值计算

# 检查GPU是否可用
print(f"GPU是否可用: {torch.cuda.is_available()}")

# 验证所有库是否导入成功
try:
    import transformers
    import datasets
    import evaluate
    import seqeval
    import torch
    import xxhash
    print("所有库导入成功！")
except ImportError as e:
    print(f"导入失败: {e}")

GPU是否可用: True
所有库导入成功！


In [2]:
# 步骤 2: 加载WNUT-17数据集

# 导入数据集加载模块（已在步骤 1 中导入）
from datasets import load_dataset

# 加载WNUT-17数据集
dataset = load_dataset("wnut_17")

# 查看数据集结构
print("数据集结构:", dataset)

# 提取NER标签列表
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
print("NER标签列表:", label_list)
print("标签数量:", num_labels)

# 查看一个训练集样本
print("训练集样本示例:", dataset["train"][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


数据集结构: DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})
NER标签列表: ['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product']
标签数量: 13
训练集样本示例: {'id': '0', 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]}


In [3]:
# 步骤 3: 数据预处理

# 导入分词器和模型（已在步骤 1 中导入）
from transformers import AutoTokenizer
from datasets import load_dataset  # 已加载数据集

# 确保数据集已加载（从步骤 2）
dataset = load_dataset("wnut_17")

# 加载DistilBERT分词器
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("分词器加载完成:", model_name)

# 定义分词和标签对齐函数
def tokenize_and_align_labels(examples):
    # 对tokens进行分词
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,  # 截断超长序列
        is_split_into_words=True,  # 输入是单词列表
        padding=True,  # 填充到统一长度
        return_tensors="pt"  # 返回PyTorch张量
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # 获取每个token对应的word索引
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        # 对每个token分配标签
        for word_idx in word_ids:
            if word_idx is None:
                # 特殊token（如[CLS], [SEP], [PAD]）标记为-100
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # 单词的第一个子词使用原始标签
                label_ids.append(label[word_idx])
            else:
                # 单词的后续子词标记为-100（模型忽略）
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    # 将对齐的标签添加到输入
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 对数据集应用分词和标签对齐
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
print("预处理后的数据集结构:", tokenized_datasets)

# 查看一个预处理后的训练集样本
print("预处理后的训练集样本:", tokenized_datasets["train"][0])

分词器加载完成: distilbert-base-uncased


Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

预处理后的数据集结构: DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1287
    })
})
预处理后的训练集样本: {'id': '0', 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 1030, 2703, 17122, 2009, 1005, 1055, 1996, 3193, 2013, 2073, 1045, 1005, 1049, 2542, 2005, 2048, 3134, 1012, 3400, 2110, 2311, 1027, 9686, 2497, 1012, 3492, 2919, 4040, 2182, 2197, 3944

In [4]:
# 步骤 4: 选择和配置模型

# 导入模型模块（已在步骤 1 中导入）
from transformers import AutoModelForTokenClassification

# 使用步骤 2 中获取的标签数量
# 假设 label_list 已从步骤 2 定义，例如 len(label_list) = 13
# 如果未定义，可以重新运行以下代码确认：
from datasets import load_dataset
dataset = load_dataset("wnut_17")
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
print("标签数量:", num_labels)

# 加载DistilBERT的NER模型
model_name = "distilbert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels  # 设置输出层为标签数量
)
print("模型加载完成:", model_name)

# 查看模型配置
print("模型配置:", model.config)

标签数量: 13


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


模型加载完成: distilbert-base-uncased
模型配置: DistilBertConfig {
  "_attn_implementation_autoset": true,
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2

In [5]:
# 步骤 5: 定义评估指标

# 导入评估模块（已在步骤 1 中导入）
import evaluate
import numpy as np

# 使用步骤 2 中获取的标签列表
# 假设 label_list 已从步骤 2 定义
# 如果未定义，可以重新运行以下代码确认：
from datasets import load_dataset
dataset = load_dataset("wnut_17")
label_list = dataset["train"].features["ner_tags"].feature.names
print("NER标签列表:", label_list)

# 加载seqeval评估模块
metric = evaluate.load("seqeval")
print("seqeval评估模块加载完成")

# 定义评估函数
def compute_metrics(eval_pred):
    # 解包模型输出：logits（预测分数）和labels（真实标签）
    logits, labels = eval_pred
    # 将logits转换为预测标签（取最大概率的标签）
    predictions = np.argmax(logits, axis=-1)

    # 提取真实标签和预测标签，忽略-100（特殊token或子词）
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # 使用seqeval计算指标
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # 返回总体指标
    return {
        "精确率": results["overall_precision"],
        "召回率": results["overall_recall"],
        "F1分数": results["overall_f1"],
        "准确率": results["overall_accuracy"],
    }

# 测试评估函数（模拟数据）
# 构造一个简单的模拟输入
sample_logits = np.array([[[0.1, 0.8, 0.1], [0.9, 0.05, 0.05]]])  # 模拟2个token的预测
sample_labels = np.array([[1, -100]])  # 模拟真实标签（1个有效标签，1个忽略）
sample_metrics = compute_metrics((sample_logits, sample_labels))
print("模拟评估结果:", sample_metrics)

NER标签列表: ['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product']
seqeval评估模块加载完成
模拟评估结果: {'精确率': np.float64(1.0), '召回率': np.float64(1.0), 'F1分数': np.float64(1.0), '准确率': 1.0}


In [6]:
# 步骤 6: 设置训练参数

# 导入训练参数模块（已在步骤 1 中导入）
from transformers import TrainingArguments

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",  # 模型和检查点的保存目录
    eval_strategy="epoch",  # 每轮训练后评估模型（更新为新参数名）
    learning_rate=2e-5,  # 学习率，控制权重更新速度
    per_device_train_batch_size=16,  # 训练时每个设备的批量大小
    per_device_eval_batch_size=16,  # 评估时每个设备的批量大小
    num_train_epochs=3,  # 训练轮数
    weight_decay=0.01,  # 权重衰减，防止过拟合
    save_strategy="epoch",  # 每轮保存一次模型
    load_best_model_at_end=True,  # 训练结束后加载最佳模型
)

# 查看训练参数
print("训练参数配置:")
for key, value in training_args.__dict__.items():
    if not key.startswith("_"):  # 忽略内部属性
        print(f"{key}: {value}")

训练参数配置:
output_dir: ./results
overwrite_output_dir: False
do_train: False
do_eval: True
do_predict: False
eval_strategy: IntervalStrategy.EPOCH
prediction_loss_only: False
per_device_train_batch_size: 16
per_device_eval_batch_size: 16
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 1
eval_accumulation_steps: None
eval_delay: 0
torch_empty_cache_steps: None
learning_rate: 2e-05
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: -1
lr_scheduler_type: SchedulerType.LINEAR
lr_scheduler_kwargs: {}
warmup_ratio: 0.0
warmup_steps: 0
log_level: passive
log_on_each_node: True
logging_dir: ./results/runs/Apr24_13-14-34_f8533ce58998
logging_strategy: IntervalStrategy.STEPS
logging_first_step: False
logging_steps: 500
logging_nan_inf_filter: True
save_strategy: SaveStrategy.EPOCH
save_steps: 500
save_total_limit: None
save_safetensors: True
save_on_each_node: False
save_only_model: Fal

In [7]:
# 步骤 7: 训练模型

# 导入必要模块
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
import evaluate
import numpy as np

# 加载数据集（步骤 2）
dataset = load_dataset("wnut_17")

# 加载分词器和模型（步骤 3 和 4）
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(dataset["train"].features["ner_tags"].feature.names)
)

# 定义分词和标签对齐函数（步骤 3，修复长度问题）
def tokenize_and_align_labels(examples):
    # 分词，统一序列长度
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,  # 截断超长序列
        is_split_into_words=True,  # 输入是单词列表
        padding="max_length",  # 填充到最大长度（默认512）
        max_length=128,  # 设置最大长度，WNUT-17样本较短
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # 特殊token（如[CLS], [SEP]）
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # 单词的第一个子词
            else:
                label_ids.append(-100)  # 后续子词
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 预处理数据集
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# 定义评估函数（步骤 5）
metric = evaluate.load("seqeval")
label_list = dataset["train"].features["ner_tags"].feature.names

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "精确率": results["overall_precision"],
        "召回率": results["overall_recall"],
        "F1分数": results["overall_f1"],
        "准确率": results["overall_accuracy"],
    }

# 定义训练参数（步骤 6，禁用wandb）
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",  # 禁用wandb等日志集成
)

# 初始化Trainer（使用processing_class）
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,  # 替换tokenizer参数
    compute_metrics=compute_metrics,
)

# 开始训练
print("开始训练模型...")
trainer.train()

# 查看训练结果
print("训练完成！")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

开始训练模型...


Epoch,Training Loss,Validation Loss,精确率,召回率,F1分数,准确率
1,No log,0.236669,0.667946,0.416268,0.512896,0.945515
2,No log,0.230063,0.66835,0.47488,0.555245,0.950919
3,0.182000,0.226733,0.677368,0.504785,0.578478,0.953652


训练完成！


In [8]:
# 步骤 8: 评估模型

# 导入必要模块
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
import evaluate
import numpy as np

# 加载数据集（步骤 2）
dataset = load_dataset("wnut_17")

# 加载分词器（步骤 3）
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 从最佳检查点加载模型
checkpoint_path = "./results/checkpoint-639"  # 第3轮的检查点（F1最高）
model = AutoModelForTokenClassification.from_pretrained(checkpoint_path)

# 定义分词和标签对齐函数（步骤 3）
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 预处理数据集（包括测试集）
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# 定义评估函数（步骤 5）
metric = evaluate.load("seqeval")
label_list = dataset["train"].features["ner_tags"].feature.names

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "精确率": results["overall_precision"],
        "召回率": results["overall_recall"],
        "F1分数": results["overall_f1"],
        "准确率": results["overall_accuracy"],
    }

# 定义训练参数（仅用于评估）
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,  # 评估批量大小
    report_to="none",
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets["test"],  # 使用测试集
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# 在测试集上评估
print("开始评估模型（测试集）...")
eval_results = trainer.evaluate()

# 输出评估结果
print("测试集评估结果:")
for key, value in eval_results.items():
    if key.startswith("eval_"):
        print(f"{key.replace('eval_', '')}: {value}")

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

开始评估模型（测试集）...


测试集评估结果:
loss: 0.2602807581424713
model_preparation_time: 0.0016
精确率: 0.5650929899856938
召回率: 0.36675951717734445
F1分数: 0.4448198198198198
准确率: 0.9453790505543427
runtime: 5.5079
samples_per_second: 233.665
steps_per_second: 14.706


In [9]:
# 安装huggingface_hub库
!pip install huggingface_hub



In [10]:
!pip install huggingface_hub
from huggingface_hub import login
login()  # 输入你的Token
from transformers import AutoModelForTokenClassification, AutoTokenizer
model = AutoModelForTokenClassification.from_pretrained("./results/checkpoint-639")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model.push_to_hub("whoc666/wnut17-ner")  # 替换为你的用户名
tokenizer.push_to_hub("whoc666/wnut17-ner")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/whoc666/wnut17-ner/commit/adf00cd3c97c16e91c05147f31bd9817be80e375', commit_message='Upload tokenizer', commit_description='', oid='adf00cd3c97c16e91c05147f31bd9817be80e375', pr_url=None, repo_url=RepoUrl('https://huggingface.co/whoc666/wnut17-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='whoc666/wnut17-ner'), pr_revision=None, pr_num=None)

In [13]:
# 步骤：上传你的模型到Hugging Face Hub

# 安装huggingface_hub库
!pip install huggingface_hub

# 登录Hugging Face
from huggingface_hub import login
login()  # 输入你的Token

# 加载你的模型和分词器
from transformers import AutoModelForTokenClassification, AutoTokenizer
model = AutoModelForTokenClassification.from_pretrained("./results/checkpoint-639")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 上传到Hugging Face Hub
model.push_to_hub("whoc666/wnut17-ner")  # 替换为你的用户名
tokenizer.push_to_hub("whoc666/wnut17-ner")

print("模型已上传到Hugging Face Hub！路径为：whoc666/wnut17-ner")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


模型已上传到Hugging Face Hub！路径为：whoc666/wnut17-ner


In [3]:
html_content = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NER Web Interface</title>
    <!-- 样式部分：定义界面的外观 -->
    <style>
        body {
            font-family: Arial, sans-serif; /* 使用Arial字体 */
            max-width: 800px; /* 限制页面宽度 */
            margin: 0 auto; /* 居中显示 */
            padding: 20px; /* 内边距 */
            background-color: #f4f4f9; /* 背景颜色 */
        }
        h1 {
            text-align: center; /* 标题居中 */
            color: #333; /* 标题颜色 */
        }
        .input-container {
            margin-bottom: 20px; /* 输入区域下方的间距 */
        }
        textarea {
            width: 100%; /* 文本框宽度 */
            height: 100px; /* 文本框高度 */
            padding: 10px; /* 内边距 */
            font-size: 16px; /* 字体大小 */
            border: 1px solid #ccc; /* 边框样式 */
            border-radius: 4px; /* 圆角边框 */
            resize: vertical; /* 允许垂直调整大小 */
        }
        button {
            display: block; /* 按钮独占一行 */
            width: 100%; /* 按钮宽度 */
            padding: 10px; /* 内边距 */
            background-color: #007bff; /* 背景颜色 */
            color: white; /* 字体颜色 */
            border: none; /* 无边框 */
            border-radius: 4px; /* 圆角边框 */
            font-size: 16px; /* 字体大小 */
            cursor: pointer; /* 鼠标悬停时显示手型 */
        }
        button:hover {
            background-color: #0056b3; /* 鼠标悬停时的背景颜色 */
        }
        #results {
            margin-top: 20px; /* 结果区域上方的间距 */
            padding: 15px; /* 内边距 */
            background-color: white; /* 背景颜色 */
            border: 1px solid #ddd; /* 边框样式 */
            border-radius: 4px; /* 圆角边框 */
        }
        .entity {
            padding: 2px 5px; /* 实体标签的内边距 */
            margin: 0 2px; /* 外边距 */
            border-radius: 3px; /* 圆角边框 */
            font-weight: bold; /* 字体加粗 */
        }
        /* 为WNUT-17标签设置颜色 */
        .B-person { background-color: #ffcccb; } /* 人名 */
        .I-person { background-color: #ffcccb; }
        .B-location { background-color: #90ee90; } /* 地点 */
        .I-location { background-color: #90ee90; }
        .B-organization { background-color: #add8e6; } /* 组织 */
        .I-organization { background-color: #add8e6; }
        .B-corporation { background-color: #add8e6; }
        .I-corporation { background-color: #add8e6; }
        .B-group { background-color: #add8e6; }
        .I-group { background-color: #add8e6; }
        .B-product { background-color: #ffe4b5; } /* 产品 */
        .I-product { background-color: #ffe4b5; }
        .B-creative-work { background-color: #ffe4b5; } /* 创意作品 */
        .I-creative-work { background-color: #ffe4b5; }
    </style>
</head>
<body>
    <!-- 页面标题 -->
    <h1>Named Entity Recognition (NER) Interface</h1>
    <!-- 输入区域：文本框和按钮 -->
    <div class="input-container">
        <textarea id="inputText" placeholder="Enter text to analyze..."></textarea>
        <button onclick="predictEntities()">Predict Entities</button>
    </div>
    <!-- 结果显示区域 -->
    <div id="results"></div>

    <!-- 加载transformers.js -->
    <script src="https://cdn.jsdelivr.net/npm/@huggingface/transformers@2.7.0/dist/transformers.min.js"></script>
    <script>
        let pipeline = null; // 用于存储NER pipeline

        // 检查网络连接
        async function checkNetwork() {
            try {
                await fetch("https://huggingface.co/whoc666/wnut17-ner");
                console.log("Network check: Able to reach Hugging Face model repository.");
                return true;
            } catch (error) {
                console.error("Network check failed:", error);
                throw new Error("Cannot connect to Hugging Face model repository. Please check your network.");
            }
        }

        // 初始化NER pipeline（加载模型）
        async function initializePipeline() {
            if (!pipeline) {
                console.log("Loading NER model..."); // 打印加载提示
                try {
                    // 检查网络连接
                    await checkNetwork();
                    // 使用transformers.js加载你的模型
                    pipeline = await transformers.pipeline("token-classification", {
                        model: "whoc666/wnut17-ner", // 使用你的模型路径
                        use_gpu: false // 禁用GPU
                    });
                    console.log("Model loaded successfully!"); // 加载成功提示
                } catch (error) {
                    console.error("Failed to load model:", error);
                    throw new Error("Model loading failed: " + error.message);
                }
            }
            return pipeline;
        }

        // 预测实体
        async function predictEntities() {
            // 获取用户输入的文本
            const inputText = document.getElementById("inputText").value.trim();
            const resultsDiv = document.getElementById("results");

            // 检查输入是否为空
            if (!inputText) {
                resultsDiv.innerHTML = "<p style='color: red;'>Please enter some text!</p>";
                return;
            }

            // 显示加载提示
            resultsDiv.innerHTML = "<p>Loading...</p>";

            try {
                // 确保pipeline已加载
                const nerPipeline = await initializePipeline();

                // 进行NER预测
                console.log("Predicting entities for text:", inputText);
                const entities = await nerPipeline(inputText, { aggregation_strategy: "simple" });
                console.log("Entities predicted:", entities);

                // 如果没有识别到实体
                if (entities.length === 0) {
                    resultsDiv.innerHTML = "<p>No entities found.</p>";
                    return;
                }

                // 处理预测结果，显示高亮的实体
                let outputHtml = "";
                let currentIndex = 0;

                for (const entity of entities) {
                    const entityStart = entity.start;
                    const entityEnd = entity.end;
                    const entityType = entity.entity_group;

                    // 添加实体前的文本
                    if (currentIndex < entityStart) {
                        const beforeText = inputText.slice(currentIndex, entityStart);
                        outputHtml += beforeText.replace(/\s+/g, " ");
                    }

                    // 添加实体（高亮显示）
                    const entityText = inputText.slice(entityStart, entityEnd);
                    outputHtml += `<span class="entity ${entityType}">${entityText} (${entityType})</span>`;
                    currentIndex = entityEnd;
                }

                // 添加剩余文本
                if (currentIndex < inputText.length) {
                    outputHtml += inputText.slice(currentIndex).replace(/\s+/g, " ");
                }

                // 显示结果
                resultsDiv.innerHTML = outputHtml;
            } catch (error) {
                // 捕获错误并显示详细错误信息
                console.error("Prediction error:", error);
                resultsDiv.innerHTML = `<p style='color: red;'>Error during prediction: ${error.message}. Please check your network connection or try a different browser.</p>`;
            }
        }
    </script>
</body>
</html>
'''

# 将内容写入index.html文件
with open("index.html", "w") as f:
    f.write(html_content)

# 下载index.html到本地
from google.colab import files
files.download("index.html")
print("index.html 文件已生成并下载！请上传到GitHub仓库。")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

index.html 文件已生成并下载！请上传到GitHub仓库。
