In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
import json
import os
from tqdm import tqdm

# Pin every random number generator used in this notebook (NumPy, PyTorch CPU,
# PyTorch CUDA) to the same seed so that repeated runs are comparable.
SEED = 2023
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
del _seed_fn

class NewsDataset(Dataset):
    """News-title classification dataset backed by a tab-separated file.

    The file must provide ``title`` and ``label`` columns. Rows with a missing
    title or label, or whose title is empty after whitespace normalisation,
    are dropped. Each item is tokenised on access and returned as a dict with
    ``input_ids``, ``attention_mask`` and ``label`` tensors.

    Args:
        data_path: Path of the tab-separated data file.
        label2id: Mapping from label name (str) to integer class id.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If the file contains a label that is not in ``label2id``.
    """

    def __init__(self, data_path, label2id, tokenizer, max_len=128):
        self.label2id = label2id
        self.tokenizer = tokenizer
        self.max_len = max_len

        frame = pd.read_csv(data_path, sep='\t')
        # Work on a copy so the column assignments below never hit a view.
        frame = frame.dropna(subset=['title', 'label']).copy()

        # Cast to str first: a column that pandas parsed as numeric has no
        # ``.str`` accessor. Collapse whitespace runs (including full-width
        # spaces) and trim, so whitespace-only titles become empty strings.
        frame['title'] = (
            frame['title'].astype(str)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
        frame['label'] = frame['label'].astype(str).str.strip()
        frame = frame[frame['title'].str.len() > 0].reset_index(drop=True)

        # Fail fast on unmapped labels instead of emitting an invalid class
        # id that only blows up later inside the loss computation.
        unknown = sorted(set(frame['label']) - set(label2id))
        if unknown:
            raise ValueError(f"数据中存在未知标签: {unknown}")

        self.data = frame
        # Plain lists make per-item access cheap compared with DataFrame.iloc.
        self._titles = frame['title'].tolist()
        self._labels = [label2id[name] for name in frame['label']]

    def __len__(self):
        """Return the number of usable rows."""
        return len(self._titles)

    def __getitem__(self, idx):
        """Tokenise row ``idx`` and return its tensors."""
        encoding = self.tokenizer.encode_plus(
            self._titles[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # encode_plus returns (1, max_len) tensors; flatten to (max_len,)
            # so the DataLoader can stack them into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self._labels[idx], dtype=torch.long),
        }

class NewsClassifier:
    """BERT news-title classifier: fine-tuning, evaluation and prediction.

    Construction reads a JSON config file, the ``label2id.json`` mapping stored
    in ``config['data_dir']`` and a local BERT checkpoint at
    ``config['bert_path']``. Training history is collected in ``step_records``
    (one entry every 150 batches) and ``epoch_records`` (one entry per epoch)
    and written to an Excel workbook when :meth:`train` finishes.
    """

    def __init__(self, config_path):
        """Load config, label mapping, tokenizer and model.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            ValueError: If the config lacks a required key.
            FileNotFoundError: If the checkpoint directory lacks a needed file.
        """
        self.config = self.load_config(config_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Fine-grained log, appended to every 150 batches.
        self.step_records = {
            'epoch': [],
            'global_step': [],
            'batch_loss': []
        }
        # Per-epoch summary log.
        self.epoch_records = {
            'epoch': [],
            'avg_train_loss': [],
            'val_loss': [],
            'val_acc': []
        }
        self.global_step = 0

        # Label mapping (explicit UTF-8 because label names are Chinese).
        label_path = os.path.join(self.config['data_dir'], 'label2id.json')
        with open(label_path, 'r', encoding='utf-8') as f:
            self.label2id = json.load(f)
        self.id2label = {v: k for k, v in self.label2id.items()}

        # The tokenizer and model are loaded from a local directory.
        model_dir = self.config['bert_path']

        for name in ('config.json', 'vocab.txt', 'tokenizer_config.json'):
            file_path = os.path.join(model_dir, name)
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"缺失必要文件: {file_path}")
        # Weights may be stored in either serialisation format.
        weight_files = ('pytorch_model.bin', 'model.safetensors')
        if not any(os.path.exists(os.path.join(model_dir, name)) for name in weight_files):
            raise FileNotFoundError(
                f"缺失必要文件: {os.path.join(model_dir, weight_files[0])}"
            )

        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(
            model_dir,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id,
            # Lets a checkpoint with a differently sized classification head
            # load; the mismatched head is re-initialised.
            ignore_mismatched_sizes=True
        ).to(self.device)

    @staticmethod
    def load_config(config_path):
        """Read the JSON config and verify that all required keys exist.

        Args:
            config_path: Path of the JSON configuration file.

        Returns:
            The parsed configuration dict.

        Raises:
            ValueError: If ``bert_path``, ``data_dir``, ``model_dir`` or
                ``max_len`` is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        required_keys = ['bert_path', 'data_dir', 'model_dir', 'max_len']
        for key in required_keys:
            if key not in config:
                raise ValueError(f"配置文件中缺失必要字段: {key}")
        return config

    def train(self, train_data, valid_data, epochs=3, batch_size=32, accumulation_steps=4):
        """Fine-tune the model with gradient accumulation.

        After every epoch the model is evaluated on the validation set; the
        model and tokenizer are saved to ``config['model_dir']`` whenever the
        validation accuracy improves. The training records are written to
        Excel once all epochs are done.

        Args:
            train_data: Training file name, relative to ``config['data_dir']``.
            valid_data: Validation file name, relative to ``config['data_dir']``.
            epochs: Number of passes over the training data.
            batch_size: Batch size of both data loaders.
            accumulation_steps: Number of batches whose gradients are
                accumulated before each optimizer step.

        Raises:
            ValueError: If ``accumulation_steps`` is below 1 or a dataset
                contains no usable rows.
            FileNotFoundError: If either data file does not exist.
        """
        if accumulation_steps < 1:
            raise ValueError(f"accumulation_steps 必须 >= 1: {accumulation_steps}")

        train_path = os.path.join(self.config['data_dir'], train_data)
        valid_path = os.path.join(self.config['data_dir'], valid_data)
        if not os.path.exists(train_path):
            raise FileNotFoundError(f"训练数据不存在: {train_path}")
        if not os.path.exists(valid_path):
            raise FileNotFoundError(f"验证数据不存在: {valid_path}")

        # Use the configured length so training matches predict().
        max_len = self.config['max_len']
        train_dataset = NewsDataset(train_path, self.label2id, self.tokenizer, max_len=max_len)
        valid_dataset = NewsDataset(valid_path, self.label2id, self.tokenizer, max_len=max_len)
        # Reject empty data now rather than dividing by zero after an epoch.
        if len(train_dataset) == 0:
            raise ValueError(f"训练数据为空: {train_path}")
        if len(valid_dataset) == 0:
            raise ValueError(f"验证数据为空: {valid_path}")

        # Pinned host memory only helps when batches are copied to a GPU.
        use_pinned = self.device.type == 'cuda'
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, pin_memory=use_pinned)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                                  pin_memory=use_pinned)
        num_batches = len(train_loader)

        optimizer = AdamW(self.model.parameters(),
                          lr=2e-5,
                          weight_decay=0.01)

        best_acc = 0.0
        for epoch in range(epochs):
            self.model.train()
            optimizer.zero_grad()
            total_loss = 0.0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

            for step, batch in enumerate(progress_bar):
                inputs = {
                    'input_ids': batch['input_ids'].to(self.device, non_blocking=True),
                    'attention_mask': batch['attention_mask'].to(self.device, non_blocking=True),
                    'labels': batch['label'].to(self.device, non_blocking=True)
                }

                outputs = self.model(**inputs)
                loss = outputs.loss

                # Divide by the number of batches in the current accumulation
                # group (the last group of an epoch may be shorter) so the
                # summed gradient is the group mean, not a multiple of it.
                group_start = (step // accumulation_steps) * accumulation_steps
                group_size = min(accumulation_steps, num_batches - group_start)
                (loss / group_size).backward()

                # Records and progress bar show the unscaled batch loss.
                batch_loss = loss.item()
                self.global_step += 1

                if self.global_step % 150 == 0:
                    self.step_records['epoch'].append(epoch + 1)
                    self.step_records['global_step'].append(self.global_step)
                    self.step_records['batch_loss'].append(batch_loss)

                # Step at the end of each group and at the end of the epoch.
                if (step + 1) % accumulation_steps == 0 or (step + 1) == num_batches:
                    optimizer.step()
                    optimizer.zero_grad()

                total_loss += batch_loss
                progress_bar.set_postfix({'loss': batch_loss})

            val_acc, val_loss = self.evaluate(valid_loader)
            avg_train_loss = total_loss / num_batches

            self.epoch_records['epoch'].append(epoch + 1)
            self.epoch_records['avg_train_loss'].append(avg_train_loss)
            self.epoch_records['val_loss'].append(val_loss)
            self.epoch_records['val_acc'].append(val_acc)

            print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
            # Keep only the checkpoint with the best validation accuracy.
            if val_acc > best_acc:
                best_acc = val_acc
                self.model.save_pretrained(self.config['model_dir'])
                self.tokenizer.save_pretrained(self.config['model_dir'])
                # save_pretrained writes into the directory itself, so that
                # is the location reported here.
                print(f"Saved best model to {self.config['model_dir']}")

        self._save_training_records()

    def _save_training_records(self):
        """Write step-level and epoch-level records to an Excel workbook.

        The workbook ``training_records.xlsx`` is created in
        ``config['model_dir']`` with the sheets ``Step Records`` and
        ``Epoch Summary``.
        """
        # The directory may not exist yet if no checkpoint was ever saved.
        os.makedirs(self.config['model_dir'], exist_ok=True)
        workbook_path = os.path.join(self.config['model_dir'], 'training_records.xlsx')
        with pd.ExcelWriter(workbook_path) as writer:
            pd.DataFrame(self.step_records).to_excel(
                writer,
                sheet_name='Step Records',
                index=False
            )
            pd.DataFrame(self.epoch_records).to_excel(
                writer,
                sheet_name='Epoch Summary',
                index=False
            )

    def evaluate(self, data_loader):
        """Compute accuracy and mean batch loss over ``data_loader``.

        The model is left in eval mode afterwards.

        Args:
            data_loader: Loader yielding dicts with ``input_ids``,
                ``attention_mask`` and ``label`` tensors.

        Returns:
            Tuple ``(accuracy, average_loss)``.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        if len(data_loader) == 0:
            raise ValueError("评估数据为空")

        self.model.eval()
        total_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Evaluating"):
                inputs = {
                    'input_ids': batch['input_ids'].to(self.device),
                    'attention_mask': batch['attention_mask'].to(self.device),
                    'labels': batch['label'].to(self.device)
                }

                outputs = self.model(**inputs)
                total_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(inputs['labels'].cpu().tolist())

        return accuracy_score(all_labels, all_preds), total_loss / len(data_loader)

    def predict(self, text, confidence_threshold=0.6):
        """Classify a single text.

        Args:
            text: Text to classify.
            confidence_threshold: Minimum softmax probability for the top
                class to be reported.

        Returns:
            Tuple ``(label, confidence)``. ``label`` is ``"Unknown"`` when the
            top probability is below ``confidence_threshold``.
        """
        self.model.eval()
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.config['max_len'],
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoding)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        max_prob, pred_idx = torch.max(probs, dim=1)
        confidence = max_prob.item()

        if confidence < confidence_threshold:
            return "Unknown", confidence
        return self.id2label[pred_idx.item()], confidence

if __name__ == "__main__":
    # Example config.json:
    # {
    #     "bert_path": "./model/bert-base-chinese",
    #     "data_dir": "./data",
    #     "model_dir": "./saved_models",
    #     "max_len": 128
    # }

    # One threshold drives both predict() and the printed status, so the two
    # can never disagree (predict reports a label when confidence >= threshold).
    confidence_threshold = 0.6

    classifier = NewsClassifier("config.json")

    # Training run
    classifier.train(
        train_data="train.csv",
        valid_data="valid.csv",
        epochs=3,
        batch_size=32
    )

    # Prediction examples
    test_samples = [
        "央行最新货币政策调整方案",
        "欧冠决赛皇马夺冠",
        "未命名测试文本12345",  # intended as a low-confidence sample
        "AI技术突破量子计算瓶颈"
    ]

    for text in test_samples:
        label, confidence = classifier.predict(
            text, confidence_threshold=confidence_threshold
        )
        status = "高置信度" if confidence >= confidence_threshold else "低置信度"
        print(f"输入文本: {text}")
        print(f"预测结果: {label} ({status}, 置信度: {confidence:.2%})")
        print("-" * 60)

2025-03-24 13:36:48.067034: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 13:36:48.106746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./model/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 18279/18279 [1:06:31<00:00,  4.58it/s, loss=0.1

Epoch 1 | Train Loss: 0.2349 | Val Loss: 0.1658 | Val Acc: 0.9471
[2025-03-24 14:48:56,478] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /root/.triton/autotune: 没有那个文件或目录


Saved best model to ./saved_models/best_model.pth


Epoch 2: 100%|██████████| 18279/18279 [1:06:51<00:00,  4.56it/s, loss=0.226]  
Evaluating: 100%|██████████| 3917/3917 [05:11<00:00, 12.58it/s]


Epoch 2 | Train Loss: 0.1389 | Val Loss: 0.1522 | Val Acc: 0.9512
Saved best model to ./saved_models/best_model.pth


Epoch 3: 100%|██████████| 18279/18279 [1:06:49<00:00,  4.56it/s, loss=0.0949] 
Evaluating: 100%|██████████| 3917/3917 [05:11<00:00, 12.57it/s]


Epoch 3 | Train Loss: 0.1040 | Val Loss: 0.1604 | Val Acc: 0.9502
输入文本: 央行最新货币政策调整方案
预测结果: Unknown (低置信度, 置信度: 37.41%)
------------------------------------------------------------
输入文本: 欧冠决赛皇马夺冠
预测结果: 体育 (高置信度, 置信度: 81.44%)
------------------------------------------------------------
输入文本: 未命名测试文本12345
预测结果: 科技 (高置信度, 置信度: 99.05%)
------------------------------------------------------------
输入文本: AI技术突破量子计算瓶颈
预测结果: 科技 (高置信度, 置信度: 99.59%)
------------------------------------------------------------


In [2]:
# %% [markdown]
# ## 新闻分类预测模块
# 加载训练好的模型进行预测

# %%
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json
import os
import pandas as pd

# Fix the random seeds for reproducibility
SEED = 2023
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# %% [markdown]
# ### Configuration
# %%
# Load the configuration file
config_path = "config.json"
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Report a missing key explicitly instead of failing with a bare KeyError;
# max_len is included because news_predict reads it.
required_keys = ['bert_path', 'data_dir', 'model_dir', 'max_len']
for key in required_keys:
    if key not in config:
        raise ValueError(f"配置文件中缺失必要字段: {key}")

# Every configured path must exist before anything is loaded from it
required_paths = ['bert_path', 'data_dir', 'model_dir']
for path_key in required_paths:
    if not os.path.exists(config[path_key]):
        raise FileNotFoundError(f"路径不存在: {config[path_key]}")

# Load the label mapping and build the inverse (id -> label name)
label_path = os.path.join(config['data_dir'], 'label2id.json')
with open(label_path, 'r', encoding='utf-8') as f:
    label2id = json.load(f)
id2label = {v: k for k, v in label2id.items()}

# %% [markdown]
# ### 模型加载模块
# %%
# Pick the GPU when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"使用设备: {device}")

# Tokenizer comes from the base BERT directory
tokenizer = BertTokenizer.from_pretrained(config['bert_path'])

# Fine-tuned weights come from the directory the training step saved into
model = BertForSequenceClassification.from_pretrained(
    config['model_dir'],
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)
model.eval()  # inference only
print("模型加载成功!")

# %% [markdown]
# ### 预测函数
# %%
def news_predict(text, confidence_threshold=0.6):
    """Classify one news text with the module-level model and tokenizer.

    Args:
        text (str): Text to classify.
        confidence_threshold (float): Minimum top-class probability (0-1)
            required to report a label.

    Returns:
        tuple: ``(label, confidence)``; ``label`` is ``"Unknown"`` when the
        top probability is below ``confidence_threshold``.
    """
    # Tokenise to a fixed-length tensor batch of size one
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=config['max_len'],
        padding='max_length',
        truncation='longest_first',
        return_tensors='pt'
    )
    encoded = encoded.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension, then take the best class
    class_probs = torch.nn.functional.softmax(logits, dim=-1)
    top_prob, top_idx = torch.max(class_probs, dim=1)
    confidence = top_prob.item()

    label = "Unknown" if confidence < confidence_threshold else id2label[top_idx.item()]
    return label, confidence

# %% [markdown]
# ### 预测演示
# %%
# Sample inputs
test_samples = [
    "2018年LOL世界电子竞赛，IG战队成功夺冠！",
    "欧冠决赛皇马3-1击败利物浦夺冠",
    "人工智能芯片研发取得重大突破", 
    "沪深股市今日收盘行情分析"
]

# One threshold drives both news_predict and the status column, so the two
# cannot disagree (news_predict reports a label when confidence >= threshold).
confidence_threshold = 0.6

# Run the predictions
results = []
for text in test_samples:
    label, confidence = news_predict(text, confidence_threshold=confidence_threshold)
    status = "高置信度" if confidence >= confidence_threshold else "低置信度"
    results.append({
        '输入文本': text,
        '预测标签': label,
        '置信度': f"{confidence:.2%}",
        '状态': status
    })

# Show the results as a captioned table
df = pd.DataFrame(results)
display(df.style.set_caption("分类预测结果").set_table_styles([
    {'selector': 'caption', 
     'props': [('font-size', '16px'), ('color', 'blue')]}
]))

使用设备: cpu
模型加载成功!


Unnamed: 0,输入文本,预测标签,置信度,状态
0,2018年LOL世界电子竞赛，IG战队成功夺冠！,游戏,93.35%,高置信度
1,欧冠决赛皇马3-1击败利物浦夺冠,体育,94.76%,高置信度
2,人工智能芯片研发取得重大突破,科技,97.57%,高置信度
3,沪深股市今日收盘行情分析,股票,99.92%,高置信度
4,德国总统施泰因迈尔解散现任政府 当地时间3月25日，德国联邦总统施泰因迈尔向现任德国总理朔尔茨及其内阁成员移交解职证明，朔尔茨及其内阁成员将继续履职直至新一届联邦政府成立。,时政,98.54%,高置信度
5,国台办介绍台湾“八旗文化”总编辑李延贺案有关情况 中国台湾网3月26日讯 国务院台湾事务办公室今天上午举行例行新闻发布会，国台办发言人陈斌华主持本次新闻发布会。日本广播协会记者：据报道，上个月，上海中级人民法院对台湾“八旗文化”总编辑李延贺案作出公开宣判。请介绍判决的详细内容。陈斌华：上海市第一中级人民法院于2025年2月17日一审公开宣判李延贺案，依法依规，公开公正。宣判后，当事人当庭表示服法认罪，在法定上诉期内未提起上诉，相关判决已依法生效。至于刑期，当事人和家属是非常清楚的。作为发言人，我本不愿公开谈论具体细节，民进党当局一再炒作司法个案，诬蔑抹黑大陆司法体制，完全是别有用心的。据我了解，上海市第一中级人民法院于2025年2月17日一审公开宣判，以煽动分裂国家罪判处李延贺有期徒刑三年，剥夺政治权利一年，并处没收个人财产人民币五万元。刑罚执行期间，有关方面将依法保障当事人及家属各项权益。（编辑/李宁）,时政,97.25%,高置信度


In [1]:
# %% [markdown]
# ## 模型性能评估模块
# 计算每个类别的精确率、召回率和F1-score

# %%
import torch
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
import json
import os
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from tqdm import tqdm

# Fix the random seeds for reproducibility
SEED = 2023
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# %% [markdown]
# ### Load configuration and model
# %%
# Load the configuration file. UTF-8 is given explicitly so the result does
# not depend on the platform's default encoding.
with open("config.json", 'r', encoding='utf-8') as f:
    config = json.load(f)

# Load the label mapping; the label names are Chinese, so the file must be
# decoded as UTF-8.
label_path = os.path.join(config['data_dir'], 'label2id.json')
with open(label_path, 'r', encoding='utf-8') as f:
    label2id = json.load(f)
id2label = {v:k for k,v in label2id.items()}

# Pick the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model in inference mode
model = BertForSequenceClassification.from_pretrained(
    config['model_dir'],
    num_labels=len(label2id)
).to(device)
model.eval()

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(config['bert_path'])

# %% [markdown]
# ### 评估函数实现
# %%
def evaluate_model(test_data_path, batch_size=32):
    """Evaluate the module-level model on a labelled test file.

    The file is tab-separated with ``title`` and ``label`` columns. Rows with
    a missing title/label, or a title that is empty after whitespace
    normalisation, are skipped.

    Args:
        test_data_path: Path of the test file.
        batch_size: Number of texts tokenised and scored per forward pass.

    Returns:
        Tuple ``(df_report, df_confusion)``: the per-class precision / recall /
        F1 / support table (rows ordered by class id) and the
        actual-vs-predicted crosstab with margins, both as DataFrames.

    Raises:
        KeyError: If the file contains a label that is not in ``label2id``.
        ValueError: If no usable rows remain after cleaning.
    """
    test_df = pd.read_csv(test_data_path, sep='\t')

    # Cast to str before using the .str accessor (a numeric column has none),
    # collapse whitespace runs and trim so blank titles can be filtered out.
    test_df = test_df.dropna(subset=['title', 'label']).copy()
    test_df['title'] = (
        test_df['title'].astype(str)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    test_df = test_df[test_df['title'].str.len() > 0]

    texts = test_df['title'].tolist()
    # Map labels up front so an unknown label fails before any inference.
    all_labels = [label2id[str(name).strip()] for name in test_df['label']]
    if not texts:
        raise ValueError(f"测试集为空: {test_data_path}")

    all_preds = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            encoding = tokenizer(
                texts[start:start + batch_size],
                max_length=config['max_len'],
                padding='max_length',
                truncation='longest_first',
                return_tensors='pt'
            ).to(device)

            outputs = model(**encoding)
            all_preds.extend(torch.argmax(outputs.logits, dim=1).tolist())

    # Convert class ids to label names
    true_labels = [id2label[i] for i in all_labels]
    pred_labels = [id2label[i] for i in all_preds]

    # Pass the class list via ``labels``: every row is then named after the
    # class it measures, and classes absent from this test set still get a
    # row. ``target_names`` is only applied positionally, so it is not used.
    class_names = [id2label[i] for i in sorted(id2label)]
    report = classification_report(
        true_labels,
        pred_labels,
        labels=class_names,
        zero_division=0,
        output_dict=True
    )

    df_report = pd.DataFrame(report).transpose()
    df_confusion = pd.crosstab(
        pd.Series(true_labels, name='Actual'),
        pd.Series(pred_labels, name='Predicted'),
        margins=True
    )

    return df_report, df_confusion

# %% [markdown]
# ### 执行评估
# %%
# Run the evaluation on the test split (file name assumed to be test.csv)
test_path = os.path.join(config['data_dir'], "test.csv")
report_df, confusion_df = evaluate_model(test_path)

# Show the per-class metrics
print("\n分类指标报告:")
display(report_df)

# Show the confusion matrix as a heat map
print("\n混淆矩阵:")
styled_confusion = confusion_df.style.background_gradient(cmap='Blues')
display(styled_confusion)

# Save both tables next to the model
for frame, filename in (
    (report_df, 'classification_report.xlsx'),
    (confusion_df, 'confusion_matrix.xlsx'),
):
    frame.to_excel(os.path.join(config['model_dir'], filename))
print("\n结果已保存至模型目录")

2025-03-24 21:23:25.399002: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-24 21:23:25.508215: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 3917/3917 [05:27<00:00, 11.96it/s]



分类指标报告:


Unnamed: 0,precision,recall,f1-score,support
体育,0.995943,0.983667,0.989767,19715.0
娱乐,0.959277,0.974145,0.966654,14001.0
家居,0.964423,0.929187,0.946477,4872.0
彩票,0.966263,0.968777,0.967518,1153.0
房产,0.909943,0.921356,0.915614,2950.0
教育,0.962476,0.969628,0.966039,6190.0
时尚,0.93731,0.929323,0.9333,1995.0
时政,0.901563,0.941622,0.921157,9250.0
星座,0.95993,0.973498,0.966667,566.0
游戏,0.934393,0.891125,0.912246,3628.0



混淆矩阵:


Predicted,体育,娱乐,家居,彩票,房产,教育,时尚,时政,星座,游戏,社会,科技,股票,财经,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
体育,19393,177,4,31,19,21,3,25,0,2,7,16,15,2,19715
娱乐,19,13639,11,1,16,32,63,48,0,23,38,78,29,4,14001
家居,1,39,4527,0,71,11,21,10,7,6,17,116,40,6,4872
彩票,25,1,1,1117,1,0,0,2,0,0,3,1,1,1,1153
房产,0,15,22,0,2718,6,0,30,0,1,24,19,96,19,2950
教育,1,15,1,0,3,6002,1,54,13,1,52,39,5,3,6190
时尚,0,60,24,0,8,10,1854,10,2,1,6,18,2,0,1995
时政,11,46,3,1,14,21,7,8710,0,2,147,155,115,18,9250
星座,0,2,3,0,0,2,4,0,551,0,0,1,1,2,566
游戏,3,25,5,0,2,8,8,19,1,3233,8,307,7,2,3628



结果已保存至模型目录
