In [4]:
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import boto3
import pandas as pd
import json
from sklearn.metrics import classification_report, confusion_matrix

# 1. 首先部署模型
def deploy_model(endpoint_name='cold-sentiment-endpoint',
                 instance_type='ml.m5.xlarge',
                 initial_instance_count=1):
    """Deploy the trained Hugging Face model to a SageMaker endpoint.

    Args:
        endpoint_name: Name given to the created endpoint. The default is the
            name this script has always deployed under.
            NOTE(review): ``evaluate_model`` invokes 'roberta-cold-endpoint'
            by default, which is a different name; pass matching names when
            deploying and evaluating in the same run.
        instance_type: SageMaker instance type hosting the endpoint.
        initial_instance_count: Number of instances to start.

    Returns:
        The predictor object returned by ``HuggingFaceModel.deploy``.
    """
    try:
        role = sagemaker.get_execution_role()
    except ValueError:
        # No execution role could be resolved from the environment
        # (presumably because this is not running inside SageMaker), so look
        # the role up in IAM by its fixed name instead.
        iam = boto3.client('iam')
        role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

    # Wrap the trained model artifact together with the custom inference
    # script.
    huggingface_model = HuggingFaceModel(
        model_data="s3://sagemaker-ap-northeast-1-034362076319/huggingface-pytorch-training-2024-11-03-15-21-58-628/output/model.tar.gz",
        role=role,
        transformers_version="4.37.0",
        pytorch_version="2.1.0",
        py_version='py310',
        entry_point='inference.py',
    )

    # Create the endpoint and hand the predictor back to the caller.
    predictor = huggingface_model.deploy(
        initial_instance_count=initial_instance_count,
        instance_type=instance_type,
        endpoint_name=endpoint_name,
    )

    return predictor

# 2. 评估函数
def evaluate_model(endpoint_name='roberta-cold-endpoint',
                   data_path='../eval_processed.jsonl',
                   output_path='model_evaluation_results-cold-old.csv'):
    """Score the evaluation set against a deployed endpoint and report metrics.

    Each row's ``text`` is sent to the endpoint as ``{"text": ...}``. The
    response must be a JSON object with a ``label`` key; 'LABEL_0' maps to
    class 0 and any other label maps to class 1. The response's ``score``
    field, when present, is stored as the prediction confidence.

    Rows whose invocation fails are kept in the output with an empty
    prediction, but are excluded from the classification report, confusion
    matrix and accuracy. Feeding missing predictions to scikit-learn would
    raise and discard the whole run.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        data_path: JSON-lines file with at least ``text`` and ``label``
            columns.
        output_path: CSV file the per-row results are written to.

    Returns:
        The evaluation DataFrame with added ``predicted``, ``confidence`` and
        ``is_correct`` columns.
    """
    eval_data = pd.read_json(data_path, lines=True)
    runtime_client = boto3.client('runtime.sagemaker')

    predictions = []
    confidences = []

    print("开始评估...")
    total = len(eval_data)

    # Track progress by position so it stays correct even if the frame's
    # index is not 0..n-1.
    for position, (idx, row) in enumerate(eval_data.iterrows()):
        if position % 100 == 0:
            print(f"进度: {position}/{total}")

        print(f"{row['text']}")
        try:
            response = runtime_client.invoke_endpoint(
                EndpointName=endpoint_name,
                ContentType='application/json',
                Body=json.dumps({"text": row['text']}),
            )
            result = json.loads(response['Body'].read().decode())
            print(f"预测结果 (index {idx}): {result}")
            predicted = 0 if result['label'] == 'LABEL_0' else 1
            confidence = result.get('score')
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            print(f"预测错误 (index {idx}): {str(e)}")
            predicted = None
            confidence = None

        # Exactly one entry per row, so both lists always match the frame.
        predictions.append(predicted)
        confidences.append(confidence)

    eval_data['predicted'] = predictions
    # Coerce to float so missing or non-numeric scores become NaN.
    eval_data['confidence'] = pd.to_numeric(
        pd.Series(confidences, index=eval_data.index, dtype=object),
        errors='coerce',
    )
    # Missing predictions compare unequal, so failed rows are marked False.
    eval_data['is_correct'] = eval_data['label'] == eval_data['predicted']

    # Only rows that actually received a prediction take part in the metrics.
    scored = eval_data[eval_data['predicted'].notna()]
    failed_count = total - len(scored)

    print("\n=== 评估结果 ===")
    if scored.empty:
        print("没有成功的预测，无法计算评估指标")
    else:
        y_true = scored['label']
        # The column is float when any row failed; cast back to int classes.
        y_pred = scored['predicted'].astype(int)

        print("\n分类报告:")
        print(classification_report(y_true, y_pred))

        print("\n混淆矩阵:")
        cm = confusion_matrix(y_true, y_pred)
        print(cm)

    print("\n模型性能统计:")
    print(f"总样本数: {len(eval_data)}")
    print(f"预测失败样本数: {failed_count}")
    if not scored.empty:
        print(f"准确率: {scored['is_correct'].mean():.4f}")
        if scored['confidence'].notna().any():
            print(f"平均置信度: {scored['confidence'].mean():.4f}")

    # Show the misclassified rows the model was most confident about.
    print("\n错误预测分析:")
    errors = scored[~scored['is_correct']].sort_values(
        'confidence', ascending=False
    )
    print("\n高置信度错误案例:")
    for _, row in errors.head().iterrows():
        print(f"\n文本: {row['text']}")
        print(f"真实标签: {row['label']}")
        print(f"预测标签: {int(row['predicted'])}")
        print(f"置信度: {row['confidence']:.4f}")

    eval_data.to_csv(output_path, index=False)
    return eval_data

# 3. 清理资源函数
def cleanup_endpoint(endpoint_name='cold-sentiment-endpoint'):
    """Delete a SageMaker endpoint, reporting rather than raising on failure.

    Only ``delete_endpoint`` is called; this function does not remove the
    endpoint configuration or the model.

    Args:
        endpoint_name: Name of the endpoint to delete. The default is the
            name this script has always deleted.
    """
    client = boto3.client('sagemaker')
    try:
        client.delete_endpoint(EndpointName=endpoint_name)
        print("已删除endpoint")
    except Exception as e:
        # Cleanup is best effort: report the problem and let the caller
        # continue.
        print(f"删除endpoint时出错: {str(e)}")

if __name__ == "__main__":
    # Script entry point. Only the evaluation step is active; the deployment
    # and cleanup steps are currently commented out.
    try:
        # 1. Deploy the model (disabled). The progress message is disabled
        #    with it so the log does not claim a deployment that never runs.
        # print("正在部署模型...")
        # predictor = deploy_model()

        # 2. Run the evaluation. evaluate_model() prints its own start
        #    message, so it is not repeated here.
        results = evaluate_model()

        # 3. Clean up resources when finished (disabled).
        # print("评估完成，正在清理资源...")
        # cleanup_endpoint()

        # Report the file name that evaluate_model() writes by default.
        print("评估结果已保存到 model_evaluation_results-cold-old.csv")

    except Exception as e:
        print(f"发生错误: {str(e)}")
        # Clean up even when an error occurs (disabled).
        # cleanup_endpoint()

正在部署模型...
开始评估...
开始评估...
进度: 0/1400
气死
预测结果 (index 0): {'label': 'LABEL_0', 'score': 0.60235196352005}
建议专心致志
预测结果 (index 1): {'label': 'LABEL_0', 'score': 0.9851343631744385}
活该
预测结果 (index 2): {'label': 'LABEL_1', 'score': 0.9762573838233948}
深入正在包在我身上稳步发展
预测结果 (index 3): {'label': 'LABEL_0', 'score': 0.9751445651054382}
逗逼
预测结果 (index 4): {'label': 'LABEL_0', 'score': 0.9659514427185059}
开始欢声笑语
预测结果 (index 5): {'label': 'LABEL_0', 'score': 0.9942438006401062}
全面顶一个
预测结果 (index 6): {'label': 'LABEL_0', 'score': 0.9917773008346558}
休息得好吗稳步发展全面展开
预测结果 (index 7): {'label': 'LABEL_0', 'score': 0.9922019243240356}
全面抱歉啊中
预测结果 (index 8): {'label': 'LABEL_0', 'score': 0.997026264667511}
点儿背
预测结果 (index 9): {'label': 'LABEL_0', 'score': 0.9675281047821045}
傻波
预测结果 (index 10): {'label': 'LABEL_0', 'score': 0.9663025736808777}
绝绝子完成全面展开
预测结果 (index 11): {'label': 'LABEL_0', 'score': 0.9794194102287292}
稳步建议欢迎关注持续推进
预测结果 (index 12): {'label': 'LABEL_0', 'score': 0.9954568147659302}
蓬勃发展
预测结果 (