In [None]:
from openai import OpenAI
import json
import os
from dotenv import load_dotenv

In [None]:
from openai import OpenAI
import json
import os
from dotenv import load_dotenv

# Populate the process environment (python-dotenv reads a local .env file when
# one is present) so the API key never has to be written into the notebook.
load_dotenv()

# OpenAI-compatible client pointed at the DeepSeek endpoint; the key is read
# from the DEEPSEEK_API_KEY environment variable.
client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=os.environ.get('DEEPSEEK_API_KEY'),
)

def get_sentiment_probabilities(question, api_client=None):
    """Ask the chat model for the sentiment probabilities of ``question``.

    Sends ``question`` as the user message to the ``deepseek-chat`` model,
    together with a system prompt requesting a JSON object with the keys
    ``negative``, ``neutral`` and ``positive``. The raw reply is printed for
    debugging and then decoded.

    Args:
        question: Text whose author's sentiment should be analysed.
        api_client: Optional object exposing ``chat.completions.create``.
            When omitted, the module-level ``client`` is used.

    Returns:
        The value decoded from the model's JSON reply (a dict when the model
        follows the requested format), or ``None`` when the reply has no text
        or cannot be decoded as JSON.
    """
    active_client = client if api_client is None else api_client
    response = active_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": """你是一个情感分析助手。请分析输入文本的作者的情感倾向，
             并返回以下格式的JSON：
             {"negative": 数值, "neutral": 数值, "positive": 数值}
             这三个数值之和应该等于1。"""},
            {"role": "user", "content": question},
        ],
        stream=False
    )

    content = response.choices[0].message.content
    print("API 返回的原始内容:", content)  # debug output of the raw reply
    print('-'*20)

    # A reply without any text cannot be parsed; bail out instead of failing
    # on ``None.strip()``.
    if content is None:
        return None

    text = content.strip()

    # Models often wrap JSON in a Markdown code fence (``` or ```json), and
    # sometimes add prose around it or omit the closing fence. Keep only the
    # fenced body rather than slicing fixed character counts.
    fence_start = text.find("```")
    if fence_start != -1:
        text = text[fence_start + 3:]
        if text[:4].lower() == "json":  # optional language tag after the fence
            text = text[4:]
        fence_end = text.find("```")
        if fence_end != -1:
            text = text[:fence_end]
        text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        print(f"JSON 解析错误: {e}")
        return None

# Smoke test: send one sample sentence through the API and show the parsed result.
question = "我感觉cbs这里的数据生成不算难呀"
probabilities = get_sentiment_probabilities(question)
print("处理后的结果:", probabilities, sep="\n")

In [None]:
import pandas as pd

# Remote location of the parquet file to label (an `hf://` URL, so this is a
# network read rather than a local file).
MIMIC_ATTITUDE_PARQUET_URL = "hf://datasets/tanoManzo/mimic_attitude_dataset/data/train-00000-of-00001-7c2ffc1915509998.parquet"

df = pd.read_parquet(MIMIC_ATTITUDE_PARQUET_URL)

In [None]:
import pandas as pd
from tqdm import tqdm

def analyze_df_sentiments(df):
    """
    Label every note in ``df`` with sentiment probabilities from the DeepSeek API.

    Each non-empty value in the second column is passed to
    ``get_sentiment_probabilities``; the returned ``negative`` / ``neutral`` /
    ``positive`` values are stored in new columns of the same names. Rows whose
    note is missing, empty or whitespace-only, and rows for which no dict
    result was obtained, keep ``None`` in all three columns.

    Args:
        df: DataFrame with notes in the second column. It is not modified.

    Returns:
        A copy of ``df`` with ``negative``, ``neutral`` and ``positive``
        columns added (object dtype, ``None`` where no result is available).
    """
    labels = ('negative', 'neutral', 'positive')
    row_count = len(df)

    # Results are collected by row position rather than by index label, so a
    # frame with duplicate or non-default index labels is still filled row by row.
    scores = {label: [None] * row_count for label in labels}

    # Read the note column directly instead of via iterrows(), which coerces
    # each row to a common dtype. A frame with no rows has nothing to label,
    # so the column is not touched in that case.
    notes = df.iloc[:, 1] if row_count else []

    for position, note in enumerate(tqdm(notes, total=row_count, desc="Analyzing sentiments")):
        # Skip missing, empty and whitespace-only notes: no API call is made.
        if pd.isna(note) or (isinstance(note, str) and not note.strip()):
            continue

        probabilities = get_sentiment_probabilities(note)

        # Only a non-empty dict counts as a usable result.
        if probabilities and isinstance(probabilities, dict):
            for label in labels:
                scores[label][position] = probabilities.get(label, None)

    df_labeled = df.copy()
    for label in labels:
        # Assign as a plain object array: positional (no index alignment) and
        # keeps None placeholders instead of converting them to NaN.
        df_labeled[label] = pd.Series(scores[label], dtype=object).to_numpy()

    return df_labeled

# Example usage:
# Assuming df is your DataFrame with notes in the second column
# df_labeled = analyze_df_sentiments(df)
# df_labeled.to_csv('sentiment_labeled_data.csv', index=False)

In [None]:
import pandas as pd

# Load your DataFrame if it's not already loaded
# df = pd.read_csv('your_data.csv')

# Number of independent labeling passes. Each pass re-queries the API for every
# note and is saved to its own numbered CSV (presumably so run-to-run variation
# in the model's answers can be compared).
N_RUNS = 5

for i in range(1, N_RUNS + 1):
    print(f"Running analysis iteration {i}/{N_RUNS}...")

    # Process the DataFrame
    df_labeled = analyze_df_sentiments(df)

    # Display sample of results for this iteration
    print(f"Results from iteration {i}:")
    print(df_labeled.head())

    # Save to file with incrementing number
    output_filename = f'sentiment_labeled_deepseekv3_{i}.csv'
    df_labeled.to_csv(output_filename, index=False)
    print(f"Saved results to {output_filename}")
    print("-" * 50)

print("All iterations completed!")