In [2]:
import pandas as pd


In [3]:
# Load data from a parquet file.
def load_data(file_path):
    """
    Read a parquet file into a pandas DataFrame.

    Any exception raised while reading is caught and reported with ``print``;
    in that case the function returns ``None`` instead of propagating the
    error, so callers should check the result before using it.

    Parameters:
    file_path (str): The path to the parquet file.

    Returns:
    pd.DataFrame or None: The loaded frame, or None when reading failed.
    """
    try:
        return pd.read_parquet(file_path)
    except Exception as err:
        print(f"Error loading data from {file_path}: {err}")
    return None

In [4]:
# load_data() reports a failed read by printing and returning None. Stop here
# with a clear error instead of failing in a later cell on a None subscript.
text_df = load_data("./mag7_text.parquet")
if text_df is None:
    raise RuntimeError("Could not load ./mag7_text.parquet; see the error printed above.")

In [5]:
# Peek at one sample row by position. .iloc states the positional intent
# explicitly; an integer key passed through Series.__getitem__ is only treated
# as a position via a deprecated fallback (presumably the source of the
# warning this cell used to emit - confirm the index is not integer-labelled).
text_df['AAPL'].iloc[100]

  text_df['AAPL'][100]


'经过前复权之后，2006-08-14 AAPL 开盘价是 1.92，收盘价是 1.92，成交额是 717620400'

In [6]:
import json
import os
import pandas as pd
from ollama import Client

# Ollama client constructed with no arguments, i.e. with the library's defaults.
ollama_client = Client()
# System prompt sent with every chat request in generate_texts_with_ollama.
# English gist: "You are a top stock trader; answer entirely in Chinese. Recall,
# for the year before the given point in time, the financial news and macro data
# that most affected the stock, plus deep insight into the company. Output a
# python dict and nothing else."
system_prompt = """
你是顶级股票操盘手，注意你需要全程使用中文。请回忆该特定时间点之前一年以内：对该股票影响最大的财经信息和宏观数据，以及对该公司的深度洞察。
输出格式是python dict。注意只能输出python dict格式的内容，不能输出其他内容。
"""

def save_progress(checkpoint_data, checkpoint_file="checkpoint.json"):
    """
    Persist the checkpoint dictionary to ``checkpoint_file`` as UTF-8 JSON.

    The JSON is first written to a sibling ``<checkpoint_file>.tmp`` file and
    then moved over the target with ``os.replace``. If serialisation fails or
    the process is interrupted mid-write, the previous checkpoint is left
    intact instead of being replaced by a truncated, unparseable file.

    Args:
        checkpoint_data: JSON-serialisable progress data.
        checkpoint_file: Destination path of the checkpoint.

    Raises:
        TypeError / OSError: propagated from ``json.dump`` or the file system;
            the existing checkpoint is not modified in that case.
    """
    tmp_file = f"{checkpoint_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint_data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, checkpoint_file)
    finally:
        # Only present here if the dump or the replace failed; after a
        # successful replace the temporary file no longer exists.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

def load_progress(checkpoint_file="checkpoint.json"):
    """
    Return the saved checkpoint, or a fresh empty one when no file exists.

    Args:
        checkpoint_file: Path of the JSON checkpoint to read.

    Returns:
        The parsed JSON content of ``checkpoint_file`` if the path exists,
        otherwise ``{"results": {}}``.
    """
    if not os.path.exists(checkpoint_file):
        return {"results": {}}
    with open(checkpoint_file, 'r', encoding='utf-8') as fh:
        stored = json.load(fh)
    return stored

def save_results(generated_df, output_file="output_results.json"):
    """
    Write the final results frame to a JSON file.

    The frame is serialised with ``orient='index'`` (one top-level entry per
    index label, each holding that row's columns), without ASCII escaping and
    with a 2-space indent.

    Args:
        generated_df: DataFrame of generated texts.
        output_file: Destination path of the JSON file.
    """
    json_options = {"orient": 'index', "force_ascii": False, "indent": 2}
    generated_df.to_json(output_file, **json_options)

def generate_texts_with_ollama(texts, timestamps, ticker, model="gemma3:4b", max_retries=3, batch_size=10, checkpoint_file="checkpoint.json"):
    """
    Generate one LLM response per input text via Ollama, checkpointing after every item.

    Entries already stored for ``ticker`` in the checkpoint are skipped (matched
    on ``str(timestamp)``), so an interrupted run can be resumed. A text whose
    request fails ``max_retries`` times is recorded with the placeholder
    ``"Failed to generate: <text>"`` and is therefore treated as processed on
    later runs.

    Args:
        texts: List of input texts
        timestamps: List of corresponding timestamps (same length and order as ``texts``)
        ticker: Current ticker being processed
        model: Ollama model name, default is "gemma3:4b"
        max_retries: Maximum number of attempts per text
        batch_size: Number of texts grouped under one progress message; must be >= 1
        checkpoint_file: File to save progress
    Returns:
        List of dictionaries ``{"timestamp": str, "generated": str}`` for
        ``ticker``: entries restored from the checkpoint followed by the ones
        produced in this call.
    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resume from whatever has already been stored for this ticker.
    checkpoint_data = load_progress(checkpoint_file)
    generated_texts = checkpoint_data["results"].get(ticker, [])
    processed_timestamps = {item["timestamp"] for item in generated_texts}

    # Filter text/timestamp pairs together so the two cannot drift out of alignment.
    pending = [
        (text, ts)
        for text, ts in zip(texts, timestamps)
        if str(ts) not in processed_timestamps
    ]
    # Ceiling division: "len // batch_size + 1" reports one batch too many
    # whenever the number of pending texts is an exact multiple of batch_size.
    total_batches = (len(pending) + batch_size - 1) // batch_size

    def _record(timestamp, content):
        """Append one result for ``ticker`` and persist the checkpoint immediately."""
        generated_texts.append({"timestamp": str(timestamp), "generated": content})
        checkpoint_data["results"][ticker] = generated_texts
        save_progress(checkpoint_data, checkpoint_file)

    for start in range(0, len(pending), batch_size):
        print(f"Processing batch {start // batch_size + 1} of {total_batches} for {ticker}...")

        for text, timestamp in pending[start:start + batch_size]:
            for attempt in range(max_retries):
                # Only the model call is retried. Recording happens outside the
                # try so that a checkpoint write error is not mistaken for a
                # model failure (which would re-query and append a duplicate).
                try:
                    response = ollama_client.chat(
                        model=model,
                        messages=[{"role": "system", "content": system_prompt},
                                  {"role": "user", "content": text}],
                        options={
                            "num_ctx": 1000,
                            "temperature": 0.01
                        }
                    )
                    content = response['message']['content']
                except Exception as e:
                    print(f"Error processing text for {ticker}: {e}. Attempt {attempt + 1}/{max_retries}")
                    if attempt == max_retries - 1:
                        # Out of attempts: store a placeholder so the run can move on.
                        _record(timestamp, f"Failed to generate: {text}")
                else:
                    _record(timestamp, content)
                    break

    return generated_texts

def _fill_generated(generated_df, ticker, items, label_by_key):
    """Copy checkpoint-style items into ``generated_df[ticker]``, matching rows by ``str(label)``."""
    for item in items:
        key = item["timestamp"]
        if key in label_by_key:
            generated_df.at[label_by_key[key], ticker] = item["generated"]


def process_text_data(text_df, batch_size=10, checkpoint_file="checkpoint.json", output_file="output_results.json"):
    """
    Process DataFrame and generate texts in batches with checkpointing

    For every ticker column, previously checkpointed results are restored, the
    non-null texts are passed to ``generate_texts_with_ollama`` (which skips
    timestamps it has already processed), and the results are written into a
    frame shaped like ``text_df``. The checkpoint is re-saved after each ticker
    and the final frame is written with ``save_results``.

    Checkpoint entries identify rows by ``str(index_label)``. They are mapped
    back to the real index labels before touching the frame, so the function
    also works when the index labels are not strings (e.g. integers).

    Args:
        text_df: DataFrame with timestamp index and ticker columns
        batch_size: Number of texts to process in one batch
        checkpoint_file: File to save progress
        output_file: File to save final results
    Returns:
        DataFrame with generated texts, same structure as input
    """
    # Output frame with the same index and columns as the input, initially empty.
    generated_df = pd.DataFrame(index=text_df.index, columns=text_df.columns)

    # Checkpoints store str(label); keep a lookup back to the actual label.
    label_by_key = {str(label): label for label in generated_df.index}

    # Load previously saved progress.
    checkpoint_data = load_progress(checkpoint_file)

    for ticker in text_df.columns:
        # Restore results generated in an earlier run.
        if ticker in checkpoint_data["results"]:
            print(f"Loading saved results for {ticker}...")
            _fill_generated(generated_df, ticker, checkpoint_data["results"][ticker], label_by_key)

        # Non-null texts and their index labels, in matching order.
        column = text_df[ticker]
        has_text = column.notna()
        texts = column[has_text].tolist()
        timestamps = text_df.index[has_text].tolist()

        if not texts:
            print(f"No valid texts for {ticker}, skipping...")
            continue

        print(f"Generating texts for {ticker}...")
        generated_texts = generate_texts_with_ollama(
            texts,
            timestamps,
            ticker,
            batch_size=batch_size,
            checkpoint_file=checkpoint_file
        )

        # Write the (restored + newly generated) results into the frame.
        _fill_generated(generated_df, ticker, generated_texts, label_by_key)

        # Rebuild this ticker's checkpoint entries from the frame, in index order.
        # NOTE(review): entries whose timestamp is not in text_df's index are
        # dropped here, as in the original implementation - confirm intended.
        checkpoint_data["results"][ticker] = [
            {"timestamp": key, "generated": generated_df.at[label, ticker]}
            for key, label in label_by_key.items()
            if pd.notna(generated_df.at[label, ticker])
        ]
        save_progress(checkpoint_data, checkpoint_file)

    # Save the final results.
    save_results(generated_df, output_file)
    return generated_df

# Run the full generation pass over every ticker column of text_df.
# text_df is assumed to be the input pandas DataFrame loaded earlier.
llm_output_texts = process_text_data(
    text_df, output_file="output_results.json", checkpoint_file="checkpoint.json", batch_size=4
)

Loading saved results for AAPL...
Generating texts for AAPL...
Loading saved results for MSFT...
Generating texts for MSFT...
Loading saved results for GOOGL...
Generating texts for GOOGL...
Loading saved results for AMZN...
Generating texts for AMZN...
Loading saved results for NVDA...
Generating texts for NVDA...
Loading saved results for META...
Generating texts for META...
Processing batch 1 of 217 for META...
Processing batch 2 of 217 for META...
Processing batch 3 of 217 for META...
Processing batch 4 of 217 for META...
Processing batch 5 of 217 for META...
Processing batch 6 of 217 for META...
Processing batch 7 of 217 for META...
Processing batch 8 of 217 for META...
Processing batch 9 of 217 for META...
Processing batch 10 of 217 for META...
Processing batch 11 of 217 for META...
Processing batch 12 of 217 for META...
Processing batch 13 of 217 for META...
Processing batch 14 of 217 for META...
Processing batch 15 of 217 for META...
Processing batch 16 of 217 for META...
Proce

In [7]:
# Load the JSON results file back into a DataFrame.
# The file is written as UTF-8 with non-ASCII text left unescaped, so the
# encoding is given explicitly rather than relying on the platform default.
import json
with open('./output_results.json', 'r', encoding='utf-8') as f:
    raw_results = json.load(f)
# Top-level JSON keys become columns and nested keys become the index.
data = pd.DataFrame(raw_results)
    

In [12]:
# Transpose so the JSON's top-level keys form the index, then convert that index
# to datetimes. The keys are strings of epoch milliseconds; cast them to int64
# first, because pd.to_datetime(<strings>, unit='ms') is deprecated and emits a
# FutureWarning.
df = data.T
df.index = pd.to_datetime(df.index.astype('int64'), unit='ms')

  df.index = pd.to_datetime(df.index, unit='ms')


In [19]:
# Write df to a parquet file; the relative path resolves against the current working directory.
df.to_parquet('mag7_aigc_results.parquet')

In [None]:
from openai import OpenAI
# Point the OpenAI client at vLLM's API server: the base URL targets port 8000
# on localhost and the API key is the placeholder string "EMPTY".
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Same system prompt text as used for the Ollama run.
system_prompt = """
你是顶级股票操盘手，注意你需要全程使用中文。请回忆该特定时间点之前一年以内：对该股票影响最大的财经信息和宏观数据，以及对该公司的深度洞察。
输出格式是python dict。注意只能输出python dict格式的内容，不能输出其他内容。
"""

# Single request with one hard-coded sample sentence as the user message.
chat_response = client.chat.completions.create(
    model="LLM-Research/gemma-3-4b-it",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "经过前复权之后，2006-08-14 AAPL 开盘价是 1.92，收盘价是 1.92，成交额是 717620400"},
    ],
    temperature=0.01,
    top_p=0.95,
    max_tokens=1600,
    extra_body={"top_k": 20},
)
print("Chat response:", chat_response)

In [None]:
# Print only the message text of the first choice in the response.
print(chat_response.choices[0].message.content)