In [2]:
import functools
import os
import pickle
import sys

# Star import kept for backward compatibility; it is placed first among the
# third-party imports so every explicit import below takes precedence over it.
from scipy.stats import *

import numpy as np
import pandas as pd
import pymssql
from dateutil.relativedelta import relativedelta
from joblib import dump, load
from sqlalchemy import create_engine, text

In [None]:
# Smoke test: check that the database is reachable before running the heavy queries.
# NOTE(review): the fallback URL embeds credentials in the notebook. Prefer exporting
# TAQ_DB_URL_BASE (e.g. "mssql+pymssql://user:password@host") and removing the fallback
# before this notebook is shared or committed.
db_url_base = os.environ.get('TAQ_DB_URL_BASE', 'mssql+pymssql://gta:jjxy.2018@210.34.5.211')
connection_string = f'{db_url_base}/SEL1_TAQ_200506'
engine = create_engine(connection_string)

try:
    with engine.connect() as connection:
        # Raw SQL must be wrapped in text(); without the import this line raised
        # NameError, which the handler below reported as a connection failure.
        connection.execute(text("SELECT 1"))
        print("数据库连接成功！")
except Exception as e:
    print(f"数据库连接失败: {e}")
finally:
    # Release the pooled connection opened by this check.
    engine.dispose()

In [None]:
def _build_slot_volume_query(location, sticker, month):
    """Return the T-SQL that sums buy/sell volume per trading day and half-hour slot.

    The query reads Symbol, TradingDate, TradingTime, TradeVolume and BuyorSell from
    ``dbo.S{location}L1_TAQ_{sticker}_{month}`` and returns the columns
    Symbol, TimeSlot ("<TradingDate> <HH:MM>"), BuyVol and SellVol.

    CASE returns the first matching branch, so a tick exactly on a shared boundary
    (e.g. 10:00:00) is counted in the earlier slot, and the overlapping lower bounds
    (09:59:00, 10:29:00) have no effect. Ticks outside every range get a NULL slot
    and are dropped by the WHERE clause.

    NOTE(review): CAST(TradingDate AS VARCHAR(8)) presumably expects TradingDate to be
    stored as YYYYMMDD; a DATE-typed column would be truncated. TODO confirm.
    """
    return f"""
        WITH TradingData AS (
    SELECT 
        Symbol,
        TradingDate,
        CASE 
            WHEN CAST(TradingTime AS TIME) BETWEEN '09:29:00' AND '10:00:00' THEN '10:00'
            WHEN CAST(TradingTime AS TIME) BETWEEN '09:59:00' AND '10:30:00' THEN '10:30'
            WHEN CAST(TradingTime AS TIME) BETWEEN '10:29:00' AND '11:00:00' THEN '11:00'
            WHEN CAST(TradingTime AS TIME) BETWEEN '11:00:00' AND '11:30:00' THEN '11:30'
            WHEN CAST(TradingTime AS TIME) BETWEEN '13:00:00' AND '13:30:00' THEN '13:30'
            WHEN CAST(TradingTime AS TIME) BETWEEN '13:30:00' AND '14:00:00' THEN '14:00'
            WHEN CAST(TradingTime AS TIME) BETWEEN '14:00:00' AND '14:30:00' THEN '14:30'
            WHEN CAST(TradingTime AS TIME) BETWEEN '14:30:00' AND '15:01:00' THEN '15:00'
        END AS TimeSlot,
        TradeVolume,
        BuyorSell
    FROM 
        dbo.S{location}L1_TAQ_{sticker}_{month}
)
SELECT 
    Symbol,
    CAST(TradingDate AS VARCHAR(8)) + ' ' + TimeSlot AS TimeSlot,
    SUM(CASE WHEN BuyorSell = 'B' THEN TradeVolume ELSE 0 END) AS BuyVol,
    SUM(CASE WHEN BuyorSell = 'S' THEN TradeVolume ELSE 0 END) AS SellVol
FROM 
    TradingData
WHERE 
    TimeSlot IS NOT NULL  -- Exclude records that don’t fall into any time slot
GROUP BY 
    Symbol, TradingDate, TimeSlot
ORDER BY 
    TradingDate, TimeSlot;
        """


def get_data(stickers, month):
    """Fetch half-hourly buy/sell volume for each stock code from one monthly TAQ database.

    Connects to database ``SEL1_TAQ_{month}`` and runs one aggregation query per code.
    A failing query is logged and recorded; it does not abort the remaining codes.

    Parameters
    ----------
    stickers : iterable of str
        Stock codes. Codes starting with '0' or '3' are read from the ``SZ`` table,
        all others from the ``SH`` table.
    month : str
        Suffix interpolated into the database and table names (e.g. '200506').

    Returns
    -------
    all_data_list : list of pandas.DataFrame
        One frame per code whose query returned at least one row.
    error_info : dict
        ``{month: [sticker, ...]}`` with the codes whose query raised; empty if none did.

    Notes
    -----
    The server URL is read from the ``TAQ_DB_URL_BASE`` environment variable when set.
    NOTE(review): the fallback embeds credentials in the notebook; remove it once the
    environment variable is configured.
    """
    db_url_base = os.environ.get('TAQ_DB_URL_BASE', 'mssql+pymssql://gta:jjxy.2018@210.34.5.211')
    connection_string = f'{db_url_base}/SEL1_TAQ_{month}'
    engine = create_engine(connection_string)

    # Frames are collected in a list so the caller can concatenate once if needed.
    all_data_list = []
    # Failed codes, keyed by month.
    error_info = {}

    try:
        for sticker in stickers:
            # Shenzhen-listed codes start with '0' (main board) or '3' (ChiNext);
            # everything else is treated as Shanghai. Sending '3xxxxx' codes to the
            # SH table made every such query fail.
            location = 'Z' if sticker.startswith(('0', '3')) else 'H'
            sql = _build_slot_volume_query(location, sticker, month)

            try:
                df = pd.read_sql_query(sql, engine)
            except Exception as e:
                # Best effort: typically a missing table for this code/month.
                print(f"Error with sticker {sticker}: {e}")
                error_info.setdefault(month, []).append(sticker)
                continue

            if df.empty:
                print(f"No data for sticker {sticker}")
                continue

            all_data_list.append(df)
    finally:
        # Always release pooled connections, including when the cell is interrupted.
        engine.dispose()

    return all_data_list, error_info

In [None]:
# Load the component table; the export loop in this notebook reads its
# 'Components' and 'month' columns row by row.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
components = pd.read_pickle('monthly_components_slice_2.pkl')

In [None]:
# Export one joblib file per row of `components`, plus a log of the failed queries.
OUTPUT_DIR = 'hs300_data_new'
MIN_ROWS_PER_STOCK = 100  # frames with fewer aggregated rows are dropped before saving

# dump() does not create missing directories; create the target up front so the
# result of a long-running query is not lost to FileNotFoundError at save time.
os.makedirs(OUTPUT_DIR, exist_ok=True)

error_total = []
for index, row in components.iterrows():
    stickers = row['Components']
    month = row['month']
    data, error = get_data(stickers, month)
    # One dict per row is kept (empty when nothing failed) so the saved log
    # stays aligned with the rows of `components`.
    error_total.append(error)
    filtered_data = [df for df in data if len(df) >= MIN_ROWS_PER_STOCK]
    # The month value names the output file.
    data_name = f'{OUTPUT_DIR}/{month}.joblib'
    dump(filtered_data, data_name)

error_name = f'{OUTPUT_DIR}/error.joblib'
dump(error_total, error_name)