In [None]:
import pandas as pd
import os

def process_stock_data(data_dir, stock_name, output_dir, resample_periods=("1min", "5min", "15min", "30min", "60min")):
    """Merge a stock's raw tick files, resample them to bars and save labelled CSVs.

    Every ``.txt`` file found under ``data_dir`` (searched recursively) is read
    as comma-separated text and given a ``Datetime`` index built from its
    ``Date`` column (``YYYYMMDD``) plus its ``Time`` column (interpreted as a
    number of seconds). All files are merged into one frame. Rows with a
    non-positive price, share volume, monetary volume or trade count, and rows
    containing any NaN, are discarded. For every entry of ``resample_periods``
    the merged data is aggregated into bars, a binary ``FutureDirection`` label
    is attached and the result is written to
    ``{output_dir}/{stock_name}_resampled_{period}_with_labels.csv``.

    ``FutureDirection`` is 1 when the next bar closes above the current bar and
    0 otherwise. Bars whose next bar has no close (an empty bar follows, or the
    data ends) are dropped rather than labelled.

    Args:
        data_dir: Root folder holding the raw ``.txt`` files.
        stock_name: Ticker, used in log messages and output file names.
        output_dir: Folder for the output CSVs; created if it does not exist.
        resample_periods: Iterable of pandas offset aliases to resample to.

    Returns:
        None. Results are written to disk and a summary is printed. When no
        usable input file is found, a message is printed and nothing is written.
    """
    # Source column -> (aggregation within a bar, output column name).
    # NOTE(review): VWAP is the plain mean of the per-row values, not a
    # volume-weighted mean over the bar -- confirm this is intended.
    bar_spec = {
        "Last_Tick_Price": ("last", "ClosePrice"),
        "Average_Price": ("mean", "AvgPrice"),
        "Median_Price": ("mean", "MedianPrice"),
        "Average_Log_Price": ("mean", "LogAvgPrice"),
        "Total_Shares_Volume": ("sum", "TotalVolume"),
        "Total_Monetary_Volume": ("sum", "TotalAmount"),
        "Trade_Count": ("sum", "TradeCount"),
        "Volume_Weighted_Average_Price": ("mean", "VWAP"),
    }
    # Every column the pipeline touches must be present; otherwise the
    # aggregation further down fails with a KeyError after all files were read.
    required_cols = ["Date", "Time", *bar_spec]

    frames = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            try:
                df = pd.read_csv(file_path, delimiter=",")
                df.columns = df.columns.str.strip()

                missing = set(required_cols) - set(df.columns)
                if missing:
                    print(f"跳过文件 {file}，缺少关键列: {missing}")
                    continue

                df["Date"] = df["Date"].astype(str)
                df["Time"] = pd.to_timedelta(df["Time"].astype(int), unit="s")
                df["Datetime"] = pd.to_datetime(df["Date"], format="%Y%m%d") + df["Time"]
                frames.append(df.set_index("Datetime"))
            except Exception as e:
                # Best effort: a single unreadable file must not abort the whole run.
                print(f"处理文件 {file} 时出错：{e}")
                continue

    if not frames:
        # Nothing to merge (missing/empty folder or every file skipped).
        print(f"\n {stock_name}：未找到可用的数据文件，跳过")
        return

    # Concatenate once; growing a frame inside the loop is quadratic.
    df_all = pd.concat(frames)
    print(f"\n {stock_name}：共合并 {len(df_all)} 条原始记录")

    is_valid = (
        (df_all["Last_Tick_Price"] > 0)
        & (df_all["Total_Shares_Volume"] > 0)
        & (df_all["Total_Monetary_Volume"] > 0)
        & (df_all["Trade_Count"] > 0)
    )
    df_all = df_all[is_valid].sort_index().dropna()

    os.makedirs(output_dir, exist_ok=True)
    aggregations = {source: how for source, (how, _new_name) in bar_spec.items()}
    renames = {source: new_name for source, (_how, new_name) in bar_spec.items()}

    for period in resample_periods:
        # Rename by name (not by position) and pin the output column order.
        df_resampled = (
            df_all.resample(period)
            .agg(aggregations)
            .rename(columns=renames)[list(renames.values())]
        )

        close = df_resampled["ClosePrice"]
        next_close = close.shift(-1)
        # Keep the label undefined (NaN) when the next bar has no close, so the
        # dropna() below removes that row instead of recording a spurious 0.
        df_resampled["FutureDirection"] = (next_close > close).astype(float).where(next_close.notna())
        # dropna() also removes empty bars (their price columns are NaN).
        df_resampled = df_resampled.dropna().astype({"FutureDirection": int})

        print(f"\n {stock_name} - {period} 样本数：{len(df_resampled)}")
        print(df_resampled["FutureDirection"].value_counts())
        print(df_resampled["FutureDirection"].value_counts(normalize=True))

        # Save the labelled bars for this period.
        filename = f"{stock_name}_resampled_{period}_with_labels.csv"
        output_path = os.path.join(output_dir, filename)
        df_resampled.to_csv(output_path)
        print(f" 保存成功：{output_path}")

# Tickers to process; each ticker's raw files live in a sub-folder of the
# raw-data root that is named after the ticker.
data_root = "D:\\zangyujiao\\Desktop\\data"
stock_dirs = {ticker: f"{data_root}\\{ticker}" for ticker in ("GE", "IBM", "JPM", "PFE", "PG")}

# Common output root; every ticker gets its own sub-folder below it.
output_base = r"D:\zangyujiao\Desktop\resampled_output"

# Run the merge / resample / label pipeline for every ticker.
for stock_name in stock_dirs:
    folder = stock_dirs[stock_name]
    output_dir = os.path.join(output_base, stock_name)
    process_stock_data(folder, stock_name, output_dir)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import os
import warnings

# NOTE: this silences every warning for the rest of the kernel session,
# including pandas and scikit-learn deprecation/convergence warnings.
warnings.filterwarnings("ignore")

# Load the 1-minute labelled bars for GE; the first CSV column is the datetime index.
file_path = r"D:\zangyujiao\Desktop\resampled_output\GE\GE_resampled_1min_with_labels.csv"
df = pd.read_csv(file_path, index_col=0, parse_dates=True)

# Factors that would need High/Low prices have been removed; the remaining
# ones are built from the close price, volume, amount and trade count only.
windows = [3, 5, 10, 20]
for window in windows:
    df[f"momentum_{window}"] = df["ClosePrice"] / df["ClosePrice"].shift(window) - 1
    df[f"cum_ret_{window}"] = np.log(df["ClosePrice"]).diff().rolling(window).sum()
    df[f"bias_{window}"] = df["ClosePrice"] / df["ClosePrice"].rolling(window).mean() - 1
    df[f"vol_mom_{window}"] = df["TotalVolume"] / df["TotalVolume"].shift(window) - 1
    df[f"amt_mom_{window}"] = df["TotalAmount"] / df["TotalAmount"].shift(window) - 1
    df[f"tradecount_mom_{window}"] = df["TradeCount"] / df["TradeCount"].shift(window) - 1
    df[f"volatility_{window}"] = df["ClosePrice"].pct_change().rolling(window).std()

# Combined factors.
df["price_vol_combo"] = df["momentum_5"] * df["vol_mom_5"]
df["vol_price_divergence"] = df["vol_mom_5"] - df["momentum_5"]
df["mom_diff"] = df["momentum_20"] - df["momentum_5"]

# Drop the warm-up rows. The ratio factors become +/-inf when the lagged
# denominator is 0; dropna() alone keeps those and StandardScaler rejects
# non-finite input, so infinities are treated as missing first.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Factor columns are picked by substring match on the column name.
# NOTE(review): "volatility_<window>" matches none of these keywords, so it is
# computed above but not used as a factor -- confirm this is intended.
factor_cols = [col for col in df.columns if any(k in col for k in [
    "momentum", "bias", "ret", "vol_", "amt_", "tradecount", "combo", "diff"])]

# Chronological 70/30 train/test split. Independent copies are taken so that
# later column assignments on df_train/df_test never write into a view of df.
split_index = int(len(df) * 0.7)
df_train = df.iloc[:split_index].copy()
df_test = df.iloc[split_index:].copy()

# Standardise: the scaler is fitted on the training rows only and then applied to both parts.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(df_train[factor_cols]), columns=factor_cols, index=df_train.index)
X_test = pd.DataFrame(scaler.transform(df_test[factor_cols]), columns=factor_cols, index=df_test.index)

# Labels.
y_train = df_train["FutureDirection"]
y_test = df_test["FutureDirection"]

# Information coefficient per factor: Spearman rank correlation between the
# test-set factor values and the FutureDirection label.
ic_values = {col: stats.spearmanr(X_test[col], y_test)[0] for col in factor_cols}

# Order the factors by absolute IC, strongest first.
ranked_factors = sorted(ic_values, key=lambda name: abs(ic_values[name]), reverse=True)
sorted_ic = {name: ic_values[name] for name in ranked_factors}

# Bar chart of the IC values in that order.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=list(sorted_ic.keys()), y=list(sorted_ic.values()), ax=ax)
plt.xticks(rotation=45, ha="right")
ax.set_title("Information Coefficient (IC) for Factors")
ax.set_ylabel("Spearman IC")
fig.tight_layout()
ax.grid(True)
plt.show()

# Quantile analysis for the three factors with the largest absolute IC.
top3_factors = list(sorted_ic.keys())[:3]
group_returns = {}

for factor in top3_factors:
    # duplicates="drop": heavily tied factor values produce repeated quantile
    # edges, which qcut otherwise rejects with a ValueError. Fewer than five
    # buckets may result in that case.
    factor_quantile = pd.qcut(X_test[factor], q=5, labels=False, duplicates="drop")
    # assign() returns a new frame, so this never writes into a frame that may
    # be a view of another one (no SettingWithCopyWarning).
    df_test = df_test.assign(factor_quantile=factor_quantile)
    # Share of "up" labels within each factor bucket.
    group_returns[factor] = df_test.groupby("factor_quantile")["FutureDirection"].mean()

# One bar chart per factor.
for factor, ret in group_returns.items():
    plt.figure(figsize=(6, 4))
    sns.barplot(x=ret.index.astype(str), y=ret.values)
    plt.title(f"Future Up Probability by Factor Quantile: {factor}")
    # Name the real top bucket; it is 4 unless duplicate edges were dropped.
    plt.xlabel(f"Quantile (0=Lowest, {len(ret) - 1}=Highest)")
    plt.ylabel("Mean FutureDirection")
    plt.grid(True)
    plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Fit a logistic-regression classifier on the standardised training factors.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Test-set predictions: hard class labels and the probability column at index 1
# (used as the probability of an upward move).
test_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
y_prob = test_proba[:, 1]

# Evaluation metrics on the test set.
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
conf_matrix = confusion_matrix(y_test, y_pred)

print(
    " 多因子模型性能评估：",
    f"Accuracy: {acc:.4f}",
    f"AUC: {auc:.4f}",
    "\nConfusion Matrix:",
    conf_matrix,
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

# Coefficients ordered by absolute size, largest first.
coef_df = (
    pd.DataFrame({"Factor": X_train.columns, "Coefficient": model.coef_[0]})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)

# Horizontal bar chart of the coefficients.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=coef_df, x="Coefficient", y="Factor", ax=ax)
ax.set_title("Logistic Regression Coefficient Importance")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the 1-minute labelled bars for GE; the first CSV column is the datetime index.
file_path = r"D:\zangyujiao\Desktop\resampled_output\GE\GE_resampled_1min_with_labels.csv"
df = pd.read_csv(file_path, index_col=0, parse_dates=True)

# Return series shared by several factors; they do not depend on the window.
log_ret = np.log(df["ClosePrice"]).diff()
pct_ret = df["ClosePrice"].pct_change()

# Momentum-style factors, one set per look-back window.
windows = [3, 5, 10, 20]
for window in windows:
    lagged = df[["ClosePrice", "TotalVolume", "TotalAmount", "TradeCount"]].shift(window)
    df[f"momentum_{window}"] = df["ClosePrice"] / lagged["ClosePrice"] - 1
    df[f"cum_ret_{window}"] = log_ret.rolling(window).sum()
    df[f"bias_{window}"] = df["ClosePrice"] / df["ClosePrice"].rolling(window).mean() - 1
    df[f"vol_mom_{window}"] = df["TotalVolume"] / lagged["TotalVolume"] - 1
    df[f"amt_mom_{window}"] = df["TotalAmount"] / lagged["TotalAmount"] - 1
    df[f"tradecount_mom_{window}"] = df["TradeCount"] / lagged["TradeCount"] - 1
    df[f"volatility_{window}"] = pct_ret.rolling(window).std()

# Combined factors built from the 5- and 20-bar momentum factors.
df["price_vol_combo"] = df["momentum_5"] * df["vol_mom_5"]
df["vol_price_divergence"] = df["vol_mom_5"] - df["momentum_5"]
df["mom_diff"] = df["momentum_20"] - df["momentum_5"]

# Drop every row that still has a missing value (rolling/shift warm-up rows).
df = df.dropna()

# Factor columns are the ones whose name contains any of these keywords.
factor_keywords = ("momentum", "bias", "ret", "vol_", "amt_", "tradecount", "combo", "diff")
factor_cols = [col for col in df.columns if any(keyword in col for keyword in factor_keywords)]

# Chronological 70/30 split; only the training part is used in this block.
split_index = int(len(df) * 0.7)
df_train, df_test = df.iloc[:split_index], df.iloc[split_index:]

X_train = df_train[factor_cols]
y_train = df_train["FutureDirection"]

# Standardise the training factors.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Variable selection with LassoCV (5-fold cross-validation picks the
# regularisation strength) on the scaled training factors.
lasso = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)

# Factors whose coefficient is not exactly zero count as selected.
lasso_coef = pd.Series(lasso.coef_, index=factor_cols)
lasso_coef_filtered = lasso_coef[lasso_coef != 0]

if lasso_coef_filtered.empty:
    # Strong regularisation can shrink every coefficient to zero; a bar plot
    # cannot be drawn from an empty Series, so report the outcome instead.
    print("LassoCV shrank every coefficient to zero: no factor selected, nothing to plot.")
else:
    # Horizontal bar chart of the selected coefficients, smallest to largest.
    plt.figure(figsize=(10, 6))
    lasso_coef_filtered.sort_values().plot(kind='barh')
    plt.title("LassoCV Selected Factor Coefficients")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# 策略函数
def run_strategy_on_file(file_path, plot=True):
    """Back-test a simple long/short signal on one resampled CSV file.

    The file is read with its first column as a datetime index and must
    provide ``ClosePrice`` and ``TotalVolume``. For windows 3, 5, 10 and 20 the
    columns ``momentum_w``, ``bias_w`` and ``vol_mom_w`` are added. The
    ``alpha_score`` is ``0.4 * momentum_5 + 0.3 * bias_10 + 0.3 * vol_mom_10``
    with missing components counted as 0. ``signal`` is 1 above the 80%
    quantile of the score, -1 below the 20% quantile and 0 otherwise. The
    strategy return of a bar is the previous bar's signal times the bar's
    close-to-close return.

    NOTE(review): both quantile thresholds are taken over the whole file, so
    the signal of a bar also depends on later bars.

    Args:
        file_path: Path of the CSV file to process.
        plot: When true, draw cumulative strategy vs. buy-and-hold returns.

    Returns:
        The DataFrame with all added columns, or ``None`` if anything raised
        (the error is printed, not propagated).
    """
    try:
        df = pd.read_csv(file_path, index_col=0, parse_dates=True)
        close = df["ClosePrice"]
        volume = df["TotalVolume"]

        for window in (3, 5, 10, 20):
            df[f"momentum_{window}"] = close / close.shift(window) - 1
            df[f"bias_{window}"] = close / close.rolling(window).mean() - 1
            df[f"vol_mom_{window}"] = volume / volume.shift(window) - 1

        # Weighted blend of three factors; warm-up NaNs count as 0.
        momentum_part = 0.4 * df["momentum_5"].fillna(0)
        bias_part = 0.3 * df["bias_10"].fillna(0)
        volume_part = 0.3 * df["vol_mom_10"].fillna(0)
        df["alpha_score"] = momentum_part + bias_part + volume_part

        # +1 in the top fifth of scores, -1 in the bottom fifth, 0 in between.
        score = df["alpha_score"]
        upper = score.quantile(0.8)
        lower = score.quantile(0.2)
        df["signal"] = (score > upper).astype("int64") - (score < lower).astype("int64")

        # The signal is lagged by one bar before being applied to the bar return.
        bar_return = close.pct_change()
        df["strategy_return"] = df["signal"].shift(1) * bar_return
        df["cumulative_strategy_return"] = (1 + df["strategy_return"]).cumprod()
        df["buy_and_hold"] = (1 + bar_return).cumprod()

        if plot:
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.plot(df["cumulative_strategy_return"], label="Strategy")
            ax.plot(df["buy_and_hold"], label="Buy & Hold", linestyle='--')
            ax.set_title(f"Strategy vs Buy & Hold\n{os.path.basename(file_path)}")
            ax.set_xlabel("Time")
            ax.set_ylabel("Cumulative Return")
            ax.legend()
            ax.grid(True)
            fig.tight_layout()
            plt.show()

        return df

    except Exception as e:
        print(f" 文件 {file_path} 处理失败：{e}")
        return None
# Root folder that holds the resampled, labelled CSV files.
root_dir = r"D:\zangyujiao\Desktop\resampled_output"

# Back-test every labelled CSV found anywhere below the root folder.
for root, dirs, files in os.walk(root_dir):
    labelled_csvs = [name for name in files if name.endswith(".csv") and "with_labels" in name]
    for file in labelled_csvs:
        file_path = os.path.join(root, file)
        print(f"\n 正在处理：{file_path}")
        _ = run_strategy_on_file(file_path, plot=True)
