In [1]:
import multiprocessing as mp
import statsmodels.api as sm
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import os
import glob
import warnings
warnings.filterwarnings('ignore')

# Collect the path of every CSV file under the intraday data folder.
path = '../data/data_5m'
all_files = glob.glob(f"{path}/*.csv")

# Sample of Daily Correlation between close and volume

In [None]:
# Load a single file as a worked example (presumably one day of 5-minute bars,
# judging by the folder and file name — confirm) and display it.
sample_df = pd.read_csv('../data/data_5m/2021-01-04.csv')
sample_df

In [None]:

# Correlation between close and volume, computed separately for each code.
cor = sample_df.groupby('code').apply(lambda bars: bars['close'].corr(bars['volume']))
# Turn the per-code Series into a two-column frame: code, close_volume_corr.
cor_df = (
    cor.to_frame()
    .reset_index()
    .rename(columns={0: 'close_volume_corr'})
)
cor_df

# Calculate Factor
## Calculate the intraday correlation for each day

In [None]:
import ray
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import os
import glob
import warnings
warnings.filterwarnings('ignore')

# Start a local Ray runtime. ignore_reinit_error lets this cell be re-run
# without raising when Ray is already initialised in the kernel.
ray.init(ignore_reinit_error=True)

@ray.remote
def my_factor(filename, output_folder):
    """Compute each code's close/volume correlation for one intraday CSV.

    Reads ``filename``, correlates the ``close`` and ``volume`` columns within
    every ``code`` group, and writes ``code, close_volume_corr`` (sorted by
    ``code``) to ``<output_folder>/<date>.csv``, where ``<date>`` is the
    calendar date of the file's first ``datetime`` value.

    :param filename: path of the input CSV; it must provide ``datetime``,
        ``code``, ``close`` and ``volume`` columns.
    :param output_folder: directory for the output CSV (created if missing).
    :return: path of the CSV that was written.
    :raises ValueError: if the input file has no rows.
    """
    df = pd.read_csv(filename)
    if df.empty:
        # Name the offending file: a bare IndexError surfacing from a Ray
        # worker gives no hint about which input was bad.
        raise ValueError(f'{filename} contains no rows')

    # Only the first timestamp is needed, to name the output file.
    date = pd.to_datetime(df['datetime'].iloc[0]).strftime('%Y-%m-%d')

    # Per-code correlation between close and volume.
    result = (
        df.groupby('code')[['close', 'volume']]
        .apply(lambda bars: bars['close'].corr(bars['volume']))
        .rename('close_volume_corr')
        .reset_index()
    )

    # Create the output folder if it does not exist yet.
    os.makedirs(output_folder, exist_ok=True)

    # Write the CSV, ordered by code.
    filename_out = os.path.join(output_folder, f'{date}.csv')
    final = result.loc[:, ['code', 'close_volume_corr']].sort_values(by='code', ascending=True)
    final.to_csv(filename_out, index=False)
    return filename_out



path = '../data/data_5m'
output_folder = '../data/factors/CPV/close_volume_corr'

# Gather every CSV file in the input folder
all_files = glob.glob(path + "/*.csv")

s = time.time()

# Submit one Ray task per file, then block until every task has finished
results = [my_factor.remote(filename, output_folder) for filename in tqdm(all_files)]
ray.get(results)

e = time.time()
print(f'耗费 {e - s} 秒')


In [2]:
import ray

# Merge the newly generated field with the original daily-frequency fields

# Shut down any Ray runtime left over from an earlier cell before re-initialising
ray.shutdown()

daily_path = '../data/data_daily'
cvc_path = '../data/factors/CPV/close_volume_corr'
output_folder = '../data/factors/CPV/new_daily'

# Initialise Ray
ray.init(num_gpus=0, num_cpus=4)
@ray.remote
def my_factor(filename):
    """Join one day's close/volume correlation onto the daily data and save it.

    Reads ``<cvc_path>/<filename>`` and ``<daily_path>/<filename>``, inner-joins
    them on ``code`` and writes the result, sorted by ``code``, to
    ``<output_folder>/<date>.csv`` where ``<date>`` is the first ``date`` value
    of the daily file. ``cvc_path``, ``daily_path`` and ``output_folder`` are
    read from module scope.

    NOTE(review): this rebinds the name ``my_factor`` already used by the
    intraday-correlation task defined in an earlier cell.

    :param filename: base name of the CSV, shared by both input folders.
    :return: None.
    """
    min_path = cvc_path + '/' + filename        # field generated from the minute data
    day_path = daily_path + '/' + filename      # original daily-frequency data
    df_min = pd.read_csv(min_path)
    df_daily = pd.read_csv(day_path)
    date = df_daily['date'].values[0]

    # Keep only the codes present in both inputs, ordered by code
    final = pd.merge(df_min, df_daily, on='code', how='inner').sort_values(by='code', ascending=True)

    # Create the folder if it does not exist yet
    os.makedirs(output_folder, exist_ok=True)

    final.to_csv(os.path.join(output_folder, f'{date}.csv'), index=False)


s=time.time()

# Submit one merge task per file, identified by its base name, then wait for all of them.
# NOTE(review): all_files is not set in this cell; it is whatever an earlier cell left in scope.
res = [my_factor.remote(os.path.basename(file_name)) for file_name in tqdm(all_files)]
results = ray.get(res)

e=time.time()
print(f'耗费{e-s}秒')

# Ray has to be shut down so that this cell can be run again
ray.shutdown()

2024-12-31 11:15:00,093	INFO worker.py:1821 -- Started a local Ray instance.
100%|██████████| 243/243 [00:00<00:00, 8931.87it/s]


耗费3.2980921268463135秒


## Calculate PV_corr

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import glob
import warnings
warnings.filterwarnings('ignore')


# data
path = "../data/factors/CPV/new_daily"
all_files = glob.glob(path + "/*.csv")   # names of all the data files

# Read every file, stack the frames, then order the rows by code and date
li = [pd.read_csv(filename, index_col=None, header=0) for filename in tqdm(all_files)]
frame = pd.concat(li, axis=0, ignore_index=True)
result_df = (
    frame.sort_values(by=['code', 'date'], ascending=True)
    .reset_index(drop=True)
)

100%|██████████| 243/243 [00:01<00:00, 162.00it/s]


In [4]:
result_df

Unnamed: 0,code,close_volume_corr,date,open,close,low,high,volume,money,turnover_ratio
0,000001.XSHE,0.126032,2021-01-04,2306.70,2246.31,2226.99,2306.70,1286929.0,2.891682e+09,0.8009
1,000001.XSHE,0.089958,2021-01-05,2222.16,2194.38,2149.70,2231.82,1508123.0,3.284607e+09,0.9386
2,000001.XSHE,-0.216109,2021-01-06,2183.51,2362.25,2173.85,2362.25,1602181.0,3.648522e+09,0.9971
3,000001.XSHE,0.088208,2021-01-07,2357.42,2403.31,2322.40,2412.97,1311744.0,3.111275e+09,0.8163
4,000001.XSHE,0.569952,2021-01-08,2403.31,2397.27,2332.06,2427.47,989881.0,2.348316e+09,0.6160
...,...,...,...,...,...,...,...,...,...,...
1058890,689009.XSHG,0.277105,2021-12-27,65.00,67.08,63.56,68.43,3163564.0,2.086403e+08,0.7141
1058891,689009.XSHG,0.473949,2021-12-28,66.60,68.15,66.16,68.95,3561048.0,2.416030e+08,0.8039
1058892,689009.XSHG,0.520082,2021-12-29,68.50,67.58,66.64,69.68,3363471.0,2.282845e+08,0.7593
1058893,689009.XSHG,0.337465,2021-12-30,67.99,68.24,66.01,68.69,3059763.0,2.060038e+08,0.6907


### AVG

In [5]:
%%time 
# 20-row rolling mean of close_volume_corr, computed separately within each code group.
pv_corr_avg = result_df.groupby('code').apply(lambda x: x['close_volume_corr'].rolling(window=20).mean())

CPU times: total: 109 ms
Wall time: 704 ms


In [7]:
# Flatten the two-level index into columns, drop the unnamed second level,
# attach each row's date from result_df (aligned on the new row index) and
# give the three columns their final names.
pv_corr_avg = (
    pv_corr_avg
    .reset_index(level=[0, 1])
    .drop(columns=['level_1'])
    .assign(date=result_df['date'])
    .set_axis(['code', 'PV_corr_avg', 'date'], axis=1)
)

In [8]:
pv_corr_avg

Unnamed: 0,code,PV_corr_avg,date
0,000001.XSHE,,2021-01-04
1,000001.XSHE,,2021-01-05
2,000001.XSHE,,2021-01-06
3,000001.XSHE,,2021-01-07
4,000001.XSHE,,2021-01-08
...,...,...,...
1058890,689009.XSHG,0.139671,2021-12-27
1058891,689009.XSHG,0.145080,2021-12-28
1058892,689009.XSHG,0.185543,2021-12-29
1058893,689009.XSHG,0.183005,2021-12-30


In [9]:
# Convert the date column to datetime. df is an alias, so pv_corr_avg itself is converted too.
df = pv_corr_avg
df['date'] = pd.to_datetime(df['date'])

# Output folder and the columns involved
output_folder = '../data/factors/CPV/pv_corr_avg'
selected_columns = ['code', 'date', 'PV_corr_avg']

os.makedirs(output_folder, exist_ok=True)

# Write one CSV per date present in the frame. groupby splits the frame in a
# single pass, instead of probing every calendar day for membership and then
# re-scanning the whole frame with a boolean mask for each date.
for date, group in tqdm(df[selected_columns].groupby('date')):
    filename = os.path.join(output_folder, f'{date.strftime("%Y-%m-%d")}.csv')
    group = group.drop(columns=['date']).sort_values(by='code', ascending=True)
    group.to_csv(filename, index=False)

100%|██████████| 243/243 [00:05<00:00, 45.17it/s]


### STD

In [10]:
%%time
# 20-row rolling standard deviation of close_volume_corr, computed separately within each code group.
pv_corr_std = result_df.groupby('code').apply(lambda x: x['close_volume_corr'].rolling(20).std())

CPU times: total: 172 ms
Wall time: 727 ms


In [11]:
# Flatten the two-level index into columns, drop the unnamed second level,
# attach each row's date from result_df (aligned on the new row index) and
# give the three columns their final names.
pv_corr_std = (
    pv_corr_std
    .reset_index(level=[0, 1])
    .drop(columns=['level_1'])
    .assign(date=result_df['date'])
    .set_axis(['code', 'PV_corr_std', 'date'], axis=1)
)

In [13]:
# Convert the date column to datetime. df is an alias, so pv_corr_std itself is converted too.
df = pv_corr_std
df['date'] = pd.to_datetime(df['date'])

# Output folder and the columns involved
output_folder = '../data/factors/CPV/pv_corr_std'
selected_columns = ['code', 'date', 'PV_corr_std']

os.makedirs(output_folder, exist_ok=True)

# Write one CSV per date present in the frame. groupby splits the frame in a
# single pass, instead of probing every calendar day for membership and then
# re-scanning the whole frame with a boolean mask for each date.
for date, group in tqdm(df[selected_columns].groupby('date')):
    filename = os.path.join(output_folder, f'{date.strftime("%Y-%m-%d")}.csv')
    group = group.drop(columns=['date']).sort_values(by='code', ascending=True)
    group.to_csv(filename, index=False)

100%|██████████| 243/243 [00:05<00:00, 43.95it/s]


In [17]:
# Long -> wide: one row per date, one column per code, cells hold PV_corr_avg.
reshaped_pv_corr_avg = pv_corr_avg.pivot(index='date', columns='code', values='PV_corr_avg')
reshaped_pv_corr_avg

code,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,688787.XSHG,688788.XSHG,688789.XSHG,688793.XSHG,688798.XSHG,688799.XSHG,688800.XSHG,688819.XSHG,688981.XSHG,689009.XSHG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,,,,,,,,,,,...,,,,,,,,,,
2021-01-05,,,,,,,,,,,...,,,,,,,,,,
2021-01-06,,,,,,,,,,,...,,,,,,,,,,
2021-01-07,,,,,,,,,,,...,,,,,,,,,,
2021-01-08,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,-0.051534,0.163533,0.076077,0.135524,0.071639,,0.008492,0.045230,0.006421,0.109204,...,0.070097,0.046707,-0.032893,-0.025491,0.000388,0.022530,-0.028139,0.092187,0.105064,0.139671
2021-12-28,-0.066040,0.138621,0.085150,0.138100,0.083170,-0.072575,0.018580,0.051325,0.035796,0.106983,...,0.093073,0.019502,-0.031776,-0.019993,0.005323,0.045577,-0.054625,0.083481,0.152655,0.145080
2021-12-29,-0.064941,0.123917,0.085946,0.131455,0.071222,-0.029549,0.012524,0.038381,0.023810,0.093946,...,0.067301,0.019380,-0.013775,0.020364,-0.009274,0.082941,-0.030276,0.088906,0.124958,0.185543
2021-12-30,-0.071980,0.118641,0.067530,0.100831,0.064704,0.004830,0.015953,0.035000,0.029611,0.059875,...,0.085032,-0.010116,0.001732,0.003161,-0.038858,0.075015,-0.058201,0.059696,0.095351,0.183005


In [15]:
# Long -> wide: one row per date, one column per code, cells hold PV_corr_std.
reshaped_pv_corr_std = pv_corr_std.pivot(index='date', columns='code', values='PV_corr_std')
reshaped_pv_corr_std.tail()

code,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,688787.XSHG,688788.XSHG,688789.XSHG,688793.XSHG,688798.XSHG,688799.XSHG,688800.XSHG,688819.XSHG,688981.XSHG,689009.XSHG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-27,0.202257,0.309516,0.389075,0.209067,0.285244,,0.171103,0.358576,0.269714,0.284124,...,0.334939,0.351788,0.279952,0.3013,0.322441,0.278742,0.278025,0.242174,0.369552,0.302869
2021-12-28,0.18599,0.306472,0.401803,0.211675,0.283379,0.328597,0.180307,0.363932,0.279334,0.282978,...,0.333395,0.345592,0.278962,0.301194,0.325627,0.250095,0.280662,0.236865,0.39323,0.308041
2021-12-29,0.186048,0.293335,0.400993,0.211701,0.289547,0.318061,0.185009,0.35688,0.280728,0.292962,...,0.341331,0.345448,0.247895,0.292059,0.317483,0.242216,0.288391,0.234736,0.385008,0.301068
2021-12-30,0.196027,0.295273,0.384929,0.191055,0.290264,0.338227,0.189953,0.35455,0.27625,0.316843,...,0.35595,0.31163,0.23519,0.302678,0.286907,0.24552,0.278107,0.24403,0.3745,0.299479
2021-12-31,0.200579,0.298099,0.379766,0.190601,0.290517,0.343786,0.194373,0.345989,0.284278,0.323328,...,0.345623,0.309952,0.235366,0.299875,0.301818,0.240956,0.275679,0.234761,0.354345,0.305906


In [28]:
# Use forward slashes throughout: backslash separators only resolve on Windows.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
ret_df = pd.read_pickle('../data/factors/CPV/ret_df.pkl')
# 20-row rolling mean of ret_df.
mean_ret_20 = ret_df.rolling(20).mean()
# Relabel the rows positionally with the factor frame's date index.
# NOTE(review): assumes ret_df rows are in the same date order as reshaped_pv_corr_avg — confirm.
mean_ret_20.index = reshaped_pv_corr_avg.index

In [33]:
import statsmodels.api as sm
def cs_regression(df1, df2):
    """
    Run a cross-sectional regression for every row and return the residuals.

    For each row label, the row of ``df2`` (dependent variable) is regressed on
    the matching row of ``df1`` (independent variable) with an intercept, using
    the closed-form simple least-squares solution for all rows at once.

    Before fitting, ``inf``/``-inf`` are treated as missing, and missing cells
    are replaced by the mean of the remaining values in the same row. A row in
    which either input is entirely missing is not regressed; its residuals are
    all NaN. A row whose regressor is constant is fitted with the intercept
    only, so its residuals are the de-meaned dependent values.

    :param df1: DataFrame holding the independent variable.
    :param df2: DataFrame holding the dependent variable.
    :return: DataFrame of residuals with the index and columns of ``df1``.
    :raises ValueError: if the two frames do not share the same index or the
        same columns.
    """
    # Both axes must match, otherwise residuals would be paired with the wrong
    # regressor values or written under the wrong column labels.
    if not df1.index.equals(df2.index):
        raise ValueError("两个DataFrame的索引必须相同。")
    if not df1.columns.equals(df2.columns):
        raise ValueError("两个DataFrame的列必须相同。")

    if df1.shape[1] == 0:
        # Nothing to regress on.
        return pd.DataFrame(index=df1.index, columns=df1.columns, dtype=float)

    def _fill_rows(frame):
        # inf -> NaN, then NaN -> mean of the row's remaining values.
        # Also reports which rows had no usable value at all.
        values = frame.to_numpy(dtype=float, copy=True)
        values[~np.isfinite(values)] = np.nan
        missing = np.isnan(values)
        counts = (~missing).sum(axis=1, keepdims=True)
        with np.errstate(invalid='ignore', divide='ignore'):
            row_mean = np.where(missing, 0.0, values).sum(axis=1, keepdims=True) / counts
        return np.where(missing, row_mean, values), counts.ravel() == 0

    x, x_empty = _fill_rows(df1)
    y, y_empty = _fill_rows(df2)

    with np.errstate(invalid='ignore', divide='ignore'):
        x_centered = x - x.mean(axis=1, keepdims=True)
        y_centered = y - y.mean(axis=1, keepdims=True)
        sxx = (x_centered * x_centered).sum(axis=1)
        sxy = (x_centered * y_centered).sum(axis=1)
        # A constant regressor carries no slope information: keep slope = 0 there.
        usable = (x.max(axis=1) != x.min(axis=1)) & (sxx > 0)
        slope = np.divide(sxy, sxx, out=np.zeros_like(sxx), where=usable)
        # With an intercept, residual = (y - mean(y)) - slope * (x - mean(x)).
        residuals = y_centered - slope[:, None] * x_centered

    # Rows that could not be regressed are reported as all-NaN.
    residuals[x_empty | y_empty] = np.nan

    return pd.DataFrame(residuals, index=df1.index, columns=df1.columns)

In [34]:
# Row-by-row residuals of each factor frame after regressing it on mean_ret_20
# (mean_ret_20 is the independent variable, the factor frame the dependent one).
reshaped_pv_corr_avg_deRet20 = cs_regression(mean_ret_20, reshaped_pv_corr_avg)
reshaped_pv_corr_std_deRet20 = cs_regression(mean_ret_20, reshaped_pv_corr_std)

In [36]:
def cs_standard(df):
    """Standardise every row of ``df`` (cross-sectional z-score).

    Each row has its own mean subtracted and is then divided by its own
    standard deviation (pandas default ``std``). There is no guard for a row
    whose standard deviation is zero.

    :param df: DataFrame to standardise row by row.
    :return: DataFrame of the same shape holding the row-wise z-scores.
    """
    row_mean = df.mean(axis=1)
    row_std = df.std(axis=1)
    centered = df.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)

In [37]:
# Row-wise z-score of each residual frame.
standard_pv_corr_avg_deRet20 = cs_standard(reshaped_pv_corr_avg_deRet20)
standard_pv_corr_std_deRet20 = cs_standard(reshaped_pv_corr_std_deRet20)

In [39]:
# Element-wise sum of the two standardised residual frames; show the last rows.
pv_corr_deRet20 = standard_pv_corr_avg_deRet20 + standard_pv_corr_std_deRet20
pv_corr_deRet20.tail()

code,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,688787.XSHG,688788.XSHG,688789.XSHG,688793.XSHG,688798.XSHG,688799.XSHG,688800.XSHG,688819.XSHG,688981.XSHG,689009.XSHG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-27,-3.759025,1.306271,1.762981,-1.111563,-0.430749,0.013576,-3.709716,0.825729,-1.596526,0.042818,...,0.593357,0.663205,-1.857399,-1.387579,-0.581555,-1.196485,-1.936133,-0.952967,1.780822,0.829716
2021-12-28,-4.317275,0.854472,2.111817,-1.081537,-0.366984,-1.488831,-3.449062,0.990456,-1.035865,-0.06791,...,0.842518,0.13375,-1.868666,-1.380335,-0.493154,-1.528089,-2.329366,-1.213472,2.867206,0.935641
2021-12-29,-4.257892,0.435211,2.100736,-1.124507,-0.375523,-1.081226,-3.447286,0.721841,-1.13574,-0.006214,...,0.646608,0.177061,-2.197075,-1.005086,-0.81402,-1.15368,-1.83063,-1.14624,2.358509,1.335555
2021-12-30,-4.041682,0.514819,1.627227,-1.854372,-0.341579,-0.118674,-3.235179,0.736237,-1.042144,0.12296,...,1.266745,-0.808258,-2.162685,-0.950874,-1.719848,-1.111678,-2.267835,-1.246269,1.863705,1.371667
2021-12-31,-4.273643,0.657281,1.855748,-1.857442,-0.282448,-0.112043,-3.119478,0.297379,-0.953398,0.627644,...,0.910022,-1.001694,-2.257605,-1.124205,-1.946022,-0.988521,-2.112674,-1.348785,1.091513,1.713625


In [41]:
# Wide -> long: one row per (date, code) pair with the value in pv_corr_deRet20.
long_format_pv_corr_deRet20 = pv_corr_deRet20.reset_index().melt(id_vars=['date'], var_name='code', value_name='pv_corr_deRet20')
long_format_pv_corr_deRet20

Unnamed: 0,date,code,pv_corr_deRet20
0,2021-01-04,000001.XSHE,
1,2021-01-05,000001.XSHE,
2,2021-01-06,000001.XSHE,
3,2021-01-07,000001.XSHE,
4,2021-01-08,000001.XSHE,
...,...,...,...
1122412,2021-12-27,689009.XSHG,0.829716
1122413,2021-12-28,689009.XSHG,0.935641
1122414,2021-12-29,689009.XSHG,1.335555
1122415,2021-12-30,689009.XSHG,1.371667


In [43]:
# Convert the date column to datetime. df is an alias, so long_format_pv_corr_deRet20 itself is converted too.
df = long_format_pv_corr_deRet20
df['date'] = pd.to_datetime(df['date'])

# Output folder and the columns involved
output_folder = '../data/factors/CPV/pv_corr_deRet20'
selected_columns = ['code', 'date', 'pv_corr_deRet20']

os.makedirs(output_folder, exist_ok=True)

# Write one CSV per date present in the frame. groupby splits the frame in a
# single pass, instead of probing every calendar day for membership and then
# re-scanning the whole frame with a boolean mask for each date.
for date, group in tqdm(df[selected_columns].groupby('date')):
    filename = os.path.join(output_folder, f'{date.strftime("%Y-%m-%d")}.csv')
    group = group.drop(columns=['date']).sort_values(by='code', ascending=True)
    group.to_csv(filename, index=False)

100%|██████████| 243/243 [00:05<00:00, 41.71it/s]


In [46]:
def cs_standard(df):
    """Standardise every row of ``df`` (cross-sectional z-score).

    Each row has its own mean subtracted and is then divided by its own
    standard deviation (pandas default ``std``). There is no guard for a row
    whose standard deviation is zero.

    :param df: DataFrame to standardise row by row.
    :return: DataFrame of the same shape holding the row-wise z-scores.
    """
    row_mean = df.mean(axis=1)
    row_std = df.std(axis=1)
    centered = df.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)
# Row-wise z-score of the wide average and standard-deviation frames
# (the pivoted frames, not the regression residuals).
standard_pv_corr_avg = cs_standard(reshaped_pv_corr_avg)
standard_pv_corr_std = cs_standard(reshaped_pv_corr_std)

In [53]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm

def rolling_regression_for_stock(stock, stock_data, window=20):
    """
    Rolling time-series regression of one stock's column on a time index.

    For every run of ``window`` consecutive rows, the values are regressed on
    ``1, 2, ..., window`` with an intercept, and the slope is kept. Missing
    values inside a window are replaced by that window's mean before fitting;
    a window with no values at all yields a NaN slope.

    :param stock: column label of the stock in ``stock_data``.
    :param stock_data: DataFrame with one column per stock.
    :param window: number of rows per regression window (default 20).
    :return: tuple ``(stock, coefficients)`` where ``coefficients`` is a list
        of ``len(stock_data) - window + 1`` slopes (empty if there are fewer
        rows than ``window``).
    :raises ValueError: if ``window`` is smaller than 2.
    """
    if window < 2:
        raise ValueError("window must be at least 2 to fit a slope")

    stock_df = stock_data[stock]

    # The regressor is identical for every window, so centre it once.
    # With an intercept, slope = sum((t - mean(t)) * y) / sum((t - mean(t)) ** 2).
    time_index = np.arange(1, window + 1, dtype=float)
    time_centered = time_index - time_index.mean()
    denominator = float(time_centered @ time_centered)

    coefficients = []
    for start in range(len(stock_df) - window + 1):
        values = stock_df.iloc[start:start + window]
        # Fill gaps in the window with the window's own mean
        y = values.fillna(values.mean()).to_numpy(dtype=float)
        coefficients.append(float(time_centered @ y) / denominator)

    return stock, coefficients

def rolling_time_series_regression(stock_data):
    """
    Run the rolling time-series regression for every column of ``stock_data``
    (single-process version).

    :param stock_data: DataFrame with one column per stock.
    :return: DataFrame of regression coefficients, one column per stock,
        indexed by ``stock_data.index[19:]``.
    """
    # tqdm is only there to show progress over the columns
    per_stock = dict(
        rolling_regression_for_stock(ticker, stock_data)
        for ticker in tqdm(stock_data.columns, desc="Rolling regression")
    )

    # The result is 19 rows shorter than the input
    return pd.DataFrame(per_stock, index=stock_data.index[19:])


In [47]:
# Element-wise sum of the standardised average and standard-deviation frames.
pv_corr = standard_pv_corr_avg + standard_pv_corr_std

In [54]:
%%time
# Rolling regression coefficient of pv_corr for every column.
pv_corr_trend = rolling_time_series_regression(pv_corr)

Rolling regression: 100%|██████████| 4619/4619 [03:14<00:00, 23.71it/s]

CPU times: total: 42 s
Wall time: 3min 14s





In [55]:
# Wide -> long: one row per (date, code) pair with the value in pv_corr_trend.
long_format_pv_corr_trend = pv_corr_trend.reset_index().melt(id_vars=['date'], var_name='code', value_name='pv_corr_trend')
long_format_pv_corr_trend.tail()

Unnamed: 0,date,code,pv_corr_trend
1034651,2021-12-27,689009.XSHG,-0.03196
1034652,2021-12-28,689009.XSHG,-0.051798
1034653,2021-12-29,689009.XSHG,-0.06819
1034654,2021-12-30,689009.XSHG,-0.081003
1034655,2021-12-31,689009.XSHG,-0.087099


In [58]:
# Convert the date column to datetime. df is an alias, so long_format_pv_corr_trend itself is converted too.
df = long_format_pv_corr_trend
df['date'] = pd.to_datetime(df['date'])

# Output folder and the columns involved
output_folder = '../data/factors/CPV/pv_corr_trend'
selected_columns = ['code', 'date', 'pv_corr_trend']

os.makedirs(output_folder, exist_ok=True)

# Write one CSV per date present in the frame. groupby splits the frame in a
# single pass, instead of probing every calendar day for membership and then
# re-scanning the whole frame with a boolean mask for each date.
for date, group in tqdm(df[selected_columns].groupby('date')):
    filename = os.path.join(output_folder, f'{date.strftime("%Y-%m-%d")}.csv')
    group = group.drop(columns=['date']).sort_values(by='code', ascending=True)
    group.to_csv(filename, index=False)

100%|██████████| 224/224 [00:05<00:00, 42.90it/s]


In [59]:
# Row-wise z-score of the residual-based factor.
standard_pv_corr_deRet20 = cs_standard(pv_corr_deRet20)
# Row-wise z-score of the trend with its sign flipped.
standard_pv_corr_trend = cs_standard(-pv_corr_trend)
# Skip the first 19 rows of the residual-based factor before adding; the
# rolling regression output is built on stock_data.index[19:] and so starts there.
cpv = standard_pv_corr_deRet20.iloc[19:] + standard_pv_corr_trend

In [60]:
# Wide -> long: one row per (date, code) pair with the value in cpv.
long_format_cpv = cpv.reset_index().melt(id_vars=['date'], var_name='code', value_name='cpv')
long_format_cpv.tail()

Unnamed: 0,date,code,cpv
1034651,2021-12-27,689009.XSHG,0.937497
1034652,2021-12-28,689009.XSHG,1.224245
1034653,2021-12-29,689009.XSHG,1.679855
1034654,2021-12-30,689009.XSHG,1.824526
1034655,2021-12-31,689009.XSHG,2.126021


In [61]:
# Convert the date column to datetime. df is an alias, so long_format_cpv itself is converted too.
df = long_format_cpv
df['date'] = pd.to_datetime(df['date'])

# Output folder and the columns involved
output_folder = '../data/factors/CPV/CPV'
selected_columns = ['code', 'date', 'cpv']

os.makedirs(output_folder, exist_ok=True)

# Write one CSV per date present in the frame. groupby splits the frame in a
# single pass, instead of probing every calendar day for membership and then
# re-scanning the whole frame with a boolean mask for each date.
for date, group in tqdm(df[selected_columns].groupby('date')):
    filename = os.path.join(output_folder, f'{date.strftime("%Y-%m-%d")}.csv')
    group = group.drop(columns=['date']).sort_values(by='code', ascending=True)
    group.to_csv(filename, index=False)

100%|██████████| 224/224 [00:05<00:00, 43.70it/s]
