In [56]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

#加载数据
def load_pkl(pkl_path):
    df = pd.read_pickle(pkl_path)
    df.index=pd.to_datetime(df.index)
    return df

def load_exclusion(pkl_path):
    exclusion_df=pd.read_pickle(pkl_path)
    new_dates=pd.to_datetime(exclusion_df.index.get_level_values('date'))
    new_assets=exclusion_df.index.get_level_values('asset').astype(str)
    exclusion_df.index = pd.MultiIndex.from_arrays([new_dates, new_assets], names=['date', 'asset'])
    return exclusion_df

#计算估值中枢稳定性因子
def calculate_stability_factor(base_df):
    rolling_mean=base_df.rolling(window=36,min_periods=12).mean()
    rolling_std=base_df.rolling(window=36,min_periods=12).std().replace(0, np.nan)
    stability_factor=rolling_mean/rolling_std
    return stability_factor

#由于没有找到经营活动产生的现金流量净额与企业价值的数据，此处只能用净现金流和市值替代计算这个因子。
def calculate_cfmv(cf_df,mcap_df):
    aligned_idx=cf_df.index.intersection(mcap_df.index)
    aligned_col=cf_df.columns.intersection(mcap_df.columns)
    cf_aligned=cf_df.loc[aligned_idx,aligned_col]
    mcap_aligned=mcap_df.loc[aligned_idx,aligned_col]
    cfmv_df=pd.DataFrame(np.nan,index=aligned_idx,columns=aligned_col)
    valid_mask_cf=cf_aligned>0
    valid_mask_mcap=mcap_aligned>1e-6
    combined_mask=valid_mask_cf&valid_mask_mcap
    cfmv_df[combined_mask]=cf_aligned[combined_mask]/mcap_aligned[combined_mask]
    return cfmv_df

#数据预处理
def data_process(factor,returns):
    df=pd.concat([factor.rename('factor'),returns.rename('return')],axis=1)
    df.dropna(inplace=True)
    if df.empty:
        return df
    df['factor']=winsorize(df['factor'],limits=[0.01, 0.01])
    return df

#数据分析：分组收益
def quantile_return(df):
    if df.empty or 'factor' not in df.columns or df['factor'].dropna().empty:
        return pd.Series([np.nan] * 10, index=pd.RangeIndex(start=1, stop=11, name='quantile'))
    try:
        df_copy = df.copy()
        df_copy.loc[:, 'quantile'] = pd.qcut(df_copy['factor'], 10, labels=False, duplicates='drop') + 1
        quantile_returns = df_copy.groupby('quantile')['return'].mean()
        return quantile_returns.reindex(pd.RangeIndex(start=1, stop=11, name='quantile'))
    except:
        return pd.Series([np.nan] * 10, index=pd.RangeIndex(start=1, stop=11, name='quantile'))

def quantile_return_series(factor,returns,blacklist,untradable):
    factor.columns=factor.columns.astype(str)
    returns.columns=returns.columns.astype(str)
    common_dates=factor.index.intersection(returns.index)
    all_dates=sorted(list(common_dates))
    quantile_returns_list=[]
    dates_quantile_series=[]
    for date in all_dates:
        factor_at_date_raw=factor.loc[date]
        returns_at_date_raw=returns.loc[date]
        initial_available_assets=factor_at_date_raw.dropna().index.intersection(returns_at_date_raw.dropna().index)
        excluded_today=set()
        if date<pd.to_datetime("2023-01-01"):
            if date in blacklist.index.get_level_values('date'):
                excluded_today.update(blacklist.xs(date,level='date').index.get_level_values('asset'))
            if date in untradable.index.get_level_values('date'):
                excluded_today.update(untradable.xs(date,level='date').index.get_level_values('asset'))
        assets_period=[asset for asset in initial_available_assets if asset not in excluded_today]
        factor_series=factor_at_date_raw.loc[assets_period].dropna()
        returns_series=returns_at_date_raw.loc[assets_period].dropna()
        final_common_assets= factor_series.index.intersection(returns_series.index)
        factor_series=factor_series.loc[final_common_assets]
        returns_series=returns_series.loc[final_common_assets]
        if factor_series.empty or returns_series.empty:
            continue
        data_single_period=data_process(factor_series, returns_series)
        q_returns=quantile_return(data_single_period)
        quantile_returns_list.append(q_returns)
        dates_quantile_series.append(date)
    if not quantile_returns_list:
        empty_idx = pd.to_datetime([]) # 创建一个空的 DatetimeIndex
        return pd.DataFrame(index=empty_idx, columns=[f'Q{i}' for i in range(1, 11)])
    returns_df=pd.concat(quantile_returns_list,axis=1).T
    returns_df.index=pd.to_datetime(dates_quantile_series)
    returns_df.columns=[f'Q{i}' for i in range(1, 11)]
    return returns_df

#数据分析：组合绩效
def portfolio_performance(returns_df):
    if not isinstance(returns_df, pd.Series):
     returns_df = pd.Series(returns_df)
    if returns_df.empty or returns_df.isnull().all():
        return pd.Series({
            'Annual Return': np.nan, 'Annual Std': np.nan,
            'Sharpe Ratio': np.nan, 'Max Drawdown': np.nan
        })
    if len(returns_df.dropna()) < 2: #
        return pd.Series({
            'Annual Return': np.nan, 'Annual Std': np.nan,
            'Sharpe Ratio': np.nan, 'Max Drawdown': np.nan
        })
    nav=(1+returns_df).cumprod()
    total_return=nav.iloc[-1]/nav.iloc[0]-1
    num_years=len(returns_df)/12
    annual_return=(1+total_return)**(1/num_years)-1
    annual_std=returns_df.std()* np.sqrt(12)
    sharpe_ratio=(annual_return)/annual_std
    peak=nav.cummax()
    drawdown=(nav-peak)/peak
    max_drawdown=drawdown.min()
    return pd.Series({
        'Annual Return': annual_return, 'Annual Std': annual_std,
        'Sharpe Ratio': sharpe_ratio, 'Max Drawdown': max_drawdown
    })

In [57]:
start_date=pd.to_datetime("2010-01-01")
end_date=pd.to_datetime("2023-01-01")

bp_df=load_pkl("data/book_value_to_total_mktcap_mrq.pkl")
ep_df=load_pkl("data/net_profit_to_total_mktcap_ttm.pkl")
cf_df=load_pkl("data/net_cash_flow_to_total_mktcap_ttm.pkl")
mcap_df=load_pkl("data/mcap.pkl")
returns_df=load_pkl("data/monthly_returns.pkl")

black_list=load_exclusion("data/BLACKLIST.pkl")
untradable=load_exclusion("data/UNTRADABLE.pkl")

bp_stability_factor_df=calculate_stability_factor(bp_df)
ep_df[ep_df<=0]=np.nan
ep_stability_factor_df=calculate_stability_factor(ep_df)
cfmv_df=calculate_cfmv(cf_df,mcap_df)
cfmv_stability_factor_df=calculate_stability_factor(cfmv_df)

#样本内
factors_is={
    'BP_Stability': bp_stability_factor_df[(bp_stability_factor_df.index>=start_date)&(bp_stability_factor_df.index<end_date)].copy(),
    'EP_Stability': ep_stability_factor_df[(ep_stability_factor_df.index>=start_date)&(ep_stability_factor_df.index<end_date)].copy(),
    'CFMV_Stability': cfmv_stability_factor_df[(cfmv_stability_factor_df.index>=start_date)&(cfmv_stability_factor_df.index<end_date)].copy()
}

stacked_factors_for_corr=[df.stack().rename(name) for name,df in factors_is.items()]
aligned_factors_for_corr_df=pd.concat(stacked_factors_for_corr,axis=1).dropna()
correlation_matrix=aligned_factors_for_corr_df.corr(method='spearman')
print(correlation_matrix.to_string())


                BP_Stability  EP_Stability  CFMV_Stability
BP_Stability          1.0000        0.3264          0.3961
EP_Stability          0.3264        1.0000          0.3235
CFMV_Stability        0.3961        0.3235          1.0000


In [58]:
# 定义样本外的时间范围
start_date_oos = pd.to_datetime("2023-01-01")
end_date_oos = pd.to_datetime("2024-08-01")

factors_oos = {
    'BP_Stability': bp_stability_factor_df[(bp_stability_factor_df.index >= start_date_oos) & (bp_stability_factor_df.index <= end_date_oos)].copy(),
    'EP_Stability': ep_stability_factor_df[(ep_stability_factor_df.index >= start_date_oos) & (ep_stability_factor_df.index <= end_date_oos)].copy(),
    'CFMV_Stability': cfmv_stability_factor_df[(cfmv_stability_factor_df.index >= start_date_oos) & (cfmv_stability_factor_df.index <= end_date_oos)].copy()
}

returns_oos = returns_df[(returns_df.index >= start_date_oos) & (returns_df.index <= end_date_oos)].copy()

quantile_returns_oos = {}
for factor_name, factor_df_oos in factors_oos.items():
    quantile_returns_df = quantile_return_series(
        factor=factor_df_oos,
        returns=returns_oos,
        blacklist=black_list,
        untradable=untradable
    )
    quantile_returns_oos[factor_name] = quantile_returns_df

ls_performance_results_list = []
# 遍历每个因子计算其多空组合表现
for factor_name, quantile_returns_df in quantile_returns_oos.items():
    q10_returns = quantile_returns_df['Q10']
    q1_returns = quantile_returns_df['Q1']
    long_short_returns = (q10_returns - q1_returns).dropna() # 移除结果中的NaN值
    performance_data = portfolio_performance(long_short_returns)
    performance_data.name = factor_name
    ls_performance_results_list.append(performance_data)

print("\n--- 各因子多空组合样本外表现汇总表 ---")
pd.options.display.float_format = '{:.4f}'.format
final_ls_performance_df = pd.concat(ls_performance_results_list, axis=1).T
print(final_ls_performance_df.to_string())


  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(



--- 各因子多空组合样本外表现汇总表 ---
                Annual Return  Annual Std  Sharpe Ratio  Max Drawdown
BP_Stability           0.3133      0.1071        2.9255       -0.0621
EP_Stability           0.2037      0.0867        2.3506       -0.0226
CFMV_Stability         0.2642      0.0965        2.7375       -0.0372


  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
