In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy import stats

假设使用如下数据框 `df`（包含股票代码、日期、收益率、因子值、市值、行业）。  
列结构示例：['stock_id', 'date', 'return', 'factor', 'market_value', 'industry']

In [2]:
# Synthetic test data: 10 stocks observed on every calendar day of 2020.

np.random.seed(0)  # fixed seed so the random columns are reproducible

stock_ids = list('ABCDEFGHIJ')
industries = ['Tech', 'Tech', 'Finance', 'Finance', 'Energy', 'Energy', 'Tech', 'Finance', 'Energy', 'Tech']

dates = pd.date_range(start="2020-01-01", end="2020-12-31", freq='D')
n_days = len(dates)
n_rows = n_days * len(stock_ids)

# The draws stay in the order return -> factor -> market_value because each
# call advances the seeded global generator.
random_return = np.random.rand(n_rows) - 0.5
random_factor = np.random.rand(n_rows)
random_market_value = np.random.rand(n_rows) * 10000

data = {
    'stock_id': np.tile(stock_ids, n_days),
    'date': np.repeat(dates, len(stock_ids)),
    'return': random_return,
    'factor': random_factor,
    'market_value': random_market_value,
    'industry': np.tile(industries, n_days)
}
df = pd.DataFrame(data)
df.head(30)

Unnamed: 0,stock_id,date,return,factor,market_value,industry
0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech
1,B,2020-01-01,0.215189,0.904397,1255.356503,Tech
2,C,2020-01-01,0.102763,0.294879,4878.892942,Finance
3,D,2020-01-01,0.044883,0.308864,6424.258071,Finance
4,E,2020-01-01,-0.076345,0.627183,5696.041114,Energy
5,F,2020-01-01,0.145894,0.505192,8894.943101,Energy
6,G,2020-01-01,-0.062413,0.195869,6840.334557,Tech
7,H,2020-01-01,0.391773,0.144727,8250.33131,Finance
8,I,2020-01-01,0.463663,0.489497,6621.259107,Energy
9,J,2020-01-01,-0.116558,0.779391,3442.434722,Tech


### 标准化、缩尾极值处理

In [3]:
def winsorize(group, scale=5):
    """Clip the ``factor`` column to ``median ± scale * MAD``.

    MAD is the median of the absolute deviations of ``factor`` from its
    median, computed on the rows of ``group`` only.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``factor`` column (one groupby slice).
    scale : float, default 5
        Multiplier applied to the MAD to obtain the clipping range.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` whose ``factor`` values are limited to the range;
        the frame passed in is left unmodified.
    """
    # Work on a copy: writing into the slice handed over by groupby-apply
    # mutates the caller's data and can trigger SettingWithCopy warnings.
    clipped = group.copy()
    factor = clipped['factor']

    median = factor.median()
    mad = (factor - median).abs().median()

    lower_bound = median - scale * mad
    upper_bound = median + scale * mad

    # Vectorized clip replaces the per-element Python min/max lambda.
    clipped['factor'] = factor.clip(lower=lower_bound, upper=upper_bound)
    return clipped

In [4]:
def standardize(group):
    """Z-score the ``factor`` column within ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``factor`` column (one groupby slice).

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with ``factor`` replaced by
        ``(factor - mean) / std`` (sample standard deviation). When the
        standard deviation is zero or undefined (constant values, or fewer
        than two observations) every non-missing value becomes 0.0 instead
        of NaN/inf. The frame passed in is left unmodified.
    """
    standardized = group.copy()
    factor = standardized['factor']

    centered = factor - factor.mean()
    std = factor.std()

    if std > 0:
        standardized['factor'] = centered / std
    else:
        # std is 0 or NaN: dividing would wipe the cross-section out with
        # NaN/inf. Multiplying by 0.0 gives zeros while keeping missing
        # inputs missing.
        standardized['factor'] = centered * 0.0
    return standardized

In [5]:
# Inspect the per-date standard deviation of the raw factor.
df.groupby('date')['factor'].std()

date
2020-01-01    0.247641
2020-01-02    0.302185
2020-01-03    0.331095
2020-01-04    0.326619
2020-01-05    0.217765
                ...   
2020-12-27    0.251559
2020-12-28    0.342623
2020-12-29    0.320186
2020-12-30    0.234967
2020-12-31    0.233585
Name: factor, Length: 366, dtype: float64

In [6]:
# Show the first rows of the factor for every date (GroupBy.head defaults to 5 rows per group).
df.groupby('date')['factor'].head()

0       0.535903
1       0.904397
2       0.294879
3       0.308864
4       0.627183
          ...   
3650    0.643393
3651    0.306043
3652    0.864741
3653    0.593604
3654    0.388446
Name: factor, Length: 1830, dtype: float64

In [7]:
# Winsorize the factor within each date (cross-sectionally) using winsorize().
# As the displayed output shows, the result keeps `date` both as an index level and as a column.
df_grouped = df.groupby('date').apply(winsorize)
df_grouped.head(30)




Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,date,return,factor,market_value,industry
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech
2020-01-01,1,B,2020-01-01,0.215189,0.904397,1255.356503,Tech
2020-01-01,2,C,2020-01-01,0.102763,0.294879,4878.892942,Finance
2020-01-01,3,D,2020-01-01,0.044883,0.308864,6424.258071,Finance
2020-01-01,4,E,2020-01-01,-0.076345,0.627183,5696.041114,Energy
2020-01-01,5,F,2020-01-01,0.145894,0.505192,8894.943101,Energy
2020-01-01,6,G,2020-01-01,-0.062413,0.195869,6840.334557,Tech
2020-01-01,7,H,2020-01-01,0.391773,0.144727,8250.33131,Finance
2020-01-01,8,I,2020-01-01,0.463663,0.489497,6621.259107,Energy
2020-01-01,9,J,2020-01-01,-0.116558,0.779391,3442.434722,Tech


In [8]:
# Flatten the index left by the previous groupby-apply, then standardize the
# winsorized factor within each date using standardize().
df_standardized = df_grouped.reset_index(drop=True).groupby('date').apply(standardize)
df_standardized.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,date,return,factor,market_value,industry
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01,0,A,2020-01-01,0.048814,0.231436,9156.60849,Tech
2020-01-01,1,B,2020-01-01,0.215189,1.719449,1255.356503,Tech
2020-01-01,2,C,2020-01-01,0.102763,-0.741843,4878.892942,Finance
2020-01-01,3,D,2020-01-01,0.044883,-0.685371,6424.258071,Finance
2020-01-01,4,E,2020-01-01,-0.076345,0.600033,5696.041114,Energy
2020-01-01,5,F,2020-01-01,0.145894,0.107418,8894.943101,Energy
2020-01-01,6,G,2020-01-01,-0.062413,-1.141656,6840.334557,Tech
2020-01-01,7,H,2020-01-01,0.391773,-1.348171,8250.33131,Finance
2020-01-01,8,I,2020-01-01,0.463663,0.044043,6621.259107,Energy
2020-01-01,9,J,2020-01-01,-0.116558,1.214662,3442.434722,Tech


### 行业中性和市值中性处理

In [9]:
def neutralize(group):
    """Remove size and industry exposure from ``factor`` by OLS.

    ``factor`` is regressed on a constant, the natural log of
    ``market_value`` and one dummy column per ``industry`` value; the
    regression residual is the neutralized factor.

    Parameters
    ----------
    group : pandas.DataFrame
        One cross-section with ``factor``, ``market_value`` and ``industry``
        columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with an additional ``factor_n`` column holding
        the residuals (not a bare Series). Rows whose factor is missing or
        whose market value is missing or not positive are left out of the
        fit and receive NaN. The frame passed in is left unmodified.
    """
    neutralized = group.copy()
    factor = neutralized['factor'].astype(float)
    market_value = neutralized['market_value'].astype(float)

    # log() needs a positive market value and the regression cannot take
    # NaN/inf, so fit on the usable rows only instead of failing the whole date.
    usable = (factor.notna() & (market_value > 0)).to_numpy()
    neutralized['factor_n'] = np.nan
    if not usable.any():
        return neutralized

    ln_market_value = np.log(market_value[usable])
    industry_dummies = pd.get_dummies(neutralized.loc[usable, 'industry'])

    # Design matrix: constant + log market value + industry dummies.
    # NOTE(review): a full dummy set plus a constant is collinear; the design
    # is kept as in the original on the understanding that statsmodels'
    # default pseudo-inverse fit still yields well-defined residuals — confirm.
    X = sm.add_constant(pd.concat([ln_market_value, industry_dummies], axis=1))
    model = sm.OLS(factor[usable], X.astype(float)).fit()

    # Assign positionally so the write does not depend on index alignment.
    neutralized.loc[usable, 'factor_n'] = model.resid.to_numpy()
    return neutralized

In [10]:
# Flatten the index again and run neutralize() date by date; as the displayed
# output shows, the result gains a `factor_n` column.
df_neutralize = df_standardized.reset_index(drop=True).groupby('date').apply(neutralize)
df_neutralize.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,date,return,factor,market_value,industry,factor_n
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-01,0,A,2020-01-01,0.048814,0.231436,9156.60849,Tech,0.633065
2020-01-01,1,B,2020-01-01,0.215189,1.719449,1255.356503,Tech,-0.093795
2020-01-01,2,C,2020-01-01,0.102763,-0.741843,4878.892942,Finance,-0.11414
2020-01-01,3,D,2020-01-01,0.044883,-0.685371,6424.258071,Finance,0.249043
2020-01-01,4,E,2020-01-01,-0.076345,0.600033,5696.041114,Energy,0.128007
2020-01-01,5,F,2020-01-01,0.145894,0.107418,8894.943101,Energy,0.132205
2020-01-01,6,G,2020-01-01,-0.062413,-1.141656,6840.334557,Tech,-1.065102
2020-01-01,7,H,2020-01-01,0.391773,-1.348171,8250.33131,Finance,-0.134903
2020-01-01,8,I,2020-01-01,0.463663,0.044043,6621.259107,Energy,-0.260212
2020-01-01,9,J,2020-01-01,-0.116558,1.214662,3442.434722,Tech,0.525832


### 多空组合分组回测

每月调仓（rebalance）：此处以 20 天的数据近似一个月。

In [11]:
group_num = 5 # number of quantile groups the stocks are sorted into
interval = 20 # treat 20 days of data as one month

In [12]:
# Use the date as the index so the frame can be resampled afterwards.
# Rebinding instead of inplace=True, together with the column check, keeps
# this cell safe to re-run: once `date` has become the index a second
# set_index('date') would raise a KeyError.
if 'date' in df_neutralize.columns:
    df_neutralize = df_neutralize.set_index('date')
df_neutralize.head()

Unnamed: 0_level_0,stock_id,return,factor,market_value,industry,factor_n
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,A,0.048814,0.231436,9156.60849,Tech,0.633065
2020-01-01,B,0.215189,1.719449,1255.356503,Tech,-0.093795
2020-01-01,C,0.102763,-0.741843,4878.892942,Finance,-0.11414
2020-01-01,D,0.044883,-0.685371,6424.258071,Finance,0.249043
2020-01-01,E,-0.076345,0.600033,5696.041114,Energy,0.128007


In [13]:
# Split each stock's rows into consecutive `interval`-day bins (20 days stand in
# for one month) and keep the first row of every bin as that period's snapshot.
# NOTE(review): the original comment described this as month-end data, but
# `.first()` selects the earliest row of each bin — confirm which is intended.
df_monthly = df_neutralize.groupby('stock_id').resample(str(interval)+'d').first()

In [14]:
df_monthly

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,2020-01-01,A,0.048814,0.231436,9156.608490,Tech,0.633065
A,2020-01-21,A,-0.188204,0.222321,4284.011695,Tech,0.050511
A,2020-02-10,A,-0.098740,-0.575279,6196.886446,Tech,-0.500237
A,2020-03-01,A,-0.325342,0.173809,426.656544,Tech,0.271094
A,2020-03-21,A,-0.460007,-0.610862,9193.473437,Tech,-0.537282
...,...,...,...,...,...,...,...
J,2020-10-07,J,-0.202415,1.344008,8670.986826,Tech,0.953131
J,2020-10-27,J,0.122927,0.614455,8033.251085,Tech,0.947205
J,2020-11-16,J,-0.225405,0.106524,8443.880346,Tech,0.205651
J,2020-12-06,J,-0.194722,-0.720655,7650.525317,Tech,-0.153462


In [15]:
# Sort the stocks of each date into `group_num` groups by neutralized factor.
# The values are ranked with method='first' before binning: ranks are always
# unique, so tied factor values can no longer collapse bin edges. With raw
# values, `duplicates='drop'` could silently produce fewer than `group_num`
# labels, and a later filter on `quantile == group_num - 1` would then miss
# the top group. For distinct values the labels are the same as before.
df_monthly['quantile'] = df_monthly.groupby('date')['factor_n'].transform(
    lambda x: pd.qcut(x.rank(method='first'), group_num, labels=False, duplicates='drop')
)

In [16]:
df_monthly.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n,quantile
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2020-01-01,A,0.048814,0.231436,9156.60849,Tech,0.633065,4
A,2020-01-21,A,-0.188204,0.222321,4284.011695,Tech,0.050511,2
A,2020-02-10,A,-0.09874,-0.575279,6196.886446,Tech,-0.500237,1
A,2020-03-01,A,-0.325342,0.173809,426.656544,Tech,0.271094,2
A,2020-03-21,A,-0.460007,-0.610862,9193.473437,Tech,-0.537282,1
A,2020-04-10,A,0.09288,-1.158827,5624.416131,Tech,-0.45044,1
A,2020-04-30,A,-0.191472,0.482469,2829.793171,Tech,-0.316896,1
A,2020-05-20,A,-0.186409,0.581532,6963.224087,Tech,0.676783,3
A,2020-06-09,A,0.419171,0.846104,9006.777412,Tech,-0.217467,1
A,2020-06-29,A,0.04695,0.003937,1459.527948,Tech,0.419389,3


In [17]:
# Keep only the highest and the lowest quantile group
df_high = df_monthly[df_monthly['quantile'] == group_num - 1]  # highest group
df_low = df_monthly[df_monthly['quantile'] == 0]  # lowest group

In [18]:
df_high.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n,quantile
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2020-01-01,A,0.048814,0.231436,9156.60849,Tech,0.633065,4
A,2020-07-19,A,0.311518,0.957322,9695.786738,Tech,1.189879,4
A,2020-09-17,A,-0.354052,1.307748,5894.545836,Tech,1.229533,4
A,2020-10-27,A,-0.086038,0.100592,2574.445784,Tech,0.785103,4
A,2020-12-06,A,-0.004025,0.945308,2999.053572,Tech,1.612502,4
C,2020-04-10,C,-0.024174,1.51769,7536.367849,Finance,1.145244,4
C,2020-04-30,C,0.388265,0.473789,4939.689221,Finance,0.905085,4
C,2020-05-20,C,-0.298733,2.186083,4938.824657,Finance,1.196093,4
D,2020-08-08,D,0.137969,1.645363,3886.714264,Finance,1.301648,4
D,2020-10-07,D,-0.313696,1.334767,7919.904419,Finance,1.131453,4


In [19]:
df_low.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n,quantile
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2020-08-08,A,-0.397928,-1.078203,1821.037975,Tech,-1.108304,0
B,2020-03-21,B,0.139705,-1.491785,2421.201532,Tech,-1.365941,0
B,2020-06-29,B,-0.206383,-1.132321,4652.939637,Tech,-1.218727,0
B,2020-09-17,B,0.132766,-1.208906,9196.162961,Tech,-1.399683,0
B,2020-11-16,B,-0.111901,-1.68804,5996.942982,Tech,-1.715414,0
B,2020-12-06,B,0.475846,-1.748247,4659.537627,Tech,-1.128104,0
C,2020-07-19,C,0.023156,-1.225148,7652.020695,Finance,-0.769473,0
C,2020-08-08,C,-0.160349,-0.532045,6394.486915,Finance,-1.066234,0
C,2020-10-07,C,-0.494948,-1.009539,9713.470717,Finance,-1.198678,0
C,2020-11-16,C,0.295336,-1.38349,5804.126832,Finance,-1.494936,0


In [20]:
# Market-value weighting: a stock's weight is its market_value divided by the
# total market_value of its group (highest or lowest) on the same date.
# The weights are written to copies so the filtered frames stay untouched.
# NOTE(review): the original comment said month-end data is used — confirm
# that the rows in df_high / df_low really are month-end observations.
df_high1 = df_high.copy()
df_low1 = df_low.copy()
df_high1['weight'] = df_high['market_value'] / df_high.groupby('date')['market_value'].transform('sum')
df_low1['weight'] = df_low['market_value'] / df_low.groupby('date')['market_value'].transform('sum')

In [21]:
# Rebind df_high to a copy of the frame that carries the `weight` column.
df_high = df_high1.copy()
df_high.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n,quantile,weight
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,2020-01-01,A,0.048814,0.231436,9156.60849,Tech,0.633065,4,0.72677
A,2020-07-19,A,0.311518,0.957322,9695.786738,Tech,1.189879,4,0.580454
A,2020-09-17,A,-0.354052,1.307748,5894.545836,Tech,1.229533,4,0.495794
A,2020-10-27,A,-0.086038,0.100592,2574.445784,Tech,0.785103,4,0.242696
A,2020-12-06,A,-0.004025,0.945308,2999.053572,Tech,1.612502,4,0.233041
C,2020-04-10,C,-0.024174,1.51769,7536.367849,Finance,1.145244,4,0.499535
C,2020-04-30,C,0.388265,0.473789,4939.689221,Finance,0.905085,4,0.655352
C,2020-05-20,C,-0.298733,2.186083,4938.824657,Finance,1.196093,4,0.445787
D,2020-08-08,D,0.137969,1.645363,3886.714264,Finance,1.301648,4,0.620447
D,2020-10-07,D,-0.313696,1.334767,7919.904419,Finance,1.131453,4,0.477365


In [22]:
# Rebind df_low to a copy of the frame that carries the `weight` column.
df_low = df_low1.copy()
df_low.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,stock_id,return,factor,market_value,industry,factor_n,quantile,weight
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,2020-08-08,A,-0.397928,-1.078203,1821.037975,Tech,-1.108304,0,0.221658
B,2020-03-21,B,0.139705,-1.491785,2421.201532,Tech,-1.365941,0,0.2447
B,2020-06-29,B,-0.206383,-1.132321,4652.939637,Tech,-1.218727,0,0.322402
B,2020-09-17,B,0.132766,-1.208906,9196.162961,Tech,-1.399683,0,0.642548
B,2020-11-16,B,-0.111901,-1.68804,5996.942982,Tech,-1.715414,0,0.508169
B,2020-12-06,B,0.475846,-1.748247,4659.537627,Tech,-1.128104,0,0.614731
C,2020-07-19,C,0.023156,-1.225148,7652.020695,Finance,-0.769473,0,0.641472
C,2020-08-08,C,-0.160349,-0.532045,6394.486915,Finance,-1.066234,0,0.778342
C,2020-10-07,C,-0.494948,-1.009539,9713.470717,Finance,-1.198678,0,0.577688
C,2020-11-16,C,0.295336,-1.38349,5804.126832,Finance,-1.494936,0,0.491831


In [23]:
# Prepare the frames used for merging.
# stock_id is present both as an index level and as a column at this point, so
# the column is dropped before reset_index() turns the index levels into columns.
df_t = df_high.drop(['stock_id'],axis=1).reset_index()
df_b = df_low.drop(['stock_id'],axis=1).reset_index()
# reset_index() on the raw daily frame keeps its old row labels in an `index` column.
df_merge = df.reset_index()

In [24]:
df_t.head(30)

Unnamed: 0,stock_id,date,return,factor,market_value,industry,factor_n,quantile,weight
0,A,2020-01-01,0.048814,0.231436,9156.60849,Tech,0.633065,4,0.72677
1,A,2020-07-19,0.311518,0.957322,9695.786738,Tech,1.189879,4,0.580454
2,A,2020-09-17,-0.354052,1.307748,5894.545836,Tech,1.229533,4,0.495794
3,A,2020-10-27,-0.086038,0.100592,2574.445784,Tech,0.785103,4,0.242696
4,A,2020-12-06,-0.004025,0.945308,2999.053572,Tech,1.612502,4,0.233041
5,C,2020-04-10,-0.024174,1.51769,7536.367849,Finance,1.145244,4,0.499535
6,C,2020-04-30,0.388265,0.473789,4939.689221,Finance,0.905085,4,0.655352
7,C,2020-05-20,-0.298733,2.186083,4938.824657,Finance,1.196093,4,0.445787
8,D,2020-08-08,0.137969,1.645363,3886.714264,Finance,1.301648,4,0.620447
9,D,2020-10-07,-0.313696,1.334767,7919.904419,Finance,1.131453,4,0.477365


In [25]:
df_b.head(30)

Unnamed: 0,stock_id,date,return,factor,market_value,industry,factor_n,quantile,weight
0,A,2020-08-08,-0.397928,-1.078203,1821.037975,Tech,-1.108304,0,0.221658
1,B,2020-03-21,0.139705,-1.491785,2421.201532,Tech,-1.365941,0,0.2447
2,B,2020-06-29,-0.206383,-1.132321,4652.939637,Tech,-1.218727,0,0.322402
3,B,2020-09-17,0.132766,-1.208906,9196.162961,Tech,-1.399683,0,0.642548
4,B,2020-11-16,-0.111901,-1.68804,5996.942982,Tech,-1.715414,0,0.508169
5,B,2020-12-06,0.475846,-1.748247,4659.537627,Tech,-1.128104,0,0.614731
6,C,2020-07-19,0.023156,-1.225148,7652.020695,Finance,-0.769473,0,0.641472
7,C,2020-08-08,-0.160349,-0.532045,6394.486915,Finance,-1.066234,0,0.778342
8,C,2020-10-07,-0.494948,-1.009539,9713.470717,Finance,-1.198678,0,0.577688
9,C,2020-11-16,0.295336,-1.38349,5804.126832,Finance,-1.494936,0,0.491831


In [26]:
# Left-join the highest-group weights onto the daily data by (date, stock_id);
# daily rows without a match get NaN in `weight`.
df_daily = pd.merge(df_merge, df_t[['date','stock_id', 'weight']], on=['date', 'stock_id'], how='left')
df_daily.head(30)

Unnamed: 0,index,stock_id,date,return,factor,market_value,industry,weight
0,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech,0.72677
1,1,B,2020-01-01,0.215189,0.904397,1255.356503,Tech,
2,2,C,2020-01-01,0.102763,0.294879,4878.892942,Finance,
3,3,D,2020-01-01,0.044883,0.308864,6424.258071,Finance,
4,4,E,2020-01-01,-0.076345,0.627183,5696.041114,Energy,
5,5,F,2020-01-01,0.145894,0.505192,8894.943101,Energy,
6,6,G,2020-01-01,-0.062413,0.195869,6840.334557,Tech,
7,7,H,2020-01-01,0.391773,0.144727,8250.33131,Finance,
8,8,I,2020-01-01,0.463663,0.489497,6621.259107,Energy,
9,9,J,2020-01-01,-0.116558,0.779391,3442.434722,Tech,0.27323


In [27]:
# Left-join the lowest-group weights the same way. Both inputs have a `weight`
# column, so pandas applies its default suffixes: `weight_x` is the weight
# already present in df_daily (highest group), `weight_y` the one coming from
# df_b (lowest group).
df_daily = pd.merge(df_daily, df_b[['date','stock_id', 'weight']], on=['date', 'stock_id'], how='left')
df_daily.head(30)


Unnamed: 0,index,stock_id,date,return,factor,market_value,industry,weight_x,weight_y
0,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech,0.72677,
1,1,B,2020-01-01,0.215189,0.904397,1255.356503,Tech,,
2,2,C,2020-01-01,0.102763,0.294879,4878.892942,Finance,,
3,3,D,2020-01-01,0.044883,0.308864,6424.258071,Finance,,
4,4,E,2020-01-01,-0.076345,0.627183,5696.041114,Energy,,
5,5,F,2020-01-01,0.145894,0.505192,8894.943101,Energy,,
6,6,G,2020-01-01,-0.062413,0.195869,6840.334557,Tech,,0.508137
7,7,H,2020-01-01,0.391773,0.144727,8250.33131,Finance,,
8,8,I,2020-01-01,0.463663,0.489497,6621.259107,Energy,,0.491863
9,9,J,2020-01-01,-0.116558,0.779391,3442.434722,Tech,0.27323,


In [28]:

def fill_weight(group, posit, interval=20):
    """Hold every observed weight for ``interval`` consecutive rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single stock in chronological order, containing the column
        ``'weight' + posit``.
    posit : str
        Suffix of the weight column to fill (e.g. ``'_x'`` or ``'_y'``).
    interval : int, default 20
        Number of rows (the observed row included) over which a weight is
        held. The default reproduces the previously hard-coded length.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the weight column filled forward. Rows not
        covered by any holding window stay NaN. When two windows overlap, the
        later observed weight takes over from its own row onward. The frame
        passed in is left unmodified.
    """
    column = 'weight' + posit
    filled = group.copy()

    # Fill by position rather than by label arithmetic, so the result does
    # not depend on the index being a gap-free run of integers.
    observed = filled[column].to_numpy(dtype=float, copy=True)
    carried = observed.copy()
    for pos in np.flatnonzero(~np.isnan(observed)):
        # Read from the untouched snapshot: an earlier window must not
        # replace a weight that was actually observed later.
        carried[pos:pos + interval] = observed[pos]

    filled[column] = carried
    return filled

In [29]:
# Regroup the rows stock by stock and replace the index with a fresh integer
# range, so that each stock's rows occupy consecutive integer labels.
df_reset_index = df_daily.groupby('stock_id').apply(lambda x: x.reset_index(drop=True)).reset_index(drop=True)

In [30]:
# Carry the highest-group weights (weight_x) forward within every stock.
# NOTE: this rebinds df_grouped, which earlier held the winsorized frame.
df_grouped = df_reset_index.groupby('stock_id', group_keys=True).apply(fill_weight,posit='_x')
#df_grouped.head(30)
#a = df_grouped[df_daily['date'] == '2020-01-21']
# Inspect the rows of stock A only.
a = df_grouped[df_grouped['stock_id'] == 'A']
a.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,stock_id,date,return,factor,market_value,industry,weight_x,weight_y
stock_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech,0.72677,
A,1,10,A,2020-01-02,0.291725,0.778831,2725.837965,Tech,0.72677,
A,2,20,A,2020-01-03,0.478618,0.029114,3209.617843,Tech,0.72677,
A,3,30,A,2020-01-04,-0.235444,0.24843,3132.872291,Tech,0.72677,
A,4,40,A,2020-01-05,-0.140492,0.577297,5107.033625,Tech,0.72677,
A,5,50,A,2020-01-06,0.070197,0.533428,6357.439834,Tech,0.72677,
A,6,60,A,2020-01-07,-0.34103,0.093384,2370.408602,Tech,0.72677,
A,7,70,A,2020-01-08,0.476459,0.179648,5675.669612,Tech,0.72677,
A,8,80,A,2020-01-09,-0.182017,0.032673,5631.81241,Tech,0.72677,
A,9,90,A,2020-01-10,-0.181431,0.890631,7359.590198,Tech,0.72677,


In [31]:
# Carry the lowest-group weights (weight_y) forward within every stock.
# weight_x is not filled in this frame (see the displayed output).
df_grouped_low = df_reset_index.groupby('stock_id',group_keys=True).apply(fill_weight,posit='_y')
df_grouped_low.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,stock_id,date,return,factor,market_value,industry,weight_x,weight_y
stock_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech,0.72677,
A,1,10,A,2020-01-02,0.291725,0.778831,2725.837965,Tech,,
A,2,20,A,2020-01-03,0.478618,0.029114,3209.617843,Tech,,
A,3,30,A,2020-01-04,-0.235444,0.24843,3132.872291,Tech,,
A,4,40,A,2020-01-05,-0.140492,0.577297,5107.033625,Tech,,
A,5,50,A,2020-01-06,0.070197,0.533428,6357.439834,Tech,,
A,6,60,A,2020-01-07,-0.34103,0.093384,2370.408602,Tech,,
A,7,70,A,2020-01-08,0.476459,0.179648,5675.669612,Tech,,
A,8,80,A,2020-01-09,-0.182017,0.032673,5631.81241,Tech,,
A,9,90,A,2020-01-10,-0.181431,0.890631,7359.590198,Tech,,


In [32]:
# Bring the filled lowest-group weights into the frame that holds the filled
# highest-group weights, so both legs sit side by side.
df_grouped['weight_y'] = df_grouped_low['weight_y']

In [33]:
# Weights that are still missing after the fill are set to 0.
df_grouped[['weight_x','weight_y']] = df_grouped[['weight_x','weight_y']].fillna(0)
df_grouped.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,stock_id,date,return,factor,market_value,industry,weight_x,weight_y
stock_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0,0,A,2020-01-01,0.048814,0.535903,9156.60849,Tech,0.72677,0.0
A,1,10,A,2020-01-02,0.291725,0.778831,2725.837965,Tech,0.72677,0.0
A,2,20,A,2020-01-03,0.478618,0.029114,3209.617843,Tech,0.72677,0.0
A,3,30,A,2020-01-04,-0.235444,0.24843,3132.872291,Tech,0.72677,0.0
A,4,40,A,2020-01-05,-0.140492,0.577297,5107.033625,Tech,0.72677,0.0
A,5,50,A,2020-01-06,0.070197,0.533428,6357.439834,Tech,0.72677,0.0
A,6,60,A,2020-01-07,-0.34103,0.093384,2370.408602,Tech,0.72677,0.0
A,7,70,A,2020-01-08,0.476459,0.179648,5675.669612,Tech,0.72677,0.0
A,8,80,A,2020-01-09,-0.182017,0.032673,5631.81241,Tech,0.72677,0.0
A,9,90,A,2020-01-10,-0.181431,0.890631,7359.590198,Tech,0.72677,0.0


In [34]:
def _long_short_return(day):
    """Return of one date: sum of return * (weight_x - weight_y) over its rows."""
    net_weight = day['weight_x'] - day['weight_y']
    return (day['return'] * net_weight).sum()


# One long-short return per date, stored directly under its final column name.
daily_returns = (
    df_grouped
    .groupby('date')
    .apply(_long_short_return)
    .reset_index(name='daily_total_return')
)

# Show the daily return series.
daily_returns.head(30)

Unnamed: 0,date,daily_total_return
0,2020-01-01,-0.192715
1,2020-01-02,0.420095
2,2020-01-03,0.495008
3,2020-01-04,-0.396658
4,2020-01-05,0.098606
5,2020-01-06,-0.080043
6,2020-01-07,-0.68754
7,2020-01-08,0.452736
8,2020-01-09,-0.064156
9,2020-01-10,-0.472944


In [35]:
# Builds a GroupBy object keyed on date without aggregating or assigning it,
# so this cell only displays the object's repr.
df_grouped.groupby('date')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BD11775690>

### 指标计算

In [36]:
# Performance metrics of the long-short portfolio, annualized with 252 periods per year.
ls_returns = daily_returns['daily_total_return']
periods_per_year = 252

annualized_return = ls_returns.mean() * periods_per_year
# ddof=0 (population standard deviation) matches the default of np.std.
annualized_vol = ls_returns.std(ddof=0) * np.sqrt(periods_per_year)
sharpe_ratio = annualized_return / annualized_vol

In [37]:
annualized_return

-0.10395290007331347

In [38]:
annualized_vol

4.980086513387699

In [39]:
sharpe_ratio

-0.02087371369831879