In [1]:
import sys
sys.path.append('/home/wangs/rs/lib')
import ff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import Pool
from tqdm import tqdm

In [2]:
from scipy.stats import pearsonr,spearmanr
from gplearn.genetic import SymbolicRegressor,SymbolicTransformer
from gplearn import fitness
from gplearn.functions import make_function
from gplearn.fitness import make_fitness

In [3]:
feature = pd.read_pickle('/home/wangs/data/check/all_status.pk').loc['20200104':'20230101']
fields = list(feature.columns)
feature = feature.rank()/feature.count()
feature['test_date'] = feature.index
test_date = feature.index

In [4]:
function_set = ['add','sub','mul','div','sqrt','log','inv','abs','neg']

In [5]:
close_ = ff.read('close').loc[:,'20210104':'20230101']
ret = (close_ - close_.shift(axis = 1))/close_.shift(axis = 1)
event = ff.read('Gpalpha014').loc[:,'20210104':'20230101']

In [6]:
rolling_days = [5]
quantile_ratio = [1/5,1/3,1/2,2/3]
stock_num = [10,20,50,100]
paras = [(day,ratio,num) for day in rolling_days for ratio in quantile_ratio for num in stock_num]

In [7]:
import statsmodels.api as sm
best_dict = {}
def open_day_matrix(y,y_pred,w=None):
    best_rate = 0
    best_para = (0,0,0)
    if len(np.unique(y_pred))<=2:
        return 0
    else:
        data=pd.DataFrame({'y_pred':y_pred,'date':test_date})
        ts = pd.Series(data.loc[:,'y_pred'].values,index = data['date'])
        for para in paras:
            rolling_day,ratio,num = para
            open_day = ((ts < ts.rolling(rolling_day,closed = 'left').quantile(ratio)).loc['20210104':'20230101']).replace({True:1.0,False:0})
            if (open_day.sum() == 0):
                return 0
            ret_all = []
            for day in event.columns:
                buy_stock =  (event*ff.filter1.loc[:,'20210104':'20230101'])[day].dropna().sort_values().head(num).index
                ret_all.append(ret.shift(-1,axis = 1).loc[buy_stock][day].mean())
            ret_all[-1] = 0.0
            if ff.cal_returns(ret_all * open_day)['收益回撤比'] > best_rate:
                best_rate = ff.cal_returns(ret_all * open_day)['收益回撤比']
                best_para = para
        best_dict[best_rate] = best_para
        return best_rate

In [8]:
open_day_fc=make_fitness(function=open_day_matrix,greater_is_better=True)

In [9]:
test_gp=SymbolicTransformer(feature_names = fields,
                            function_set = function_set, #所有算子
                            generations = 4, #进化代数
                            population_size = 1000, #种群规模
                            tournament_size = 10, #竞标赛规模
                            p_crossover=0.4,
                            p_subtree_mutation=0.05,
                            p_hoist_mutation=0.01,
                            p_point_mutation=0.03,
                            p_point_replace=0.35,
                            init_depth=(1,4),
                            const_range = None,
                            metric=open_day_fc,
                            parsimony_coefficient = 'auto',
                            low_memory=True,
                            verbose=2,
                            n_jobs = 6,
                           stopping_criteria=30.0)

In [10]:
test_gp.fit(np.nan_to_num(feature.loc[:,fields].values),pd.Series(0,feature.index).values)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed: 601.2min remaining: 1202.3min


   0     4.69          2.12546        3          8.14443              N/A   1993.72m


[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 664.6min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed: 854.5min remaining: 1709.0min


   1     4.52          3.85059        4          9.68625              N/A   1760.09m


[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 880.0min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed: 679.3min remaining: 1358.7min


   2     4.50          5.39173        4          9.68625              N/A    705.48m


[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 705.5min finished
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed: 757.4min remaining: 1514.8min


   3     3.92          6.51836        4          9.68625              N/A      0.00s


[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 776.3min finished


In [11]:
feature_2 = pd.read_pickle('/home/wangs/data/check/all_status.pk').loc['20200104':'20230101']
fields = list(feature_2.columns)
feature_2 = feature_2.rank()/feature_2.count()

In [12]:
for i, program in enumerate(test_gp._best_programs):
    print(f'Program {i+1}: {program}{program.raw_fitness_}')
    rolling_day,ratio,num = best_dict[program.raw_fitness_]
    transform_X = program.execute(np.nan_to_num(feature_2.loc[:,fields].values))
    ts = pd.Series(transform_X,ff.read('Gpalpha014').loc[:,'20200104':'20230101'].columns)
    open_day = ((ts < ts.rolling(rolling_day,closed = 'left').quantile(ratio)).loc['20210104':'20230101']).replace({True:1.0,False:0})
    ret_all = []
    for day in event.columns:
        buy_stock =  (event*ff.filter1.loc[:,'20210104':'20230101'])[day].dropna().sort_values().head(num).index
        ret_all.append(ret.shift(-1,axis = 1).loc[buy_stock][day].mean())
    ret_all[-1] = 0.0
    if i < 3:
        (ret_all * open_day.loc['20210104':'20230101']).cumsum().plot()

Program 1: div(log(low60_rate), high60_rate)9.686249800199167


KeyError: 9.686249800199167

In [None]:
feature.loc[:,fields]

In [None]:
feature = pd.read_pickle('/home/wangs/data/check/all_status.pk')
data = feature['PB'].rank()/feature['PB'].sum() + feature['PE'].rank()/feature['PE'].sum()

In [None]:
data.loc['20200101':'20210301'] > data.loc['20200101':'20210101'].quantile(0.3)

In [None]:
data.loc['20200101':'20210101'].quantile(0.7)

In [None]:
data.plot()

In [None]:
((data > data.rolling(5,closed = 'left').quantile(2/3)).loc['20210104':'20240101']).replace({True:1.0,False:0})

In [None]:
rolling_day = [5,10,20,50,100]

In [None]:
test_day_index = data.loc['20210104':'20240101'].index

In [None]:
data > data.rolling(30).quantile(0.7)

In [None]:
feature['ILQ_type_raw'].quantile(1/5)