In [1]:
print('Hello')

Hello


## 采集数据 

In [2]:
!pip install efinance
!pip install scikit-learn

Collecting efinance
  Using cached efinance-0.5.2-py3-none-any.whl.metadata (41 kB)
Collecting rich (from efinance)
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting jsonpath (from efinance)
  Using cached jsonpath-0.82.2-py3-none-any.whl
Collecting pandas (from efinance)
  Downloading pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting tqdm (from efinance)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting retry (from efinance)
  Using cached retry-0.9.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking (from efinance)
  Using cached multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting bs4 (from efinance)
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting tzdata>=2022.7 (from pandas->efinance)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting py<2.0.0,>=1.4.26 (from retry->efinance)
  Using cached py-1.11.0-py2.py3-none-any.whl.metadata (2.8 kB

In [12]:
from typing import Callable

import efinance as ef
import pandas as pd
import sklearn.metrics

pd.set_option('display.max_columns', None)


def collect_future_day_history(deal_func: Callable):
    # 1.收集行情ID
    base_info = ef.futures.get_futures_base_info()
    cod_list = list(
        base_info[(base_info['期货名称'].str.contains('主')) & (~base_info['期货名称'].str.contains('次'))]['行情ID'])

    # 2.收集历史
    history_dict = ef.futures.get_quote_history(cod_list)

    # 2.1 处理单个历史
    for k, v in history_dict.items():
        history_dict[k] = deal_func(v)

    # 3. 合并历史
    result = pd.concat(history_dict.values())

    # 4.返回结果
    return result


def ft(df: pd.DataFrame):
    col_name = '次日涨幅'
    df[col_name] = (df['收盘'].shift(-1) - df['收盘']) / df['收盘'] * 100
    df = df[df[col_name].notna()]
    return df

In [52]:
data = collect_future_day_history(ft)

data_dict = {
    'train': data[data['日期'] < '2024'],
    'test': data[data['日期'] >= '2024'],
}

del data

  0%|          | 0/81 [00:00<?, ?it/s]

## 处理数据

In [53]:
data_dict['train'].head()

Unnamed: 0,期货名称,期货代码,日期,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌幅,涨跌额,换手率,次日涨幅
0,棉纱主连,CYM,2017-08-21,23005.0,22985.0,23080.0,22910.0,12544,1443187000.0,0.0,0.0,0.0,0.0,0.413313
1,棉纱主连,CYM,2017-08-22,23005.0,23080.0,23095.0,22985.0,6891,794015500.0,0.48,0.3,70.0,0.0,-0.043328
2,棉纱主连,CYM,2017-08-23,23080.0,23070.0,23155.0,23045.0,4539,524027600.0,0.48,0.11,25.0,0.0,-0.086693
3,棉纱主连,CYM,2017-08-24,23070.0,23050.0,23120.0,23010.0,2255,260057900.0,0.48,-0.17,-40.0,0.0,0.0
4,棉纱主连,CYM,2017-08-25,23040.0,23050.0,23095.0,23030.0,1506,173641800.0,0.28,-0.07,-15.0,0.0,0.10846


In [54]:
def deal_data(dd, func_list):
    for k in dd:
        for func in func_list:
            dd[k] = func(dd[k], k)


def make_flag(odf, k):
    df = odf.copy()
    df.loc[df['次日涨幅'] > 0, '次日涨幅'] = 1
    df.loc[df['次日涨幅'] <= 0, '次日涨幅'] = 0
    df['次日涨幅'] = df['次日涨幅'].astype(int)
    return df


def make_ft(odf, k):
    df = odf.copy()

    df['价格中间线'] = (df['收盘'] - df['开盘']) / 2
    df['开盘-价格中间线'] = df['开盘'] - df['价格中间线']
    df['收盘-价格中间线'] = df['收盘'] - df['价格中间线']
    df['开盘-价格中间线@收盘_幅'] = df['开盘-价格中间线'] / df['开盘']
    df['收盘-价格中间线@收盘_幅'] = df['收盘-价格中间线'] / df['收盘']

    df['收盘_上攀幅'] = (df['最高'] - df['收盘']) / df['收盘']
    df['收盘_下攀幅'] = (df['最低'] - df['收盘']) / df['收盘']
    df['收盘_上下攀幅差'] = df['收盘_上攀幅'] - df['收盘_下攀幅']

    df['开盘_上攀幅'] = (df['最高'] - df['开盘']) / df['开盘']
    df['开盘_下攀幅'] = (df['最低'] - df['开盘']) / df['开盘']
    df['开盘_上下攀幅差'] = df['开盘_上攀幅'] - df['开盘_下攀幅']

    df['收盘_上攀幅/成交量'] = df['收盘_上攀幅'] / df['成交量']
    df['收盘_下攀幅/成交量'] = df['收盘_下攀幅'] / df['成交量']

    df['振幅/涨跌幅'] = df['振幅'] / df['涨跌幅']

    df['收盘_上攀幅/涨跌幅'] = df['收盘_上攀幅'] / df['涨跌幅']
    df['收盘_下攀幅/涨跌幅'] = df['收盘_下攀幅'] / df['涨跌幅']
    df['收盘_上下攀幅差/涨跌幅'] = df['收盘_上下攀幅差'] / df['涨跌幅']

    df['开盘_上攀幅/涨跌幅'] = df['开盘_上攀幅'] / df['涨跌幅']
    df['开盘_下攀幅/涨跌幅'] = df['开盘_下攀幅'] / df['涨跌幅']
    df['开盘_上下攀幅差/涨跌幅'] = df['开盘_上下攀幅差'] / df['涨跌幅']

    return df


def clean_data(odf, k):
    df = odf.copy()
    del_col_list = ['期货代码', '日期', '换手率']
    for del_col in del_col_list:
        del df[del_col]
    return df


def make_category(odf, k):
    df = odf.copy()
    for object_col in df.select_dtypes(include=['object']).columns.to_list():
        df[object_col] = df[object_col].astype('category')
    return df

In [55]:
deal_data(data_dict, [make_flag, clean_data, make_ft, make_category])

In [56]:
data_dict['train'].shape

(173883, 31)

## 使用数据

In [57]:
from typing import Dict, Any, List
import sklearn
import lightgbm as lgb


def train(
        x: pd.DataFrame,
        y: pd.Series,
        n_folds: int = 5,
        params: dict = None,
        feval: Callable = None,
        num_boost_round: int = 100,
        seed: int = 2024,
):
    if params is None:
        params = {}
    result = {}
    stratified_k_fold = sklearn.model_selection.StratifiedKFold(n_folds, shuffle=True, random_state=seed)
    for k, (train_idx, val_idx) in enumerate(stratified_k_fold.split(x, y)):
        # 获取本折数据
        print(f'------------ {k} ------------')
        object_cols = x.select_dtypes(include=['category']).columns.to_list()
        all_data = lgb.Dataset(x, y)
        train_data = lgb.Dataset(x.iloc[train_idx], y.iloc[train_idx], categorical_feature=object_cols)
        val_data = lgb.Dataset(x.iloc[val_idx], y.iloc[val_idx], categorical_feature=object_cols)
        # 开始训练并记录训练数据
        eval_result = {}
        gbm = lgb.train(
            params=params,
            train_set=train_data,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'val'],
            feval=feval if feval is not None else None,
            callbacks=[lgb.log_evaluation(int(num_boost_round / 5)), lgb.record_evaluation(eval_result)],
            num_boost_round=num_boost_round,
        )
        best_score = dict(gbm.best_score['val'])
        print(f'best-best_iteration:[{gbm.best_iteration}], best-score[{best_score}]]')
        result[k] = {
            'gbm': gbm,
            'eval': eval_result,
            'feature_importance': pd.DataFrame({'feature_name': gbm.feature_name(), f'{k}': gbm.feature_importance()})
        }
    return result


def show_importance(tr: Dict):
    result = pd.DataFrame()
    result['feature_name'] = tr[0]['feature_importance']['feature_name']
    for k, v in tr.items():
        result = pd.merge(result, v['feature_importance'], on='feature_name', how='left')
    result['feature_importance'] = result[[str(x) for x in range(len(tr))]].mean(axis=1)
    result.sort_values(by='feature_importance', inplace=True, ascending=False)
    return result


def pred(
        tr: Dict[str, Any],
        x: pd.DataFrame,
        y: pd.Series = None,
        n_folds: int = 5,
        check_func_list: List[Callable] = None,
):
    result = x.copy()
    result['pred-mean'] = 0
    for k, v in tr.items():
        result['pred-mean'] += (v['gbm'].predict(x) / n_folds)
    if y is not None:
        result['real'] = y
        if check_func_list is not None:
            for check_func in check_func_list:
                print('回测结果', check_func(result['pred-mean'], result['real']))
    return result


def roc_auc(y_pred, y_true):
    func_name = 'ROC_AUC'
    score = sklearn.metrics.roc_auc_score(y_true, y_pred)
    is_higher_better = True
    return func_name, score, is_higher_better


def ks(y_pred, y_true):
    func_name = 'KS'
    fpr, tpr, thr = sklearn.metrics.roc_curve(y_true, y_pred)
    score = max(tpr - fpr)
    is_higher_better = True
    return func_name, score, is_higher_better


In [58]:
kn = 5

train_result = train(
    x=data_dict['train'].iloc[:, ~data_dict['train'].columns.isin(['次日涨幅'])],
    y=data_dict['train']['次日涨幅'],
    n_folds=kn,
    params={
        'objective': 'binary',
        'metric': ['auc'],
        'verbose': -1,
        'n_jobs': 10,
        'learning_rate': 0.01,
        'early_stopping_round': 300,
    },
    num_boost_round=1000,
)

display(show_importance(train_result))

pred_result = pred(
    tr=train_result,
    x=data_dict['test'].iloc[:, ~data_dict['test'].columns.isin(['次日涨幅'])],
    y=data_dict['test']['次日涨幅'],
    n_folds=kn,
    check_func_list=[roc_auc, ks]
)

------------ 0 ------------
[200]	train's auc: 0.580975	val's auc: 0.541837
[400]	train's auc: 0.600552	val's auc: 0.54261
[600]	train's auc: 0.617585	val's auc: 0.542241
best-best_iteration:[325], best-score[{'auc': 0.5429455201127424}]]
------------ 1 ------------
[200]	train's auc: 0.580198	val's auc: 0.539485
[400]	train's auc: 0.598672	val's auc: 0.54049
[600]	train's auc: 0.615452	val's auc: 0.540627
[800]	train's auc: 0.631317	val's auc: 0.54079
[1000]	train's auc: 0.645034	val's auc: 0.540445
best-best_iteration:[758], best-score[{'auc': 0.5409610463036226}]]
------------ 2 ------------
[200]	train's auc: 0.580844	val's auc: 0.539929
[400]	train's auc: 0.602184	val's auc: 0.540043
[600]	train's auc: 0.619107	val's auc: 0.539664
best-best_iteration:[303], best-score[{'auc': 0.54033460319239}]]
------------ 3 ------------
[200]	train's auc: 0.581096	val's auc: 0.538222
[400]	train's auc: 0.601632	val's auc: 0.538742
[600]	train's auc: 0.61835	val's auc: 0.538944
[800]	train's auc

Unnamed: 0,feature_name,0,1,2,3,4,feature_importance
0,期货名称,2486,3818,2502,3235,4543,3316.8
8,涨跌幅,613,1320,568,721,1559,956.2
22,收盘_下攀幅/成交量,449,1166,473,851,1562,900.2
16,收盘_下攀幅,402,1209,387,839,1452,857.8
15,收盘_上攀幅,529,957,524,742,1522,854.8
18,开盘_上攀幅,453,1113,446,606,1569,837.4
21,收盘_上攀幅/成交量,395,957,310,696,1352,742.0
6,成交额,387,1000,335,624,1327,734.6
13,开盘-价格中间线@收盘_幅,312,859,324,624,1160,655.8
19,开盘_下攀幅,327,896,300,589,1143,651.0


回测结果 ('ROC_AUC', 0.5275127055771809, True)
回测结果 ('KS', 0.04548913889018613, True)


In [62]:
pred_result['pred-mean'].max()

0.6550313340857825