In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import os


import warnings
warnings.filterwarnings('ignore')

# 数据读取

In [2]:
# Root folder holding every input CSV used by this notebook.
base_path = 'data'
df_zhuli = pd.read_csv(os.path.join(base_path, 'zhuli.csv'))

# Load every per-contract 1-minute file and stack them into one frame.
# The file name minus its 4-character extension is used as the contract code.
data_path = os.path.join(base_path, '1minute')
df_min = []
for path in tqdm(os.listdir(data_path)):
    try:
        df = pd.read_csv(os.path.join(data_path, path))
        # Positional rename: raises if the file does not have exactly 8 columns.
        df.columns = ['datetime', 'open', 'high', 'low', 'close', 'volume', 'open_oi', 'close_oi']
    except Exception as err:
        # Best-effort load: skip unreadable / malformed files but say why.
        # `Exception` (not a bare except) lets Ctrl-C and SystemExit propagate.
        print('wrong file: %s' % path)
        print('  reason: %r' % (err,))
        continue
    df['ts_code'] = path[:-4]
    df_min.append(df)
df_min = pd.concat(df_min)
df_min = df_min.reset_index(drop=True)

df_min['datetime'] = pd.to_datetime(df_min['datetime'], infer_datetime_format=True)
index_df = pd.read_csv(os.path.join(base_path, 'index.csv'))

  6%|▌         | 23/374 [00:00<00:13, 26.08it/s]

wrong file: CZCE.JR003.csv


100%|██████████| 374/374 [00:10<00:00, 35.45it/s]


In [3]:
# Read every per-contract daily file; the file name (minus its 4-character
# extension) becomes the contract code.
data_path = os.path.join(base_path, 'day')
daily_frames = []
for path in tqdm(os.listdir(data_path)):
    contract_daily = pd.read_csv(os.path.join(data_path, path))
    contract_daily['ts_code'] = path[:-4]
    daily_frames.append(contract_daily)
df_day = pd.concat(daily_frames)

# Some contracts are the dominant contract from their first day and therefore
# have no pre_close; fall back to that day's open.
idx = df_day['pre_close'].isna()
df_day.loc[idx, 'pre_close'] = df_day.loc[idx, 'open']

# Keep only days with volume >= 500, then rename the daily high/low columns
# to today_high/today_low.
df_day = (
    df_day[df_day['vol'] >= 500]
    .reset_index(drop=True)
    .rename(columns={'high': 'today_high', 'low': 'today_low'})
)

100%|██████████| 374/374 [00:01<00:00, 202.26it/s]


In [4]:
# Every trading day present in the daily data, ascending, plus lookup tables
# in both directions (day -> position, position -> day) and a membership set.
day_list = sorted(df_day['trade_date'].unique())
idx2day = dict(enumerate(day_list))
day2idx = {trade_day: position for position, trade_day in idx2day.items()}
day_set = set(day_list)

# Resolve the trading day a bar belongs to; bars from 21:00 to midnight count
# towards the next trading day.
def get_day(x):
    """Return the trading day (YYYYMMDD integer) that timestamp ``x`` belongs to.

    Relies on the module-level lookups ``day2idx``, ``idx2day`` and ``day_set``.

    * hour > 20: the bar belongs to the trading day that follows the
      calendar date.
    * hour < 8 on a calendar date that is not a trading day (e.g. early
      Saturday morning): the bar belongs to the trading day that follows the
      previous calendar date.
    * otherwise: the calendar date itself.

    Raises ``KeyError`` when the date being looked up is not in ``day2idx`` or
    when there is no following entry in ``idx2day``.
    """
    def _yyyymmdd(ts):
        return ts.year * 10000 + ts.month * 100 + ts.day

    day = _yyyymmdd(x)

    # Evening part of the night session: roll forward to the next trading day.
    if x.hour > 20:
        return idx2day[day2idx[day] + 1]

    # Small hours on a non-trading calendar date: step back one calendar day,
    # then roll forward to the next trading day.
    if x.hour < 8 and day not in day_set:
        previous = x - pd.Timedelta(days=1)
        return idx2day[day2idx[_yyyymmdd(previous)] + 1]

    return day

# Tag every minute bar with its trading day, then pull in the matching daily
# fields for that contract and day.
df_min['trade_date'] = df_min['datetime'].apply(get_day)
daily_fields = ['ts_code', 'trade_date', 'pre_close', 'today_high', 'today_low']
df_min = df_min.merge(df_day[daily_fields], on=['ts_code', 'trade_date'], how='left')

# Drop bars that found no pre_close in the left merge; per the original
# author's note such contract-days have almost no trading.
has_pre_close = df_min['pre_close'].notna()
df_min = df_min[has_pre_close].reset_index(drop=True)

# 'mean' is the plain average of the bar's four prices; 'rate' is its relative
# distance from pre_close.
ohlc_total = df_min['high'] + df_min['low'] + df_min['close'] + df_min['open']
df_min['mean'] = ohlc_total / 4
mean_minus_pre_close = df_min['mean'] - df_min['pre_close']
df_min['rate'] = mean_minus_pre_close / df_min['pre_close']

# index feature

In [5]:
# Name the two columns positionally first, so nothing below depends on the
# header text of index.csv.
index_df.columns = ['datetime', 'index_rate']

# Parse before sorting: ordering is then chronological rather than a
# lexicographic comparison of the raw strings.
index_df['datetime'] = pd.to_datetime(index_df['datetime'], infer_datetime_format=True)
index_df = index_df.sort_values(['datetime'], ascending=[True])

# Relative difference between the index value `day` rows earlier and the
# current row's value.
for day in [1, 2, 3, 4]:
    index_df['index_rate_shift' + str(day)] = (index_df['index_rate'].shift(day) - index_df['index_rate']) / index_df['index_rate']

# normal feature

In [6]:
# Look-back windows (in bars) for the rolling statistics and lag offsets (in
# bars) for the shifted-price features.
ROLLING_WINDOWS = [5, 10, 15, 30, 60]
LAG_STEPS = [1, 2, 3]

# Key columns followed by every feature produced below. Built once, up front,
# instead of being rebuilt inside each group iteration; later cells append to
# `col` and slice it to obtain the model's feature list, so order matters.
col = ['ts_code', 'datetime']
for day in ROLLING_WINDOWS:
    col += ['max_' + str(day), 'min_' + str(day), 'mean_' + str(day), 'volume_' + str(day)]
for day in LAG_STEPS:
    col += ['high_shift' + str(day), 'low_shift' + str(day), 'mean_shift' + str(day),
            'high_shift2' + str(day), 'low_shift2' + str(day), 'mean_shift2' + str(day)]

df_list = []
for _, g in tqdm(df_min.groupby('ts_code')):
    # Features are computed per contract over its bars in time order.
    g = g.sort_values(['datetime'], ascending=[True]).reset_index(drop=True)

    for day in ROLLING_WINDOWS:
        # Rolling statistics over the last `day` bars (current bar included).
        roll_max = g['high'].rolling(day).max()
        roll_min = g['low'].rolling(day).min()
        roll_mean = g['mean'].rolling(day).mean()
        roll_volume = g['volume'].rolling(day).sum()

        # Stored as the current bar's relative distance from each statistic.
        g['max_' + str(day)] = (g['mean'] - roll_max) / roll_max
        g['min_' + str(day)] = (g['mean'] - roll_min) / roll_min
        g['mean_' + str(day)] = (g['mean'] - roll_mean) / roll_mean
        g['volume_' + str(day)] = (g['volume'] - roll_volume) / roll_volume

    for day in LAG_STEPS:
        # Prices `day` bars back, relative to the current bar's mean price ...
        g['high_shift' + str(day)] = (g['high'].shift(day) - g['mean']) / g['mean']
        g['low_shift' + str(day)] = (g['low'].shift(day) - g['mean']) / g['mean']
        g['mean_shift' + str(day)] = (g['mean'].shift(day) - g['mean']) / g['mean']

        # ... and relative to pre_close.
        g['high_shift2' + str(day)] = (g['high'].shift(day) - g['pre_close']) / g['pre_close']
        g['low_shift2' + str(day)] = (g['low'].shift(day) - g['pre_close']) / g['pre_close']
        g['mean_shift2' + str(day)] = (g['mean'].shift(day) - g['pre_close']) / g['pre_close']

    df_list.append(g[col])

df_list = pd.concat(df_list)

100%|██████████| 338/338 [00:27<00:00, 12.16it/s]


In [7]:
# Bring the per-contract rolling/lag features back onto the minute bars.
df_min = df_min.merge(df_list, on=['ts_code', 'datetime'], how='left')

# Current bar's high / low / open expressed relative to pre_close; each new
# column is also registered in the feature list.
prev_close = df_min['pre_close']
for price_name in ['high', 'low', 'open']:
    feature_name = price_name + '_t'
    df_min[feature_name] = (df_min[price_name] - prev_close) / prev_close
    col.append(feature_name)

# Index-level features are joined purely on timestamp.
df_min = df_min.merge(index_df, on=['datetime'], how='left')

col.extend([
    'index_rate',
    'index_rate_shift1',
    'index_rate_shift2',
    'index_rate_shift3',
    'index_rate_shift4',
])

In [8]:
def get_time(x):
    """Map a bar timestamp to a minute offset within the trading session.

    The original author notes the night session runs 21:00-2:30.

    * 21:00-23:59 -> 0..179
    * 00:00-02:59 -> 180..359
    * 09:00 and later (up to 20:59) -> 330 + minutes elapsed since 09:00

    Parameters
    ----------
    x : object exposing ``hour`` and ``minute`` (e.g. ``pd.Timestamp`` or
        ``datetime.datetime``).

    Returns
    -------
    int
        Minute offset as described above.

    Raises
    ------
    ValueError
        If the hour is in 3..8, which matches none of the ranges above.
        Previously this fell through every branch and surfaced as an
        ``UnboundLocalError`` on the return statement.
    """
    h = x.hour
    if 21 <= h < 24:
        return (h - 21) * 60 + x.minute
    if 0 <= h < 3:
        return 180 + h * 60 + x.minute
    if h > 8:
        return 330 + (h - 9) * 60 + x.minute
    raise ValueError('timestamp %s (hour %d) is outside the handled session hours' % (x, h))
# Store each bar's session-minute offset (computed by get_time) in a 'time' column.
df_min['time'] = df_min['datetime'].apply(get_time)

# Make Label

In [9]:
def get_label(df_min, delta_t=20):
    """Build forward-looking labelling columns, one trading session at a time.

    Bars are grouped by ``(ts_code, trade_date)`` so that no look-ahead crosses
    a session boundary (intraday trading only), and sorted by ``datetime``
    inside each group.

    Parameters
    ----------
    df_min : pandas.DataFrame
        Must contain ``ts_code``, ``trade_date``, ``datetime``, ``mean``,
        ``low`` and ``high``. It is not modified.
    delta_t : int, default 20
        Look-ahead horizon in bars (20 bars = 20 minutes on 1-minute data).

    Returns
    -------
    pandas.DataFrame
        The six input columns plus:

        * ``mean_shift1``    - mean price of the next bar
        * ``mean_latter``    - mean price ``delta_t`` bars ahead
        * ``datetime_open``  - timestamp of the next bar (position open time)
        * ``datetime_close`` - timestamp ``delta_t`` bars ahead (position
          close time)
        * ``lowest`` / ``highest`` - min of ``low`` / max of ``high`` over the
          following ``delta_t`` bars

        Rows too close to the end of a session hold NaN/NaT in these columns.
    """
    tmp_col = ['ts_code', 'trade_date', 'datetime', 'mean', 'low', 'high']
    df_label = []
    for _, g in tqdm(df_min[tmp_col].groupby(['ts_code', 'trade_date'])):
        g = g.sort_values('datetime', ascending=True).reset_index(drop=True)
        g['mean_shift1'] = g['mean'].shift(-1)
        g['mean_latter'] = g['mean'].shift(-delta_t)
        g['datetime_open'] = g['datetime'].shift(-1)
        g['datetime_close'] = g['datetime'].shift(-delta_t)
        # A window ending delta_t bars ahead covers bars t+1 .. t+delta_t.
        g['lowest'] = g['low'].rolling(delta_t).min().shift(-delta_t)
        g['highest'] = g['high'].rolling(delta_t).max().shift(-delta_t)
        df_label.append(g)
    return pd.concat(df_label)

# Labels are cached on disk. NOTE: the cache is reused whenever the file
# exists, so delete it after changing get_label or the input data.
label_path = os.path.join(base_path, 'label.csv')
if os.path.exists(label_path):
    df_label = pd.read_csv(label_path)
else:
    df_label = get_label(df_min)
    df_label.to_csv(label_path, index=False)

# A CSV round-trip turns timestamps into strings, while a fresh get_label run
# keeps them as datetimes. Normalize all three timestamp columns so both paths
# give the same dtypes (previously only 'datetime' was converted).
for ts_col in ['datetime', 'datetime_open', 'datetime_close']:
    df_label[ts_col] = pd.to_datetime(df_label[ts_col], infer_datetime_format=True)

# Enter at the next bar's mean price, exit at the mean price delta_t bars ahead:
# label = exit price is not below entry price; return = relative change.
df_label['label'] = df_label['mean_latter'] >= df_label['mean_shift1']
df_label['return'] = (df_label['mean_latter'] - df_label['mean_shift1']) / df_label['mean_shift1']
df_min = df_min.merge(df_label[['datetime', 'ts_code', 'label', 'return', 'datetime_open', 'datetime_close']], on=['datetime', 'ts_code'], how='left')

In [10]:
# Discard every bar that has a missing value in any column.
df_min = df_min.dropna()

# train

In [11]:
# Assumption: a minute bar whose low equals its high, with that price sitting
# at the day's low or the day's high, is treated as limit-up / limit-down.
flat_bar = df_min['low'] == df_min['high']
at_day_extreme = (df_min['low'] == df_min['today_low']) | (df_min['high'] == df_min['today_high'])
idx = flat_bar & at_day_extreme
df_min['limit'] = idx.astype('int64')

In [12]:
# The first two entries of `col` are the join keys ('ts_code', 'datetime');
# everything after them is a model input.
key_count = 2
feature_col = col[key_count:]
print(len(feature_col))

46


In [13]:
label_col = 'label'

# Inclusive trade_date bounds (YYYYMMDD integers) for each split.
trn_date_min, trn_date_max = 20190101, 20190901
test_date_min, test_date_max = 20190902, 20200501

# Training rows with a return of exactly zero are excluded (they may be
# limit-up / limit-down bars); test rows exclude bars flagged as limit bars.
in_trn_window = df_min['trade_date'].between(trn_date_min, trn_date_max)
in_test_window = df_min['trade_date'].between(test_date_min, test_date_max)
trn_idx = in_trn_window & (df_min['return'] != 0)
test_idx = in_test_window & (df_min['limit'] == 0)

trn = df_min.loc[trn_idx, feature_col].values
trn_label = df_min.loc[trn_idx, label_col].values

test = df_min.loc[test_idx, feature_col].values
test_label = df_min.loc[test_idx, label_col].values

In [14]:
# Class balance of the training labels and the size of both splits.
n_trn = len(trn_label)
num_1 = np.sum(trn_label)
num_0 = n_trn - num_1
print('0:%d, 1:%d' % (num_0, num_1))
print('rate 0:%.3f, rate 1:%.3f' % (num_0 / n_trn, num_1 / n_trn))
print('len trn:%d, len test:%d' % (len(trn), len(test)))

0:1093408, 1:1090112
rate 0:0.501, rate 1:0.499
len trn:2183520, len test:2065200


In [27]:
# Model training (timed)
import lightgbm as lgb
import time
from sklearn import metrics

t1 = time.time()
param = {'num_leaves': 31,
         'min_data_in_leaf': 20,
         'objective': 'binary',
         'learning_rate': 0.06,
         "boosting": "gbdt",
         "metric": 'None',
         "verbosity": -1}
trn_data = lgb.Dataset(trn, trn_label)

# Number of boosting rounds actually trained. The previous `num_round = 888`
# was never passed to lgb.train, which received a literal 400 instead.
num_round = 400

# `verbose_eval` is no longer passed: it only controlled logging of
# validation metrics (no valid_sets are given here) and the keyword was
# removed from lgb.train in LightGBM 4.0.
clf = lgb.train(param, trn_data, num_round)
t2 = time.time()
print(t2-t1)

60.926990270614624


In [28]:
# Pair each feature name with the importance reported by the trained booster,
# and show the largest first.
importance_table = pd.DataFrame({
    'column': feature_col,
    'importance': clf.feature_importance(),
})
importance_table.sort_values(by='importance', ascending=False)

Unnamed: 0,column,importance
41,index_rate,2174
45,index_rate_shift4,740
16,max_60,670
17,min_60,663
18,mean_60,596
42,index_rate_shift1,499
43,index_rate_shift2,483
44,index_rate_shift3,472
12,max_30,472
13,min_30,365


# 模型保存

In [29]:
import pickle

save_path = 'model/model.pickle'
# os.path has no mkdir(); the old call raised AttributeError whenever the
# folder was missing. makedirs(..., exist_ok=True) creates it only if needed.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
    pickle.dump(clf, f)

# 结果分析

In [35]:
# Take an explicit copy of the test rows so the prediction column is written
# to an independent frame rather than to a slice of df_min.
result_cols = ['datetime', 'ts_code', label_col, 'return', 'mean', 'datetime_open', 'datetime_close']
test_df = df_min.loc[test_idx, result_cols].copy()
test_lgb = clf.predict(test, num_iteration=clf.best_iteration)
test_df['pred'] = test_lgb

In [36]:
# Long side: rows whose predicted score exceeds 0.8.
test_df['pred2'] = test_df['pred'] > 0.8
called_up = test_df['pred2']
tp = np.sum(called_up & (test_df['return'] > 0))
pp = np.sum(called_up)
# The 0.001 in the denominator keeps the ratio defined when nothing is selected.
precison1 = tp / (pp + 0.001)
print('tp:%d, pp:%d, precision1: %.4f' % (tp, pp, precison1))
# Displayed value: average return of the selected rows.
test_df.loc[called_up, 'return'].mean()

tp:1128, pp:2067, precision1: 0.5457


0.0007325614776948953

In [37]:
# Short side: rows whose predicted score is NOT above 0.2.
test_df['pred2'] = test_df['pred'] > 0.2
called_down = ~test_df['pred2']
tn = np.sum(called_down & (test_df['return'] < 0))
pn = np.sum(called_down)
# The 0.001 in the denominator keeps the ratio defined when nothing is selected.
precison2 = tn / (pn + 0.001)
print('tn:%d, pn:%d, precision2: %.4f' % (tn, pn, precison2))
# Displayed value: average return of the selected rows.
test_df.loc[called_down, 'return'].mean()

tn:62, pn:83, precision2: 0.7470


-0.0015078313189174908

In [39]:
# Save the per-row test predictions next to the input data (no index column).
result_path = os.path.join(base_path, 'result.csv')
test_df.to_csv(result_path, index=None)