<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#作戦メモ" data-toc-modified-id="作戦メモ-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>作戦メモ</a></span></li><li><span><a href="#前処理" data-toc-modified-id="前処理-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>前処理</a></span></li><li><span><a href="#優秀なトレーニング期間の探索" data-toc-modified-id="優秀なトレーニング期間の探索-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>優秀なトレーニング期間の探索</a></span></li></ul></div>

# 概要
アンサンブルモデルで予測する。  
銘柄コードを使わない。

In [1]:
import numpy as np
import lightgbm as lgb
import pandas as pd
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import random
from datetime import datetime, date
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import time
from joblib import Parallel, delayed
from tqdm import tqdm
from dateutil.parser import parse
import gc

import datetime
from dateutil.relativedelta import relativedelta
from IPython import embed

In [2]:
# Data acquisition: create the competition environment object.
# NOTE(review): this import repeats the one already present in the imports cell.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()
print('Done!')

Loading the data... This could take a minute.
Done!
Done!


In [3]:
# NOTE(review): `_var07` / `_var10` look like placeholder attribute names; presumably these are the
# market and news training frames exposed by the environment — confirm against the environment API.
market_train_df = env._var07
news_train_df = env._var10

In [4]:
config_model = {"n_lag":[3,7,14], # list of window lengths used for the time-series (lag) features
                
                # list of columns the time-series (lag) features are computed from
                "return_features" : ['returnsClosePrevMktres10','returnsClosePrevRaw10','open','close'],
                
                "validation_term" : ['2017-01-01', '2018-12-31'],
                
                "model_params": [{ # parameter set 1
                                    'task': 'train',
                                    'boosting_type': 'gbdt',
                                    'objective': 'binary',
                                    'learning_rate': 0.05,
                                    'num_leaves': 2400,
                                    'min_data_in_leaf': 150,
                                    'num_iteration': 1000,
                                    'max_bin': 200,
                                    'verbose': 1,
                                    'metric' : 'None'  # use only the custom evaluation function
                                },
                                { # parameter set 2
                                    'task': 'train',
                                    'boosting_type': 'gbdt',
                                    'objective': 'binary',
                                    'learning_rate': 0.048,
                                    'num_leaves': 2300,
                                    'min_data_in_leaf': 150,
                                    'num_iteration': 1000,
                                    'max_bin': 200,
                                    'verbose': 1,
                                    'metric' : 'None'  # use only the custom evaluation function
                                }]
               }


# 前処理

In [5]:
# Replace the 'time' column with its `.dt.date` values so later cells can compare it with date objects.
# NOTE(review): presumably not re-runnable — after one run the column no longer holds datetimes; confirm.
market_train_df['time'] = market_train_df['time'].dt.date

In [6]:
def create_lag(df_code,n_lag, return_features):
    """Build shifted rolling-window statistics for one group of rows.

    For every column in ``return_features`` and every window length in
    ``n_lag``, the column is shifted down by one row (so the current row is
    excluded from its own window) and the rolling mean, max and min are taken.

    Parameters
    ----------
    df_code : pandas.DataFrame
        Rows to process; must contain 'time', 'assetCode' and every column
        named in ``return_features``. The caller in this notebook passes one
        frame per asset code.
    n_lag : iterable of int
        Rolling window lengths.
    return_features : iterable of str
        Columns to compute the statistics for.

    Returns
    -------
    pandas.DataFrame
        Columns 'time', 'assetCode' and '<col>_lag_<window>_{mean,max,min}',
        with missing values (e.g. rows without a full window) replaced by -1.
    """
    # Plain dict of columns so the frame is built with a single constructor call.
    columns = {"time": df_code["time"].values,
               "assetCode": df_code["assetCode"].values}
    shift_size = 1
    for col in return_features:
        # The shift does not depend on the window, so do it once per column.
        shifted = df_code[col].shift(shift_size)
        for window in n_lag:
            rolled = shifted.rolling(window=window)
            # NOTE(review): a rolling standard deviation used to be computed here
            # but was never stored; it is left out so the feature set is unchanged.
            columns['%s_lag_%s_mean'%(col,window)] = rolled.mean()
            columns['%s_lag_%s_max'%(col,window)] = rolled.max()
            columns['%s_lag_%s_min'%(col,window)] = rolled.min()
    return pd.DataFrame.from_dict(columns).fillna(-1)


def generate_lag_features(df,n_lag, return_features):
    """Compute the lag features for every asset code and stack the results.

    ``df`` is grouped by 'assetCode'; each group is handed to ``create_lag``
    through joblib (``n_jobs=-1``) with a tqdm progress bar, and the per-group
    frames are concatenated into one frame that is returned.
    """
    jobs = []
    for _asset_code, asset_rows in df.groupby('assetCode'):
        jobs.append(delayed(create_lag)(asset_rows, n_lag, return_features))
    per_asset_frames = Parallel(n_jobs=-1)(tqdm(jobs))
    combined = pd.concat(per_asset_frames)
    del df
    gc.collect()
    return combined

def data_prep(market_train):
    """Add an integer asset-code column and drop rows with missing values.

    Each distinct 'assetCode' gets the position of its first appearance as an
    id, written to a new 'assetCodeT' column on the frame that was passed in.
    The returned frame is that frame with every row containing a missing
    value removed.
    """
    unique_codes = market_train['assetCode'].unique()
    code_to_id = dict(zip(unique_codes, range(len(unique_codes))))
    market_train['assetCodeT'] = market_train['assetCode'].map(code_to_id)
    return market_train.dropna(axis=0)

def mis_impute(data):
    """Fill missing values column by column, in place, and return the frame.

    Columns whose dtype compares equal to "object" get "other" for missing
    entries; "int64" and "float64" columns get their own mean. Columns of any
    other dtype are left untouched.
    """
    for name in data.columns:
        kind = data[name].dtype
        if kind == "object":
            # text-like column: missing entries become the literal "other"
            data[name] = data[name].fillna("other")
            continue
        if kind == "int64" or kind == "float64":
            # numeric column: missing entries become the column mean
            data[name] = data[name].fillna(data[name].mean())
    return data

def get_feature_cols(df):
    """
    Return the columns of ``df`` to use as model features: every column, in
    its original order, that is not one of the excluded names below.
    """
    excluded = {
        'assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'audiences',
        'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'time_x', 'provider',
        'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time', 'universe', 'sourceTimestamp',
    }
    return [name for name in df.columns if name not in excluded]

In [7]:
# Compute the lag features, left-join them onto the market frame by ('time', 'assetCode'),
# then impute the remaining missing values.
new_df = generate_lag_features(market_train_df,
                               n_lag=config_model['n_lag'], 
                               return_features=config_model['return_features'])
market_train_df = pd.merge(market_train_df,new_df,how='left',on=['time','assetCode'], copy=False)
market_train_df = mis_impute(market_train_df)

# Let's try running without the asset code
# market_train_df = data_prep(market_train_df)

#up = market_train_df['returnsOpenNextMktres10'] >= 0

# Put the universe values (rows where it is not 1 cannot be used as training data) into `universe`
# But why the values?
#universe = market_train_df['universe'].values


100%|██████████| 4125/4125 [00:58<00:00, 79.08it/s]


In [8]:
# Save memory: store every float64 column as float32
_float64_cols = [_c for _c in market_train_df.columns
                 if market_train_df[_c].dtype == np.float64]
for _c in _float64_cols:
    market_train_df[_c] = market_train_df[_c].astype(np.float32)
gc.collect()

14

In [9]:
# Feature columns
feature_cols = get_feature_cols(market_train_df)

# Index labels of the rows whose date lies inside the validation term (both ends inclusive)
_valid_start = parse(config_model['validation_term'][0]).date()
_valid_end = parse(config_model['validation_term'][1]).date()
_in_validation_term = (market_train_df['time'] >= _valid_start) & (market_train_df['time'] <= _valid_end)
idx_validate = market_train_df[_in_validation_term].index

In [10]:
# Prepare the validation data
_valid_rows = market_train_df.loc[idx_validate]
X_valid = _valid_rows[feature_cols].values
up_valid = _valid_rows['returnsOpenNextMktres10'] > 0
# Arrays the custom metric needs travel with the dataset through its `params` dict.
_valid_side_info = {"return": _valid_rows['returnsOpenNextMktres10'].values,
                    "time": _valid_rows['time'].values,
                    "universe": _valid_rows['universe'].values}
validation_data = lgb.Dataset(X_valid, label=up_valid.astype(int), free_raw_data=False,
                              params=_valid_side_info)
del _valid_rows
gc.collect()

0

In [11]:
def get_training_data(market_train_df, idx_training, feature_cols):
    """
    Select the rows labelled ``idx_training`` and return a 2-tuple:
    the ``feature_cols`` values as an array, and a boolean Series that is
    True where 'returnsOpenNextMktres10' is greater than 0.
    """
    training_rows = market_train_df.loc[idx_training]
    X_train = training_rows[feature_cols].values
    up_train = training_rows['returnsOpenNextMktres10'] > 0
    return (X_train, up_train)

def eval_sigma_score(preds, valid_set):
    """
    Custom evaluation function: compute sigma_score from the predictions and
    the validation set.

    Each row contributes (2 * pred - 1) * universe * return, the contributions
    are summed per 'time' value, and the score is the mean of those sums
    divided by their standard deviation. The 'time', 'universe' and 'return'
    arrays are read from ``valid_set.params``.

    Returns the tuple ('sigma_score', score as np.float32, True).
    """
    side = valid_set.params
    contribution = (2 * preds - 1) * side['universe'] * side['return']
    per_row = pd.DataFrame.from_dict({"time": side['time'], "x_t_i": contribution})
    daily_totals = per_row.groupby("time")["x_t_i"].sum().values
    score = daily_totals.mean() / daily_totals.std()
    return ('sigma_score', np.float32(score), True)

def build_models(model_params, train_data, validation_data):
    """
    Train one model per parameter set, evaluating on ``validation_data`` with
    the custom sigma_score metric and early stopping.

    Returns a list with one entry per parameter set:
    [model, best iteration, highest recorded sigma_score, evaluation history].
    """
    def _fit(params):
        history = {}
        booster = lgb.train(params, train_data,
                            num_boost_round=200,
                            valid_sets=validation_data,
                            early_stopping_rounds=10,
                            feval=eval_sigma_score,
                            evals_result=history)
        # Best boosting round and best score seen on the validation set
        top_score = np.max(history['valid_0']['sigma_score'])
        return [booster, booster.best_iteration, top_score, history]

    return [_fit(params) for params in model_params]

In [12]:
ensemble_model_config = [{'training_term': [datetime.date(2015, 9, 20), datetime.date(2016, 9, 20)],
  'boost_rounds': [54, 20],
  'best_score': [0.6928928, 0.6779268]},
 {'training_term': [datetime.date(2012, 4, 22), datetime.date(2013, 4, 22)],
  'boost_rounds': [17, 11],
  'best_score': [0.68496335, 0.678296]},
 {'training_term': [datetime.date(2012, 1, 29), datetime.date(2013, 1, 29)],
  'boost_rounds': [6, 5],
  'best_score': [0.68295014, 0.6830031]},
 {'training_term': [datetime.date(2015, 10, 4), datetime.date(2016, 10, 4)],
  'boost_rounds': [25, 15],
  'best_score': [0.6777176, 0.6654344]},
 {'training_term': [datetime.date(2012, 1, 15), datetime.date(2013, 1, 15)],
  'boost_rounds': [13, 19],
  'best_score': [0.6750046, 0.671297]},
 {'training_term': [datetime.date(2012, 5, 6), datetime.date(2013, 5, 6)],
  'boost_rounds': [14, 13],
  'best_score': [0.67426574, 0.6646366]},
 {'training_term': [datetime.date(2012, 4, 8), datetime.date(2013, 4, 8)],
  'boost_rounds': [7, 11],
  'best_score': [0.6725701, 0.6871941]},
 {'training_term': [datetime.date(2012, 1, 1), datetime.date(2013, 1, 1)],
  'boost_rounds': [15, 28],
  'best_score': [0.66854, 0.6764855]},
 {'training_term': [datetime.date(2015, 9, 6), datetime.date(2016, 9, 6)],
  'boost_rounds': [4, 5],
  'best_score': [0.6671772, 0.671724]},
 {'training_term': [datetime.date(2012, 6, 17), datetime.date(2013, 6, 17)],
  'boost_rounds': [11, 18],
  'best_score': [0.66430104, 0.66094226]},
 {'training_term': [datetime.date(2012, 6, 3), datetime.date(2013, 6, 3)],
  'boost_rounds': [19, 13],
  'best_score': [0.6633725, 0.66193634]},
 {'training_term': [datetime.date(2012, 7, 1), datetime.date(2013, 7, 1)],
  'boost_rounds': [18, 21],
  'best_score': [0.65886754, 0.668467]},
 {'training_term': [datetime.date(2015, 8, 9), datetime.date(2016, 8, 9)],
  'boost_rounds': [11, 9],
  'best_score': [0.65750855, 0.65420794]},
 {'training_term': [datetime.date(2012, 5, 20), datetime.date(2013, 5, 20)],
  'boost_rounds': [14, 14],
  'best_score': [0.653869, 0.65629095]},
 {'training_term': [datetime.date(2012, 7, 29), datetime.date(2013, 7, 29)],
  'boost_rounds': [15, 12],
  'best_score': [0.6535106, 0.64185745]},
 {'training_term': [datetime.date(2012, 2, 26), datetime.date(2013, 2, 26)],
  'boost_rounds': [5, 5],
  'best_score': [0.65312153, 0.65791017]},
 {'training_term': [datetime.date(2015, 10, 18), datetime.date(2016, 10, 18)],
  'boost_rounds': [13, 23],
  'best_score': [0.6525164, 0.64032394]},
 {'training_term': [datetime.date(2012, 3, 25), datetime.date(2013, 3, 25)],
  'boost_rounds': [19, 18],
  'best_score': [0.6516725, 0.6533877]},
 {'training_term': [datetime.date(2012, 12, 16), datetime.date(2013, 12, 16)],
  'boost_rounds': [7, 7],
  'best_score': [0.6513744, 0.6555382]},
 {'training_term': [datetime.date(2012, 7, 15), datetime.date(2013, 7, 15)],
  'boost_rounds': [19, 17],
  'best_score': [0.6470148, 0.6473995]},
 {'training_term': [datetime.date(2015, 11, 1), datetime.date(2016, 11, 1)],
  'boost_rounds': [12, 11],
  'best_score': [0.64515686, 0.6359645]},
 {'training_term': [datetime.date(2015, 7, 26), datetime.date(2016, 7, 26)],
  'boost_rounds': [18, 21],
  'best_score': [0.6446428, 0.63842595]},
 {'training_term': [datetime.date(2012, 2, 12), datetime.date(2013, 2, 12)],
  'boost_rounds': [8, 6],
  'best_score': [0.64383376, 0.6332524]},
 {'training_term': [datetime.date(2012, 3, 11), datetime.date(2013, 3, 11)],
  'boost_rounds': [6, 9],
  'best_score': [0.64314014, 0.6392905]},
 {'training_term': [datetime.date(2015, 6, 28), datetime.date(2016, 6, 28)],
  'boost_rounds': [43, 36],
  'best_score': [0.6405408, 0.6400905]},
 {'training_term': [datetime.date(2012, 12, 30), datetime.date(2013, 12, 30)],
  'boost_rounds': [4, 6],
  'best_score': [0.6347531, 0.63354534]},
 {'training_term': [datetime.date(2015, 12, 27), datetime.date(2016, 12, 27)],
  'boost_rounds': [34, 31],
  'best_score': [0.63352257, 0.6484244]},
 {'training_term': [datetime.date(2015, 12, 13), datetime.date(2016, 12, 13)],
  'boost_rounds': [18, 8],
  'best_score': [0.6331231, 0.62229174]},
 {'training_term': [datetime.date(2012, 12, 2), datetime.date(2013, 12, 2)],
  'boost_rounds': [10, 8],
  'best_score': [0.6313182, 0.6301248]},
 {'training_term': [datetime.date(2015, 8, 23), datetime.date(2016, 8, 23)],
  'boost_rounds': [30, 28],
  'best_score': [0.63035077, 0.64056605]},
 {'training_term': [datetime.date(2012, 8, 12), datetime.date(2013, 8, 12)],
  'boost_rounds': [10, 10],
  'best_score': [0.6293325, 0.62498826]},
 {'training_term': [datetime.date(2015, 5, 31), datetime.date(2016, 5, 31)],
  'boost_rounds': [44, 51],
  'best_score': [0.6269503, 0.63117033]},
 {'training_term': [datetime.date(2015, 11, 29), datetime.date(2016, 11, 29)],
  'boost_rounds': [20, 16],
  'best_score': [0.62625057, 0.62975204]},
 {'training_term': [datetime.date(2013, 1, 13), datetime.date(2014, 1, 13)],
  'boost_rounds': [5, 4],
  'best_score': [0.6258578, 0.6291289]},
 {'training_term': [datetime.date(2015, 7, 12), datetime.date(2016, 7, 12)],
  'boost_rounds': [17, 47],
  'best_score': [0.623877, 0.6330257]},
 {'training_term': [datetime.date(2012, 11, 18), datetime.date(2013, 11, 18)],
  'boost_rounds': [10, 11],
  'best_score': [0.6178253, 0.6123482]},
 {'training_term': [datetime.date(2015, 5, 17), datetime.date(2016, 5, 17)],
  'boost_rounds': [55, 45],
  'best_score': [0.61293584, 0.6016896]},
 {'training_term': [datetime.date(2013, 6, 16), datetime.date(2014, 6, 16)],
  'boost_rounds': [4, 4],
  'best_score': [0.6128596, 0.6130355]},
 {'training_term': [datetime.date(2012, 8, 26), datetime.date(2013, 8, 26)],
  'boost_rounds': [12, 13],
  'best_score': [0.6124136, 0.6185323]},
 {'training_term': [datetime.date(2013, 3, 24), datetime.date(2014, 3, 24)],
  'boost_rounds': [8, 10],
  'best_score': [0.6121072, 0.59619963]},
 {'training_term': [datetime.date(2015, 6, 14), datetime.date(2016, 6, 14)],
  'boost_rounds': [47, 48],
  'best_score': [0.6118877, 0.6200411]},
 {'training_term': [datetime.date(2013, 3, 10), datetime.date(2014, 3, 10)],
  'boost_rounds': [8, 6],
  'best_score': [0.61071867, 0.60738546]},
 {'training_term': [datetime.date(2013, 2, 24), datetime.date(2014, 2, 24)],
  'boost_rounds': [6, 5],
  'best_score': [0.610208, 0.61765623]},
 {'training_term': [datetime.date(2013, 1, 27), datetime.date(2014, 1, 27)],
  'boost_rounds': [4, 11],
  'best_score': [0.6099217, 0.6014231]},
 {'training_term': [datetime.date(2015, 3, 22), datetime.date(2016, 3, 22)],
  'boost_rounds': [62, 41],
  'best_score': [0.6098511, 0.6036138]},
 {'training_term': [datetime.date(2013, 4, 7), datetime.date(2014, 4, 7)],
  'boost_rounds': [7, 6],
  'best_score': [0.60949963, 0.60461646]},
 {'training_term': [datetime.date(2015, 11, 15), datetime.date(2016, 11, 15)],
  'boost_rounds': [27, 30],
  'best_score': [0.60828304, 0.6097183]},
 {'training_term': [datetime.date(2013, 9, 22), datetime.date(2014, 9, 22)],
  'boost_rounds': [56, 60],
  'best_score': [0.6037325, 0.595017]},
 {'training_term': [datetime.date(2012, 11, 4), datetime.date(2013, 11, 4)],
  'boost_rounds': [10, 13],
  'best_score': [0.60157907, 0.59852654]},
 {'training_term': [datetime.date(2013, 5, 5), datetime.date(2014, 5, 5)],
  'boost_rounds': [3, 3],
  'best_score': [0.6014604, 0.60113347]},
 {'training_term': [datetime.date(2013, 2, 10), datetime.date(2014, 2, 10)],
  'boost_rounds': [3, 3],
  'best_score': [0.5993807, 0.5979791]},
 {'training_term': [datetime.date(2015, 4, 5), datetime.date(2016, 4, 5)],
  'boost_rounds': [41, 35],
  'best_score': [0.5988152, 0.60255235]},
 {'training_term': [datetime.date(2013, 12, 15), datetime.date(2014, 12, 15)],
  'boost_rounds': [63, 26],
  'best_score': [0.5982619, 0.57175934]},
 {'training_term': [datetime.date(2013, 9, 8), datetime.date(2014, 9, 8)],
  'boost_rounds': [39, 51],
  'best_score': [0.5963604, 0.5951404]},
 {'training_term': [datetime.date(2013, 6, 30), datetime.date(2014, 6, 30)],
  'boost_rounds': [7, 2],
  'best_score': [0.5962737, 0.5890475]},
 {'training_term': [datetime.date(2014, 2, 9), datetime.date(2015, 2, 9)],
  'boost_rounds': [63, 66],
  'best_score': [0.59430116, 0.6014217]},
 {'training_term': [datetime.date(2013, 4, 21), datetime.date(2014, 4, 21)],
  'boost_rounds': [4, 4],
  'best_score': [0.59384084, 0.5955595]},
 {'training_term': [datetime.date(2013, 5, 19), datetime.date(2014, 5, 19)],
  'boost_rounds': [4, 8],
  'best_score': [0.5924876, 0.5865966]},
 {'training_term': [datetime.date(2013, 6, 2), datetime.date(2014, 6, 2)],
  'boost_rounds': [8, 7],
  'best_score': [0.5913741, 0.5913249]},
 {'training_term': [datetime.date(2014, 1, 12), datetime.date(2015, 1, 12)],
  'boost_rounds': [52, 65],
  'best_score': [0.5911786, 0.58791107]},
 {'training_term': [datetime.date(2015, 3, 8), datetime.date(2016, 3, 8)],
  'boost_rounds': [23, 43],
  'best_score': [0.59101737, 0.58037114]},
 {'training_term': [datetime.date(2013, 12, 29), datetime.date(2014, 12, 29)],
  'boost_rounds': [52, 100],
  'best_score': [0.5901284, 0.61013556]},
 {'training_term': [datetime.date(2015, 4, 19), datetime.date(2016, 4, 19)],
  'boost_rounds': [37, 36],
  'best_score': [0.5880456, 0.5809668]},
 {'training_term': [datetime.date(2014, 1, 26), datetime.date(2015, 1, 26)],
  'boost_rounds': [69, 74],
  'best_score': [0.58759856, 0.5849851]},
 {'training_term': [datetime.date(2013, 10, 6), datetime.date(2014, 10, 6)],
  'boost_rounds': [36, 95],
  'best_score': [0.5869878, 0.5938006]},
 {'training_term': [datetime.date(2012, 10, 7), datetime.date(2013, 10, 7)],
  'boost_rounds': [11, 11],
  'best_score': [0.5868606, 0.58348805]},
 {'training_term': [datetime.date(2012, 9, 23), datetime.date(2013, 9, 23)],
  'boost_rounds': [9, 8],
  'best_score': [0.5846448, 0.58078414]},
 {'training_term': [datetime.date(2012, 10, 21), datetime.date(2013, 10, 21)],
  'boost_rounds': [13, 11],
  'best_score': [0.58284897, 0.58650225]},
 {'training_term': [datetime.date(2013, 8, 25), datetime.date(2014, 8, 25)],
  'boost_rounds': [35, 25],
  'best_score': [0.5813714, 0.5685478]},
 {'training_term': [datetime.date(2014, 2, 23), datetime.date(2015, 2, 23)],
  'boost_rounds': [80, 70],
  'best_score': [0.57958627, 0.5732819]},
 {'training_term': [datetime.date(2015, 5, 3), datetime.date(2016, 5, 3)],
  'boost_rounds': [49, 61],
  'best_score': [0.57946604, 0.59161216]},
 {'training_term': [datetime.date(2013, 7, 28), datetime.date(2014, 7, 28)],
  'boost_rounds': [26, 35],
  'best_score': [0.579078, 0.58584476]},
 {'training_term': [datetime.date(2012, 9, 9), datetime.date(2013, 9, 9)],
  'boost_rounds': [10, 12],
  'best_score': [0.57800627, 0.57726645]},
 {'training_term': [datetime.date(2013, 8, 11), datetime.date(2014, 8, 11)],
  'boost_rounds': [27, 47],
  'best_score': [0.5771221, 0.59309345]},
 {'training_term': [datetime.date(2013, 7, 14), datetime.date(2014, 7, 14)],
  'boost_rounds': [32, 46],
  'best_score': [0.5752073, 0.59307665]},
 {'training_term': [datetime.date(2014, 11, 2), datetime.date(2015, 11, 2)],
  'boost_rounds': [29, 44],
  'best_score': [0.57405484, 0.58030176]},
 {'training_term': [datetime.date(2013, 11, 17), datetime.date(2014, 11, 17)],
  'boost_rounds': [51, 52],
  'best_score': [0.5727124, 0.563386]},
 {'training_term': [datetime.date(2013, 10, 20), datetime.date(2014, 10, 20)],
  'boost_rounds': [77, 32],
  'best_score': [0.5716642, 0.5728303]},
 {'training_term': [datetime.date(2014, 11, 16), datetime.date(2015, 11, 16)],
  'boost_rounds': [39, 17],
  'best_score': [0.5625889, 0.56524223]},
 {'training_term': [datetime.date(2014, 10, 19), datetime.date(2015, 10, 19)],
  'boost_rounds': [45, 36],
  'best_score': [0.561651, 0.5637296]},
 {'training_term': [datetime.date(2013, 12, 1), datetime.date(2014, 12, 1)],
  'boost_rounds': [38, 62],
  'best_score': [0.55703014, 0.56293327]},
 {'training_term': [datetime.date(2013, 11, 3), datetime.date(2014, 11, 3)],
  'boost_rounds': [38, 64],
  'best_score': [0.556944, 0.5671937]},
 {'training_term': [datetime.date(2014, 3, 9), datetime.date(2015, 3, 9)],
  'boost_rounds': [71, 65],
  'best_score': [0.5544495, 0.5604444]},
 {'training_term': [datetime.date(2014, 7, 13), datetime.date(2015, 7, 13)],
  'boost_rounds': [13, 7],
  'best_score': [0.54902244, 0.5511594]},
 {'training_term': [datetime.date(2014, 11, 30), datetime.date(2015, 11, 30)],
  'boost_rounds': [30, 26],
  'best_score': [0.54658854, 0.5499792]},
 {'training_term': [datetime.date(2014, 3, 23), datetime.date(2015, 3, 23)],
  'boost_rounds': [71, 70],
  'best_score': [0.54614383, 0.54619175]},
 {'training_term': [datetime.date(2014, 12, 14), datetime.date(2015, 12, 14)],
  'boost_rounds': [23, 32],
  'best_score': [0.53938556, 0.53524274]},
 {'training_term': [datetime.date(2015, 2, 8), datetime.date(2016, 2, 8)],
  'boost_rounds': [47, 30],
  'best_score': [0.53870577, 0.5300118]},
 {'training_term': [datetime.date(2014, 12, 28), datetime.date(2015, 12, 28)],
  'boost_rounds': [38, 48],
  'best_score': [0.5381294, 0.5376713]},
 {'training_term': [datetime.date(2015, 2, 22), datetime.date(2016, 2, 22)],
  'boost_rounds': [30, 37],
  'best_score': [0.53273463, 0.53644764]},
 {'training_term': [datetime.date(2015, 1, 11), datetime.date(2016, 1, 11)],
  'boost_rounds': [29, 44],
  'best_score': [0.5312279, 0.5324402]},
 {'training_term': [datetime.date(2014, 4, 6), datetime.date(2015, 4, 6)],
  'boost_rounds': [32, 36],
  'best_score': [0.5294655, 0.5369457]},
 {'training_term': [datetime.date(2014, 7, 27), datetime.date(2015, 7, 27)],
  'boost_rounds': [11, 31],
  'best_score': [0.5289819, 0.5332443]},
 {'training_term': [datetime.date(2015, 1, 25), datetime.date(2016, 1, 25)],
  'boost_rounds': [31, 30],
  'best_score': [0.52737236, 0.52900124]},
 {'training_term': [datetime.date(2014, 6, 29), datetime.date(2015, 6, 29)],
  'boost_rounds': [19, 23],
  'best_score': [0.52560997, 0.53129196]},
 {'training_term': [datetime.date(2014, 10, 5), datetime.date(2015, 10, 5)],
  'boost_rounds': [2, 2],
  'best_score': [0.5219602, 0.5211477]},
 {'training_term': [datetime.date(2014, 8, 10), datetime.date(2015, 8, 10)],
  'boost_rounds': [27, 29],
  'best_score': [0.51797974, 0.5159764]},
 {'training_term': [datetime.date(2014, 6, 15), datetime.date(2015, 6, 15)],
  'boost_rounds': [13, 22],
  'best_score': [0.5162688, 0.52020633]},
 {'training_term': [datetime.date(2014, 8, 24), datetime.date(2015, 8, 24)],
  'boost_rounds': [25, 22],
  'best_score': [0.5154004, 0.5242272]},
 {'training_term': [datetime.date(2014, 6, 1), datetime.date(2015, 6, 1)],
  'boost_rounds': [18, 9],
  'best_score': [0.5121738, 0.51498735]},
 {'training_term': [datetime.date(2014, 5, 4), datetime.date(2015, 5, 4)],
  'boost_rounds': [18, 21],
  'best_score': [0.50697416, 0.49536395]},
 {'training_term': [datetime.date(2014, 9, 21), datetime.date(2015, 9, 21)],
  'boost_rounds': [4, 4],
  'best_score': [0.5065333, 0.5124978]},
 {'training_term': [datetime.date(2014, 4, 20), datetime.date(2015, 4, 20)],
  'boost_rounds': [27, 74],
  'best_score': [0.50608236, 0.52521515]},
 {'training_term': [datetime.date(2014, 5, 18), datetime.date(2015, 5, 18)],
  'boost_rounds': [18, 29],
  'best_score': [0.5036928, 0.5027692]},
 {'training_term': [datetime.date(2014, 9, 7), datetime.date(2015, 9, 7)],
  'boost_rounds': [6, 15],
  'best_score': [0.5001438, 0.50789195]}]

In [13]:
def build_models_without_validation(model_params, train_data, boost_rounds):
    """
    Train one model per parameter set with a fixed number of boosting rounds.

    Parameters
    ----------
    model_params : list of dict
        LightGBM parameter sets. They are NOT modified: each model is trained
        from a copy, so a shared configuration keeps its own 'num_iteration'.
    train_data : lightgbm.Dataset
        Training data passed to every ``lgb.train`` call.
    boost_rounds : list of int
        Number of boosting rounds for the parameter set at the same position.
        Pairs are formed with ``zip``, so extra entries on either side are
        ignored.

    Returns
    -------
    list
        The trained models, in the order of ``model_params``.
    """
    models = []
    for _param, _boost_round in zip(model_params, boost_rounds):
        print("building model...")
        # Caution: the parameter sets used in this notebook carry
        # 'num_iteration': 1000, so the wanted round count is written into the
        # params as well as passed as num_boost_round. The override goes into a
        # copy so the caller's dict is left alone.
        _round_param = dict(_param, num_iteration=_boost_round)
        _model = lgb.train(_round_param, train_data,
                          num_boost_round=_boost_round)
        models.append(_model)
        print("model was built.")
    return models

In [14]:
# Collect the trained models: one pair of models per training term,
# for the first ten entries of ensemble_model_config
model_house = []
for _config in ensemble_model_config[0:10]:
    _term_start, _term_end = _config['training_term']
    _in_term = (market_train_df['time'] >= _term_start) & (market_train_df['time'] <= _term_end)
    _idx_training = market_train_df[_in_term].index
    _X_train, _up_train = get_training_data(market_train_df, _idx_training, feature_cols)
    _train_data = lgb.Dataset(_X_train, label=_up_train.astype(int), free_raw_data=False)
    _models = build_models_without_validation(model_params=config_model['model_params'],
                                              train_data=_train_data,
                                              boost_rounds=_config['boost_rounds'])
    model_house.extend({"model": _each_model, "post_process_config": {}}
                       for _each_model in _models)

building model...




model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.
building model...
model was built.


In [15]:
# Predict the validation rows with every model in model_house, in order
preds = []
for _entry in model_house:
    print("predicting ...")
    _booster = _entry['model']
    preds.append(_booster.predict(X_valid))


predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...
predicting ...


In [16]:
# Restricting the ensemble to the top-scoring entries...
selected_preds = preds[0:5]
# Start from a copy of the first prediction so the array inside `preds` is not modified
total_preds = selected_preds[0].copy() if selected_preds else None
for _preds in selected_preds[1:]:
    total_preds += _preds

# Confidence: map the averaged prediction p to 2p - 1
confidence = 2 * (total_preds/len(selected_preds)) - 1

In [17]:
# Score the ensemble confidence on the validation set:
# mean of the per-day sums of confidence * return * universe, divided by their standard deviation
_valid_side = validation_data.params
r_test = _valid_side['return']
x_t_i = confidence * _valid_side['return'] * _valid_side['universe']
data = {'day' : _valid_side['time'], 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean, std = np.mean(x_t), np.std(x_t)
score_test = mean / std
print(score_test)

0.6984079240967279


```
[post-process][sigmoid:60] sig_score=0.73060537509618
[post-process][sigmoid:61] sig_score=0.7306197948191054
[post-process][sigmoid:62] sig_score=0.7306298801065034
[post-process][sigmoid:63] sig_score=0.730635930242216
```

In [18]:
# Sweep the steepness of a sigmoid applied to the confidence and print the score for each value
_sweep_returns = validation_data.params['return']
_sweep_universe = validation_data.params['universe']
_sweep_days = validation_data.params['time']
for _sig_coeff in range(1,1000):
    # squash the confidence with a sigmoid rescaled to (-1, 1)
    confidence_sig = (2/(1 + np.exp(-1*_sig_coeff * confidence))-1.0)
    x_t_i = confidence_sig * _sweep_returns * _sweep_universe
    data = {'day' : _sweep_days, 'x_t_i' : x_t_i}
    df = pd.DataFrame(data)
    x_t = df.groupby('day').sum().values.flatten()
    mean, std = np.mean(x_t), np.std(x_t)
    score_test = mean / std
    print("[post-process][sigmoid:{}] sig_score={}".format(_sig_coeff, score_test))


[post-process][sigmoid:1] sig_score=0.6985474674278646
[post-process][sigmoid:2] sig_score=0.6989569766129292
[post-process][sigmoid:3] sig_score=0.6996111235571613
[post-process][sigmoid:4] sig_score=0.7004734859209296
[post-process][sigmoid:5] sig_score=0.7015025674176618
[post-process][sigmoid:6] sig_score=0.7026568629760874
[post-process][sigmoid:7] sig_score=0.7038981871664353
[post-process][sigmoid:8] sig_score=0.705193370242673
[post-process][sigmoid:9] sig_score=0.7065147871067222
[post-process][sigmoid:10] sig_score=0.7078401852473872
[post-process][sigmoid:11] sig_score=0.7091521527202775
[post-process][sigmoid:12] sig_score=0.7104374419575148
[post-process][sigmoid:13] sig_score=0.7116862743653029
[post-process][sigmoid:14] sig_score=0.712891692082948
[post-process][sigmoid:15] sig_score=0.7140489873690683
[post-process][sigmoid:16] sig_score=0.7151552189091867
[post-process][sigmoid:17] sig_score=0.7162088125561822
[post-process][sigmoid:18] sig_score=0.717209238125506
[pos

[post-process][sigmoid:149] sig_score=0.7280611759459388
[post-process][sigmoid:150] sig_score=0.7280361620857858
[post-process][sigmoid:151] sig_score=0.7280113979905842
[post-process][sigmoid:152] sig_score=0.727986881052015
[post-process][sigmoid:153] sig_score=0.7279626086469908
[post-process][sigmoid:154] sig_score=0.727938578141672
[post-process][sigmoid:155] sig_score=0.7279147868951894
[post-process][sigmoid:156] sig_score=0.7278912322630869
[post-process][sigmoid:157] sig_score=0.7278679116005075
[post-process][sigmoid:158] sig_score=0.7278448222651298
[post-process][sigmoid:159] sig_score=0.7278219616198802
[post-process][sigmoid:160] sig_score=0.7277993270354262
[post-process][sigmoid:161] sig_score=0.727776915892469
[post-process][sigmoid:162] sig_score=0.7277547255838474
[post-process][sigmoid:163] sig_score=0.7277327535164637
[post-process][sigmoid:164] sig_score=0.7277109971130423
[post-process][sigmoid:165] sig_score=0.7276894538137354
[post-process][sigmoid:166] sig_sc

KeyboardInterrupt: 

In [None]:
# Average the predictions of the first and the second model
# NOTE(review): gbm_1, gbm_2 and X_test are not defined in any cell visible in this notebook — confirm their origin.
confidence_test = (gbm_1.predict(X_test) + gbm_2.predict(X_test))/2
confidence_test

In [None]:
# Express each prediction as a fraction of the observed range (min-max scaling)
confidence_test = (confidence_test-confidence_test.min())/(confidence_test.max()-confidence_test.min())
confidence_test

In [None]:
# Multiply the scaled value by 2 and subtract 1 (the original note questioned this step; it is a doubling, not a squaring)
confidence_test = confidence_test*2-1
print(max(confidence_test),min(confidence_test))
confidence_test

In [None]:
# Compute the actual metric used for the final score
# NOTE(review): u_test and d_test are not defined in any cell visible in this notebook — confirm their origin.
r_test = r_test.clip(-1,1) # clamp values to [-1, 1] (clip limits values, it does not remove them); where do the out-of-range ones come from?
# Multiply the model's confidence by the target value and the universe flag
x_t_i = confidence_test * r_test * u_test
# Build a frame holding only the date and the per-row score
data = {'day' : d_test, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
# Group (aggregate) by date, then flatten the 2-D result into a 1-D array
x_t = df.groupby('day').sum().values.flatten()
# Mean of the daily scores
mean = np.mean(x_t)
# Standard deviation of the daily scores
std = np.std(x_t)
# Reciprocal of the coefficient of variation
score_test = mean / std
print(score_test)

In [None]:
# Release the training/test arrays.
# NOTE(review): X_train and X_test are not defined in any cell visible in this notebook; `gc` is already imported in the imports cell.
import gc
del X_train,X_test
gc.collect()

In [None]:
# Prediction: walk through the prediction days supplied by the environment, rebuild the same
# features as in training, average the ensemble and submit one confidence value per asset.
days = env.get_prediction_days()
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
total_market_obs_df = []

# Use exactly the lag settings the training features were built with.
n_lag = config_model['n_lag']
return_features = config_model['return_features']
# The validation cell scored the average of the first five predictions; use the same models here.
live_models = [_each_model['model'] for _each_model in model_house[0:5]]
# Days of history the longest window needs: the window itself plus one row for the shift.
history_days = int(np.max(n_lag)) + 1

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    if (n_days%50==0):
        print(n_days,end=' ')
    t = time.time()
    market_obs_df['time'] = market_obs_df['time'].dt.date

    # Keep only as many past days as the lag features need, so memory stays bounded.
    total_market_obs_df.append(market_obs_df)
    total_market_obs_df = total_market_obs_df[-history_days:]
    history_df = pd.concat(total_market_obs_df)

    # Lag features come from the recent history; the left merge keeps only today's rows.
    new_df = generate_lag_features(history_df, n_lag=n_lag, return_features=return_features)
    market_obs_df = pd.merge(market_obs_df,new_df,how='left',on=['time','assetCode'])
    market_obs_df = mis_impute(market_obs_df)

    # Training skipped data_prep and applied no feature scaling, so neither is applied here;
    # the feature columns are the ones selected for training.
    X_live = market_obs_df[feature_cols].values
    prep_time += time.time() - t

    t = time.time()
    lp = sum(_model.predict(X_live) for _model in live_models) / len(live_models)
    prediction_time += time.time() -t

    t = time.time()

    # Same mapping as the validated ensemble: averaged prediction p -> 2p - 1.
    confidence = 2 * lp - 1

    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':confidence})
    # Assets without a prediction get confidence 0.
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

env.write_submission_file()
sub  = pd.read_csv("submission.csv")