<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#作戦メモ" data-toc-modified-id="作戦メモ-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>作戦メモ</a></span></li><li><span><a href="#前処理" data-toc-modified-id="前処理-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>前処理</a></span></li><li><span><a href="#優秀なトレーニング期間の探索" data-toc-modified-id="優秀なトレーニング期間の探索-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>優秀なトレーニング期間の探索</a></span></li></ul></div>

# 作戦メモ
トレーニング期間を変えながら、性能が良いモデルを構築できる期間を探し出す。  
そして、それらの期間ごとにモデルを構築し、最後にアンサンブルする。  

このノートブックは作戦探索用である。
提出用ノートブックは別に用意すべし。

In [1]:
import numpy as np
import lightgbm as lgb
import pandas as pd
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import random
from datetime import datetime, date
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import time
from joblib import Parallel, delayed
from tqdm import tqdm
from dateutil.parser import parse
import gc

import datetime
from dateutil.relativedelta import relativedelta
from IPython import embed

In [2]:
# Create the competition environment (data acquisition).
# `twosigmanews` is already imported in the imports cell at the top of the notebook,
# so it is not imported a second time here.
# NOTE(review): make_env() reportedly may be called only once per kernel session —
# confirm; if so, re-running this cell requires a kernel restart.
env = twosigmanews.make_env()
print('Done!')

Loading the data... This could take a minute.
Done!
Done!


In [3]:
# Bind the two training frames from attributes of the environment object.
# NOTE(review): `_var07` / `_var10` are private, undocumented attribute names; the
# public accessor appears to be `env.get_training_data()` — confirm before relying on them.
market_train_df = env._var07
news_train_df = env._var10

In [4]:
# Central configuration: lag-feature settings, validation period and the LightGBM
# parameter sets that are trained for every candidate training window.
config_model = {"n_lag":[3,7,14], # list of rolling-window sizes used for the time-series (lag) features
                
                # columns for which the time-series (lag) features are computed
                "return_features" : ['returnsClosePrevMktres10','returnsClosePrevRaw10','open','close'],
                
                # validation period: [first day, last day], both compared inclusively
                "validation_term" : ['2017-01-01', '2018-12-31'],
                
                # NOTE(review): 'num_iteration' looks like a LightGBM alias for the number of
                # boosting rounds, while build_models also passes num_boost_round=200 —
                # confirm which of the two actually takes effect.
                "model_params": [{ # parameter set 1
                                    'task': 'train',
                                    'boosting_type': 'gbdt',
                                    'objective': 'binary',
                                    'learning_rate': 0.05,
                                    'num_leaves': 2400,
                                    'min_data_in_leaf': 150,
                                    'num_iteration': 1000,
                                    'max_bin': 200,
                                    'verbose': 1,
                                    'metric' : 'None'  # use only the custom evaluation function
                                },
                                { # parameter set 2
                                    'task': 'train',
                                    'boosting_type': 'gbdt',
                                    'objective': 'binary',
                                    'learning_rate': 0.048,
                                    'num_leaves': 2300,
                                    'min_data_in_leaf': 150,
                                    'num_iteration': 1000,
                                    'max_bin': 200,
                                    'verbose': 1,
                                    'metric' : 'None'  # use only the custom evaluation function
                                }]
               }


# 前処理

In [5]:
# Reduce the timestamps to calendar dates.
# Going through pd.to_datetime keeps this cell re-runnable: after the first run the
# column holds plain date objects, on which the bare `.dt` accessor would raise.
market_train_df['time'] = pd.to_datetime(market_train_df['time']).dt.date

In [6]:
def create_lag(df_code,n_lag, return_features):
    """
    Build shifted rolling-window statistics for one block of rows.

    For every column in ``return_features`` and every window size in ``n_lag``
    the column is first shifted down by one row (so a row never contributes to
    its own statistics) and then the rolling mean, max and min over ``window``
    rows are taken.

    Parameters
    ----------
    df_code : pandas.DataFrame
        Must contain "time", "assetCode" and every column named in
        ``return_features``. Rows are used in the order given.
    n_lag : iterable of int
        Rolling window sizes, in rows.
    return_features : iterable of str
        Names of the columns to derive the statistics from.

    Returns
    -------
    pandas.DataFrame
        Columns "time", "assetCode" and, per (column, window) pair,
        "<col>_lag_<window>_mean", "<col>_lag_<window>_max" and
        "<col>_lag_<window>_min". Every NaN in the frame is replaced by -1.
    """
    # Collect the columns in a dict and build the frame once at the end (fast).
    features = {"time": df_code["time"].values,
                "assetCode": df_code["assetCode"].values}
    shift_size = 1
    for col in return_features:
        # The shift does not depend on the window size, so it is done once per column.
        shifted = df_code[col].shift(shift_size)
        for window in n_lag:
            rolled = shifted.rolling(window=window)
            features['%s_lag_%s_mean'%(col,window)] = rolled.mean()
            features['%s_lag_%s_max'%(col,window)] = rolled.max()
            features['%s_lag_%s_min'%(col,window)] = rolled.min()
    return pd.DataFrame.from_dict(features).fillna(-1)


def generate_lag_features(df, n_lag, return_features):
    """
    Compute the lag features asset by asset in parallel and stack the results.

    ``df`` is split by "assetCode"; each group is handed to ``create_lag`` in a
    joblib worker (``n_jobs=-1``), and the per-asset frames are concatenated in
    the order the groups were produced.

    Returns the concatenated pandas.DataFrame.
    """
    # One delayed create_lag call per asset group.
    jobs = []
    for _code, group in df.groupby('assetCode'):
        jobs.append(delayed(create_lag)(group, n_lag, return_features))

    per_asset_frames = Parallel(n_jobs=-1)(tqdm(jobs))
    stacked = pd.concat(per_asset_frames)

    # Drop the local reference before collecting garbage.
    del df
    gc.collect()
    return stacked

def data_prep(market_train):
    """
    Add an integer asset id and discard incomplete rows.

    A new column "assetCodeT" is written onto ``market_train`` itself; it maps
    each "assetCode" to its position in order of first appearance. The return
    value is a new frame without any row that contains a missing value.
    """
    codes = market_train['assetCode'].unique()
    code_to_id = dict(zip(codes, range(len(codes))))
    market_train['assetCodeT'] = market_train['assetCode'].map(code_to_id)
    return market_train.dropna(axis=0)

def mis_impute(data):
    """
    Fill missing values column by column, writing the result back into ``data``.

    * object columns: missing entries become the string "other"
    * int64 / float64 columns: missing entries become the column mean
    * every other dtype is left untouched

    Returns the same frame that was passed in.
    """
    for name in data.columns:
        column = data[name]
        kind = column.dtype
        if kind == "object":
            # text-like column
            data[name] = column.fillna("other")
        elif kind == "int64" or kind == "float64":
            # numeric column
            data[name] = column.fillna(column.mean())
    return data

def get_feature_cols(df):
    """
    Return the column names that are used as model features.

    That is every column of ``df``, in its original order, whose name is not on
    the exclusion list below (which includes, among others, the target
    'returnsOpenNextMktres10', 'time' and 'universe').
    """
    excluded = frozenset((
        'assetCode', 'assetCodes', 'assetCodesLen', 'assetName', 'audiences',
        'firstCreated', 'headline', 'headlineTag', 'marketCommentary', 'time_x',
        'provider', 'returnsOpenNextMktres10', 'sourceId', 'subjects', 'time',
        'universe', 'sourceTimestamp',
    ))
    return [name for name in df.columns if name not in excluded]

In [7]:
# Compute the rolling lag features with the windows / columns set in config_model.
new_df = generate_lag_features(market_train_df,
                               n_lag=config_model['n_lag'], 
                               return_features=config_model['return_features'])
# Attach them to the market frame; the left join on (time, assetCode) keeps every market row.
# NOTE(review): market_train_df is rebound here, so re-running only this cell merges
# the lag columns a second time.
market_train_df = pd.merge(market_train_df,new_df,how='left',on=['time','assetCode'], copy=False)
# Fill missing values ("other" for object columns, column mean for int64/float64 columns).
market_train_df = mis_impute(market_train_df)
# Add the integer asset id column and drop rows that still contain a missing value.
market_train_df = data_prep(market_train_df)

#up = market_train_df['returnsOpenNextMktres10'] >= 0

#universe(これが１以外は訓練データとして使えない)の値をuniverse変数に放り込む
#でもなんで値？
#universe = market_train_df['universe'].values


100%|██████████| 4125/4125 [00:58<00:00, 70.44it/s]


In [8]:
# Save memory: downcast every float64 column to float32.
# The loop variable is deliberately not `_`: in IPython `_` refers to the last cell
# output, and assigning to it in a cell shadows that.
for col_name in market_train_df.columns:
    if market_train_df[col_name].dtype == np.float64:
        market_train_df[col_name] = market_train_df[col_name].astype(np.float32)
gc.collect()

14

# 優秀なトレーニング期間の探索
学習期間をスライドさせながら、バリデーション期間における性能を測定し、記録する。  
そして、優秀な性能を誇る期間をリストアップする。

In [9]:
# Results of the training-period search.
# The search loop appends one dict per training window:
#   "training_term": [start date, end date], "results": list returned by build_models
performance_records = []

# Feature columns
feature_cols = get_feature_cols(market_train_df)

# Row labels of the validation period (both bounds inclusive)
validation_start = parse(config_model['validation_term'][0]).date()
validation_end = parse(config_model['validation_term'][1]).date()
in_validation = ((market_train_df['time'] >= validation_start)
                 & (market_train_df['time'] <= validation_end))
idx_validate = market_train_df[in_validation].index

In [10]:
# Prepare the validation data
_valid_rows = market_train_df.loc[idx_validate]
X_valid = _valid_rows[feature_cols].values
# Binary label: did the target return come out positive?
up_valid = _valid_rows['returnsOpenNextMktres10'] > 0
# The return / time / universe arrays ride along in `params` so that the custom
# evaluation function can read them back from the Dataset.
validation_data = lgb.Dataset(
    X_valid,
    label=up_valid.astype(int),
    free_raw_data=False,
    params={"return": _valid_rows['returnsOpenNextMktres10'].values,
            "time": _valid_rows['time'].values,
            "universe": _valid_rows['universe'].values},
)
# Release the intermediate slice before collecting garbage.
del _valid_rows
gc.collect()

0

In [11]:
# Start of the first training window; the search loop moves this value forward.
pos_training_start = parse('2012-01-01')
training_term_range = relativedelta(months=12)  # each training window is one year long
# Amount by which the window start is advanced after each search iteration.
training_term_step = relativedelta(weeks=2)
gc.collect()

0

In [12]:
def get_training_data(market_train_df, idx_training, feature_cols):
    """
    Extract the training inputs for the given row labels.

    Returns a 2-tuple:
      * X_train  - values of ``feature_cols`` for the rows ``idx_training``
      * up_train - boolean Series, True where 'returnsOpenNextMktres10' > 0
    """
    rows = market_train_df.loc[idx_training]
    X_train = rows[feature_cols].values
    up_train = rows['returnsOpenNextMktres10'] > 0
    return (X_train, up_train)

def eval_sigma_score(preds, valid_set):
    """
    Custom evaluation function: compute the sigma score of a set of predictions.

    Parameters
    ----------
    preds : numpy.ndarray
        Predictions; they are rescaled to a confidence via ``2 * preds - 1``.
    valid_set : object with a ``params`` mapping
        ``params`` must provide equally long arrays under the keys 'time',
        'universe' and 'return'.

    Returns
    -------
    tuple
        ``('sigma_score', score, True)`` where ``score`` is a numpy.float32 and
        the final ``True`` marks the metric as higher-is-better.

    The per-row value confidence * universe * return is summed per 'time' value;
    the score is the mean of those sums divided by their (population) standard
    deviation. When that standard deviation is exactly zero the score is
    reported as 0 instead of dividing by zero, which would hand an inf/nan
    to the early-stopping logic.
    """
    side = valid_set.params
    confidence = 2 * preds - 1
    per_row = pd.DataFrame.from_dict({"time": side['time'],
                                      "x_t_i": confidence * side['universe'] * side['return']})
    x_t = per_row.groupby("time").agg({"x_t_i": "sum"}).values.flatten()
    _mean = x_t.mean()
    _std = x_t.std()
    if _std == 0:
        # Degenerate case (e.g. a single day, or identical daily sums).
        return ('sigma_score', np.float32(0.0), True)
    return ('sigma_score', np.float32(_mean/_std), True)

def build_models(model_params, train_data, validation_data):
    """
    Train one LightGBM model per parameter set and report how each one did.

    Every model is trained on ``train_data`` and monitored on
    ``validation_data`` with the custom ``eval_sigma_score`` metric and early
    stopping (10 rounds).

    Returns a list with one entry per parameter set, each of the form
    ``[model, best_iteration, best sigma_score, evals_result dict]``.

    NOTE(review): if a parameter dict carries a boosting-round alias such as
    'num_iteration', LightGBM may use it instead of ``num_boost_round=200`` —
    confirm.
    """
    trained = []
    for params in model_params:
        history = {}
        booster = lgb.train(
            params,
            train_data,
            num_boost_round=200,
            valid_sets=validation_data,
            early_stopping_rounds=10,
            feval=eval_sigma_score,
            evals_result=history,
        )
        # Best boosting round and the best score seen on the validation set
        best_round = booster.best_iteration
        best_score = np.max(history['valid_0']['sigma_score'])
        trained.append([booster, best_round, best_score, history])
    return trained

In [None]:
# Slide the training window forward until its end would reach the start of the
# validation period. Each pass trains every parameter set on the current window
# and appends the outcome to performance_records.
# NOTE(review): pos_training_start is advanced in place, so re-running this cell
# without re-running the cell that initialises it continues from where it stopped.
while pos_training_start + training_term_range < parse(config_model['validation_term'][0]):
    # Search step: current window as [start date, end date]
    _training_term = [pos_training_start.date(), (pos_training_start + training_term_range).date()]
    print("[Training Term] {} - {}".format(_training_term[0], _training_term[1]))
    
    # Rows whose date falls inside the window (both bounds inclusive)
    _idx_training = market_train_df[(market_train_df['time'] >= _training_term[0])
                                    & (market_train_df['time'] <= _training_term[1])].index
    _X_train, _up_train = get_training_data(market_train_df, _idx_training, feature_cols)
    _train_data = lgb.Dataset(_X_train, label=_up_train.astype(int), free_raw_data=False)
    performance_records.append({"training_term":_training_term,
                                "results": build_models(model_params=config_model['model_params'], train_data=_train_data, validation_data=validation_data)})
    
    # Print a summary of the window that was just evaluated
    _latest = performance_records[-1]
    print("=" * 80)
    print("[Training term] {} - {}".format(_latest['training_term'][0], _latest['training_term'][1]))
    for _r in _latest['results']:
        # _r is [model, best_iteration, best sigma_score, evals_result]
        print("Best Sigma Score: {} / boost_rounds = {}".format(_r[2], _r[1]))
    print("=" * 80)
    
    # Move the window start forward by one step
    pos_training_start = pos_training_start + training_term_step
    

In [None]:
# Average the predictions of the first and the second model.
# NOTE(review): gbm_1, gbm_2 and X_test are not defined anywhere in the visible part
# of this notebook — this cell looks like a leftover and fails on a fresh kernel.
confidence_test = (gbm_1.predict(X_test) + gbm_2.predict(X_test))/2
confidence_test

In [None]:
# Min-max scale the predictions: the smallest value becomes 0 and the largest 1.
confidence_test = (confidence_test-confidence_test.min())/(confidence_test.max()-confidence_test.min())
confidence_test

In [None]:
# Multiply by two and subtract one (not a square): values in [0, 1] are mapped onto [-1, 1].
confidence_test = confidence_test*2-1
print(max(confidence_test),min(confidence_test))
confidence_test

In [None]:
# Compute the metric that is used for the final score.
# NOTE(review): r_test, u_test and d_test are not defined anywhere in the visible
# part of this notebook — this cell looks like a leftover and fails on a fresh kernel.
r_test = r_test.clip(-1,1) # clamp values into [-1, 1]; clip limits values, it does not remove them
# Per-row score: confidence * r_test * u_test
# (presumably the target return and the universe flag — confirm)
x_t_i = confidence_test * r_test * u_test
# Frame holding only the day and the per-row score
data = {'day' : d_test, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
# Sum per day, then flatten the result into a 1-D array
x_t = df.groupby('day').sum().values.flatten()
# Mean of the daily sums
mean = np.mean(x_t)
# Standard deviation of the daily sums
std = np.std(x_t)
# mean / std, i.e. the reciprocal of the coefficient of variation
score_test = mean / std
print(score_test)

In [None]:
# Drop the train/test matrices and collect garbage.
# NOTE(review): X_train and X_test are not defined anywhere in the visible part of
# this notebook, so the `del` raises NameError on a fresh kernel; `gc` is also
# already imported in the imports cell at the top.
import gc
del X_train,X_test
gc.collect()

In [None]:
#prediction
# Walk through the prediction days, rebuild the features exactly as for training,
# predict with the averaged models and hand the confidences back to the environment.
#
# NOTE(review): gbm_1, gbm_2, maxs and rng are not defined anywhere in the visible
# part of this notebook, so this cell still cannot run as-is. The models trained by
# the search loop live in performance_records, and the training path applies no
# min-max scaling to its features — decide which models to ensemble and whether the
# scaling line below should exist at all.
days = env.get_prediction_days()
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
total_market_obs_df = []

# Use the same lag windows and source columns as the training features.
n_lag = config_model['n_lag']
return_features = config_model['return_features']

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    if (n_days%50==0):
        print(n_days,end=' ')
    t = time.time()
    market_obs_df['time'] = market_obs_df['time'].dt.date

    # Keep enough recent days to fill the largest rolling window (plus the one-row shift).
    total_market_obs_df.append(market_obs_df)
    if len(total_market_obs_df)==1:
        history_df = total_market_obs_df[0]
    else:
        history_df = pd.concat(total_market_obs_df[-(np.max(n_lag)+1):])

    # generate_lag_features requires return_features; pass it explicitly.
    new_df = generate_lag_features(history_df, n_lag=n_lag, return_features=return_features)
    # The left join keeps only the rows of the current day.
    market_obs_df = pd.merge(market_obs_df,new_df,how='left',on=['time','assetCode'])

    market_obs_df = mis_impute(market_obs_df)

    market_obs_df = data_prep(market_obs_df)

    # Same feature columns as used for training (computed by get_feature_cols).
    X_live = market_obs_df[feature_cols].values
    X_live = 1 - ((maxs - X_live) / rng)
    prep_time += time.time() - t

    t = time.time()
    lp = (gbm_1.predict(X_live) + gbm_2.predict(X_live))/2
    prediction_time += time.time() -t

    t = time.time()

    # Min-max scale the day's predictions to [0, 1], then map them onto [-1, 1].
    confidence = lp
    confidence = (confidence-confidence.min())/(confidence.max()-confidence.min())
    confidence = confidence * 2 - 1

    # Assets without a prediction (dropped rows, or NaN confidence) get confidence 0.
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':confidence})
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

env.write_submission_file()
sub  = pd.read_csv("submission.csv")