In [751]:
# import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import datetime
import warnings
warnings.simplefilter('ignore')

In [752]:
# read data
train = pd.read_csv('train_finance.csv')
test = pd.read_csv('test_finance.csv')
submit = pd.read_csv('submit_finance.csv')

In [753]:
train.head()

Unnamed: 0,Date,High,Low,Open,Close
0,2003-12-01,111.370639,110.482062,111.314347,110.676377
1,2003-12-02,112.008532,110.500532,110.639316,111.823016
2,2003-12-03,112.202814,111.712284,111.832477,112.137524
3,2003-12-04,112.332259,111.406851,112.110343,111.748717
4,2003-12-05,112.766375,111.609834,111.721723,112.729259


In [754]:
test.head()

Unnamed: 0,Date,High,Low,Open,Close
0,2011-12-15,120.703596,119.907182,120.13299,120.139231
1,2011-12-16,121.027367,120.314249,120.527453,120.524311
2,2011-12-19,120.629617,120.111145,120.582444,120.591872
3,2011-12-20,121.470779,120.221987,120.323633,120.322078
4,2011-12-21,122.04615,120.513328,121.063789,121.063789


In [755]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2074 entries, 0 to 2073
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2074 non-null   object 
 1   High    2074 non-null   float64
 2   Low     2074 non-null   float64
 3   Open    2074 non-null   float64
 4   Close   2074 non-null   float64
dtypes: float64(4), object(1)
memory usage: 81.1+ KB


In [756]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1246 non-null   object 
 1   High    1246 non-null   float64
 2   Low     1246 non-null   float64
 3   Open    1246 non-null   float64
 4   Close   1246 non-null   float64
dtypes: float64(4), object(1)
memory usage: 48.8+ KB


In [757]:
test.isnull().sum()

Date     0
High     0
Low      0
Open     0
Close    0
dtype: int64

In [758]:
train.describe()

Unnamed: 0,High,Low,Open,Close
count,2074.0,2074.0,2074.0,2074.0
mean,124.078238,122.910559,123.526205,123.52949
std,9.210193,9.493319,9.188003,9.18497
min,108.520946,6.743457,107.993884,107.998923
25%,117.002707,115.863246,116.408944,116.42799
50%,123.242134,121.951219,122.590435,122.573382
75%,130.814157,129.393252,130.201913,130.17993
max,148.260942,146.834717,147.832063,147.888796


In [759]:
test.describe()

Unnamed: 0,High,Low,Open,Close
count,1246.0,1246.0,1246.0,1246.0
mean,111.424603,110.675657,111.069958,111.070373
std,9.206534,9.270658,9.215541,9.214999
min,96.767439,70.352906,96.364197,96.357173
25%,103.722807,103.064503,103.422443,103.422154
50%,108.456692,107.796669,108.091687,108.082215
75%,120.690221,119.751722,120.218867,120.219641
max,129.417693,128.691531,128.851618,128.858797


In [760]:
train.corr()

Unnamed: 0,High,Low,Open,Close
High,1.0,0.944731,0.9983,0.998288
Low,0.944731,1.0,0.945923,0.945973
Open,0.9983,0.945923,1.0,0.996934
Close,0.998288,0.945973,0.996934,1.0


In [761]:
# targetの作成
df_diff = train['Open'].shift(-1) - train['Close']
df_diff = df_diff.fillna(0)
train['y'] = df_diff
train.head()

Unnamed: 0,Date,High,Low,Open,Close,y
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,-0.037061
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,0.009461
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,-0.027181
3,2003-12-04,112.332259,111.406851,112.110343,111.748717,-0.026994
4,2003-12-05,112.766375,111.609834,111.721723,112.729259,-0.175559


In [762]:
train['y'] = train['y'].apply(lambda x: 1 if x > 0 else 0)
train.head()

Unnamed: 0,Date,High,Low,Open,Close,y
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,0
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0
3,2003-12-04,112.332259,111.406851,112.110343,111.748717,0
4,2003-12-05,112.766375,111.609834,111.721723,112.729259,0


In [763]:
train.loc[1114:1116]

Unnamed: 0,Date,High,Low,Open,Close,y
1114,2008-03-14,145.059359,143.736202,144.513188,144.465784,1
1115,2008-03-17,147.103265,6.743457,145.669288,145.669288,0
1116,2008-03-18,146.409411,144.456764,145.660125,144.83676,1


In [764]:
(143.736202 + 144.456764)/2

144.09648299999998

In [765]:
train.loc[1115, 'Low'] = 144.09648299999998

In [766]:
train.describe()

Unnamed: 0,High,Low,Open,Close,y
count,2074.0,2074.0,2074.0,2074.0,2074.0
mean,124.078238,122.976785,123.526205,123.52949,0.437319
std,9.210193,9.155624,9.188003,9.18497,0.496175
min,108.520946,107.748581,107.993884,107.998923,0.0
25%,117.002707,115.876669,116.408944,116.42799,0.0
50%,123.242134,121.954434,122.590435,122.573382,0.0
75%,130.814157,129.426294,130.201913,130.17993,1.0
max,148.260942,146.834717,147.832063,147.888796,1.0


In [767]:
# datetime型に変更
train['Date'] = pd.to_datetime(train['Date'], format='%Y/%m/%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y/%m/%d')
submit['Date'] = pd.to_datetime(submit['Date'], format='%Y/%m/%d')
train.head()

Unnamed: 0,Date,High,Low,Open,Close,y
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,0
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0
3,2003-12-04,112.332259,111.406851,112.110343,111.748717,0
4,2003-12-05,112.766375,111.609834,111.721723,112.729259,0


In [768]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2074 entries, 0 to 2073
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    2074 non-null   datetime64[ns]
 1   High    2074 non-null   float64       
 2   Low     2074 non-null   float64       
 3   Open    2074 non-null   float64       
 4   Close   2074 non-null   float64       
 5   y       2074 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 97.3 KB


In [769]:
# monthカラムの追加
train['Month'] = train['Date'].dt.month
test['Month'] = test['Date'].dt.month
train.head()

Unnamed: 0,Date,High,Low,Open,Close,y,Month
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,0,12
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1,12
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0,12
3,2003-12-04,112.332259,111.406851,112.110343,111.748717,0,12
4,2003-12-05,112.766375,111.609834,111.721723,112.729259,0,12


In [770]:
train['Month'].unique()

array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [771]:
test.head()

Unnamed: 0,Date,High,Low,Open,Close,Month
0,2011-12-15,120.703596,119.907182,120.13299,120.139231,12
1,2011-12-16,121.027367,120.314249,120.527453,120.524311,12
2,2011-12-19,120.629617,120.111145,120.582444,120.591872,12
3,2011-12-20,121.470779,120.221987,120.323633,120.322078,12
4,2011-12-21,122.04615,120.513328,121.063789,121.063789,12


In [772]:
train.set_index(keys='Date', inplace=True)
test.set_index(keys='Date', inplace=True)
submit.set_index(keys='Date', inplace=True)

In [773]:
train.head(10)

Unnamed: 0_level_0,High,Low,Open,Close,y,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-12-01,111.370639,110.482062,111.314347,110.676377,0,12
2003-12-02,112.008532,110.500532,110.639316,111.823016,1,12
2003-12-03,112.202814,111.712284,111.832477,112.137524,0,12
2003-12-04,112.332259,111.406851,112.110343,111.748717,0,12
2003-12-05,112.766375,111.609834,111.721723,112.729259,0,12
2003-12-08,113.220483,112.425105,112.5537,113.035088,1,12
2003-12-09,113.562415,112.831025,113.044748,113.312017,0,12
2003-12-10,113.460736,112.498941,113.303692,112.766375,0,12
2003-12-11,113.173376,112.137524,112.766375,113.072392,1,12
2003-12-12,113.830743,112.970206,113.100037,113.580543,0,12


In [774]:
train['y'].value_counts()

0    1167
1     907
Name: y, dtype: int64

In [775]:
# 標準化
scaler = StandardScaler()
train.iloc[0:6][['Open', 'High', 'Low', 'Close']] = scaler.fit_transform(train.iloc[0:6][['Open', 'High', 'Low', 'Close']])

repeat_time = int(len(train)/6)
for i in range(repeat_time):
    start = i * 6
    end = start + 6
    scaler = StandardScaler()
    train.iloc[start: end][['Open', 'High', 'Low', 'Close']] = scaler.fit_transform(train.iloc[start: end][['Open', 'High', 'Low', 'Close']])

In [776]:
test.iloc[0:6][['Open', 'High', 'Low', 'Close']] = scaler.fit_transform(test.iloc[0:6][['Open', 'High', 'Low', 'Close']])

repeat_time = int(len(test)/6)
for i in range(repeat_time):
    start = i * 6
    end = start + 6
    scaler = StandardScaler()
    test.iloc[start: end][['Open', 'High', 'Low', 'Close']] = scaler.fit_transform(test.iloc[start: end][['Open', 'High', 'Low', 'Close']])

In [777]:
train.head()

Unnamed: 0_level_0,High,Low,Open,Close,y,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-12-01,-1.629848,-1.272108,-0.630752,-1.774696,0,12
2003-12-02,-0.531078,-1.245227,-1.748364,-0.265794,1,12
2003-12-03,-0.196427,0.518381,0.227087,0.148079,0,12
2003-12-04,0.026541,0.073848,0.687134,-0.363566,0,12
2003-12-05,0.774306,0.369273,0.043718,0.926762,0,12


In [778]:
test.head()

Unnamed: 0_level_0,High,Low,Open,Close,Month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-12-15,-1.032581,-1.68098,-1.415698,-1.412396,12
2011-12-16,-0.365541,0.280997,-0.038916,-0.067296,12
2011-12-19,-1.184994,-0.697919,0.153016,0.168699,12
2011-12-20,0.547987,-0.163684,-0.750303,-0.773703,12
2011-12-21,1.733379,1.240515,1.833034,1.817123,12


In [779]:
train['Body'] = train['Open'] - train['Close']
test['Body'] = test['Open'] - test['Close']

In [780]:
train

Unnamed: 0_level_0,High,Low,Open,Close,y,Month,Body
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-12-01,-1.629848,-1.272108,-0.630752,-1.774696,0,12,1.143944
2003-12-02,-0.531078,-1.245227,-1.748364,-0.265794,1,12,-1.482570
2003-12-03,-0.196427,0.518381,0.227087,0.148079,0,12,0.079009
2003-12-04,0.026541,0.073848,0.687134,-0.363566,0,12,1.050701
2003-12-05,0.774306,0.369273,0.043718,0.926762,0,12,-0.883044
...,...,...,...,...,...,...,...
2011-12-08,-0.572820,-1.516008,-0.728701,-0.693250,0,12,-0.035451
2011-12-09,124.227770,122.895824,123.445260,123.445260,1,12,0.000000
2011-12-12,123.745819,122.202555,123.681290,123.687906,0,12,-0.006616
2011-12-13,122.393352,121.174807,121.782641,121.832361,0,12,-0.049720


In [781]:
train = pd.get_dummies(train, drop_first=True, columns=['Month'])
test = pd.get_dummies(test, drop_first=True, columns=['Month'])

In [782]:
train.head()

Unnamed: 0_level_0,High,Low,Open,Close,y,Body,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2003-12-01,-1.629848,-1.272108,-0.630752,-1.774696,0,1.143944,0,0,0,0,0,0,0,0,0,0,1
2003-12-02,-0.531078,-1.245227,-1.748364,-0.265794,1,-1.48257,0,0,0,0,0,0,0,0,0,0,1
2003-12-03,-0.196427,0.518381,0.227087,0.148079,0,0.079009,0,0,0,0,0,0,0,0,0,0,1
2003-12-04,0.026541,0.073848,0.687134,-0.363566,0,1.050701,0,0,0,0,0,0,0,0,0,0,1
2003-12-05,0.774306,0.369273,0.043718,0.926762,0,-0.883044,0,0,0,0,0,0,0,0,0,0,1


In [783]:
test.head()

Unnamed: 0_level_0,High,Low,Open,Close,Body,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-12-15,-1.032581,-1.68098,-1.415698,-1.412396,-0.003302,0,0,0,0,0,0,0,0,0,0,1
2011-12-16,-0.365541,0.280997,-0.038916,-0.067296,0.02838,0,0,0,0,0,0,0,0,0,0,1
2011-12-19,-1.184994,-0.697919,0.153016,0.168699,-0.015683,0,0,0,0,0,0,0,0,0,0,1
2011-12-20,0.547987,-0.163684,-0.750303,-0.773703,0.0234,0,0,0,0,0,0,0,0,0,0,1
2011-12-21,1.733379,1.240515,1.833034,1.817123,0.015911,0,0,0,0,0,0,0,0,0,0,1


In [784]:
# 学習用
x = train.drop('y', axis=1)
y = train['y']

In [785]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=False)

In [786]:
model = lgb.LGBMClassifier(boosting_type='dart', colsample_bytree=1.0, importance_type='split', 
                           learning_rate=0.05, max_depth=4, min_child_samples=20, 
                           min_child_weight=0.001, n_estimators=150, n_jobs=-1, num_leaves=30,
                           objective='binary', random_state=0, silent=True, subsample=1.0)

In [787]:
model.fit(x_train, y_train)
pred_check = model.predict(x_test)

In [788]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_check)

0.5452793834296724

In [789]:
model.fit(x, y)

LGBMClassifier(boosting_type='dart', learning_rate=0.05, max_depth=4,
               n_estimators=150, num_leaves=30, objective='binary',
               random_state=0)

In [790]:
pred_all = model.predict(test)

In [791]:
test['y'] = pred_all
test

Unnamed: 0_level_0,High,Low,Open,Close,Body,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-12-15,-1.032581,-1.680980,-1.415698,-1.412396,-0.003302,0,0,0,0,0,0,0,0,0,0,1,1
2011-12-16,-0.365541,0.280997,-0.038916,-0.067296,0.028380,0,0,0,0,0,0,0,0,0,0,1,0
2011-12-19,-1.184994,-0.697919,0.153016,0.168699,-0.015683,0,0,0,0,0,0,0,0,0,0,1,0
2011-12-20,0.547987,-0.163684,-0.750303,-0.773703,0.023400,0,0,0,0,0,0,0,0,0,0,1,1
2011-12-21,1.733379,1.240515,1.833034,1.817123,0.015911,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-19,-1.241754,-1.035678,-1.150752,-1.141032,-0.009720,0,0,0,0,0,0,0,0,0,0,1,1
2019-12-26,102.720708,102.515796,102.629538,102.632956,-0.003418,0,0,0,0,0,0,0,0,0,0,1,0
2019-12-27,103.421288,102.674990,102.686502,102.685356,0.001147,0,0,0,0,0,0,0,0,0,0,1,0
2019-12-30,103.791471,103.429382,103.429382,103.436318,-0.006936,0,0,0,0,0,0,0,0,0,0,1,0


In [792]:
# pred_all[5::6]
# submit['y'] = pred_all[5::6]

In [793]:
submit

2011-12-22
2012-01-05
2012-01-19
2012-02-02
2012-02-16
...
2019-10-23
2019-11-07
2019-11-21
2019-12-05
2019-12-19


In [794]:
submit = submit.join(test['y'])
submit

Unnamed: 0_level_0,y
Date,Unnamed: 1_level_1
2011-12-22,0
2012-01-05,0
2012-01-19,0
2012-02-02,0
2012-02-16,1
...,...
2019-10-23,1
2019-11-07,1
2019-11-21,0
2019-12-05,0


In [795]:
import datetime
now = datetime.datetime.now()
timestamp = now.strftime('%m%d_%H-%M-%S')
submit.to_csv(f'all_{timestamp}.csv')