In [1]:
#インポートとデータ取得
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import warnings
warnings.simplefilter('ignore')

sns.set(font="IPAexGothic",style="white")
df_train1=pd.read_csv("train_finance.csv")
df_test1=pd.read_csv("test_finance.csv")
df_submit = pd.read_csv("submit_finance.csv", index_col=0)

In [2]:
#データの確認
df_train1.describe()

Unnamed: 0,High,Low,Open,Close
count,2074.0,2074.0,2074.0,2074.0
mean,124.078238,122.910559,123.526205,123.52949
std,9.210193,9.493319,9.188003,9.18497
min,108.520946,6.743457,107.993884,107.998923
25%,117.002707,115.863246,116.408944,116.42799
50%,123.242134,121.951219,122.590435,122.573382
75%,130.814157,129.393252,130.201913,130.17993
max,148.260942,146.834717,147.832063,147.888796


In [3]:
df_test1.describe()

Unnamed: 0,High,Low,Open,Close
count,1246.0,1246.0,1246.0,1246.0
mean,111.424603,110.675657,111.069958,111.070373
std,9.206534,9.270658,9.215541,9.214999
min,96.767439,70.352906,96.364197,96.357173
25%,103.722807,103.064503,103.422443,103.422154
50%,108.456692,107.796669,108.091687,108.082215
75%,120.690221,119.751722,120.218867,120.219641
max,129.417693,128.691531,128.851618,128.858797


In [4]:
df_test1

Unnamed: 0,Date,High,Low,Open,Close
0,2011-12-15,120.703596,119.907182,120.132990,120.139231
1,2011-12-16,121.027367,120.314249,120.527453,120.524311
2,2011-12-19,120.629617,120.111145,120.582444,120.591872
3,2011-12-20,121.470779,120.221987,120.323633,120.322078
4,2011-12-21,122.046150,120.513328,121.063789,121.063789
...,...,...,...,...,...
1241,2019-12-19,103.067513,102.766359,102.822331,102.822331
1242,2019-12-26,102.720708,102.515796,102.629538,102.632956
1243,2019-12-27,103.421288,102.674990,102.686502,102.685356
1244,2019-12-30,103.791471,103.429382,103.429382,103.436318


## 訓練用データの編集

In [5]:
#テストデータの先頭5行目までを抽出
df_testH = df_test1.drop(range(5,1246), axis=0)
df_testH

Unnamed: 0,Date,High,Low,Open,Close
0,2011-12-15,120.703596,119.907182,120.13299,120.139231
1,2011-12-16,121.027367,120.314249,120.527453,120.524311
2,2011-12-19,120.629617,120.111145,120.582444,120.591872
3,2011-12-20,121.470779,120.221987,120.323633,120.322078
4,2011-12-21,122.04615,120.513328,121.063789,121.063789


In [6]:
#訓練用データへ組み込み (テストデータの先頭5行目はあとで削除するため)
df_train1 = pd.concat([df_train1,df_testH],ignore_index=True)

In [7]:
df_train1.loc[(df_train1['Low']) < 80, 'Low'] = df_train1['High'] - (np.average(df_train1['High']) - np.average(df_train1['Low']))

#標準化
scaler = StandardScaler()
df_train1[['High', 'Low', 'Open', 'Close']] =  scaler.fit_transform(df_train1[['High', 'Low', 'Open', 'Close']])
df_train1.head()

In [8]:
#値動き、変化率を追加
df_train1['Diff'] = df_train1['Close'].diff(1)
for num in range(1, 5):
    df_train1['Diff%d' % num] = df_train1['Diff'].shift(num)
df_train1['Change'] = df_train1['Close'].pct_change(1)

In [9]:
#移動平均
df_train1['OpenAve5'] = df_train1['Open'].rolling(window=5).mean()
df_train1['CloseAve5'] = df_train1['Close'].rolling(window=5).mean()

#5日最高安値
df_train1['Max5'] = df_train1['High'].rolling(window=5).max()
df_train1['Min5'] = df_train1['Low'].rolling(window=5).min()

#前日までの5日間の最高値/安値を当日超えたかどうか
df_train1.loc[(df_train1['High']) > (df_train1['Max5'].shift()), 'Trend'] = 1
df_train1.loc[(df_train1['Low']) < (df_train1['Min5'].shift()), 'Trend'] = -1
df_train1['Trend'].fillna(0, inplace=True)
df_train1.head(3)

Unnamed: 0,Date,High,Low,Open,Close,Diff,Diff1,Diff2,Diff3,Diff4,Change,OpenAve5,CloseAve5,Max5,Min5,Trend
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,,,,,,,,,,,0.0
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1.146639,,,,,0.01036,,,,,0.0
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0.314508,1.146639,,,,0.002813,,,,,0.0


In [10]:
#上がったか下がったか、当日～前5日までとその合計
df_train1.loc[(df_train1['Diff']) <= 0, 'HL'] = 0
df_train1.loc[(df_train1['Diff']) > 0, 'HL'] = 1
for num in range(1, 5):
    df_train1['HL%d' % num] = df_train1['HL'].shift(num)
df_train1['HL_5'] = (df_train1['HL']+df_train1['HL1']+df_train1['HL2']+df_train1['HL3']+df_train1['HL4'])
df_train1.head(3)

Unnamed: 0,Date,High,Low,Open,Close,Diff,Diff1,Diff2,Diff3,Diff4,...,CloseAve5,Max5,Min5,Trend,HL,HL1,HL2,HL3,HL4,HL_5
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,,,,,,...,,,,0.0,,,,,,
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1.146639,,,,,...,,,,0.0,1.0,,,,,
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0.314508,1.146639,,,,...,,,,0.0,1.0,1.0,,,,


In [11]:
#最大値幅と終値-始値の差、5日分
df_train1['H-L'] = df_train1['High'] - df_train1['Low']
for num in range(1, 5):
    df_train1['H-L%d' % num] = df_train1['H-L'].shift(num)
df_train1['OtoC'] = df_train1['Close'] - df_train1['Open']
for num in range(1, 5):
    df_train1['OtoC%d' % num] = df_train1['OtoC'].shift(num)

In [12]:
#翌日始値-当日終値の差(窓)、5日分
df_train1['Next_gap'] = df_train1['Open'].shift(-1) - df_train1['Close']
for num in range(0, 4):
    df_train1['gap%d' % num] = df_train1['Next_gap'].shift(num+1)

In [13]:
#終値が最高値最安値どちらに近いか、5日分
df_train1['CloseHL'] = df_train1['Close']*2-(df_train1['High'] + df_train1['Low'])
for num in range(1, 5):
    df_train1['CloseHL%d' % num] = df_train1['CloseHL'].shift(num)

In [14]:
#日を追加
df_train1['Day'] =  df_train1["Date"].str[-2:].astype(int)
#df_train1.loc[(df_train1['Day'] == 1)|
#              (df_train1['Day'] == 2), 'Day'] = 1
#df_train1.loc[(df_train1['Day'] == 29)|
#              (df_train1['Day'] == 30)|
#              (df_train1['Day'] == 31), 'Day'] = -1
#df_train1.loc[df_train1['Day'] > 2, 'Day'] = 0
df_train1.head(3)

Unnamed: 0,Date,High,Low,Open,Close,Diff,Diff1,Diff2,Diff3,Diff4,...,gap0,gap1,gap2,gap3,CloseHL,CloseHL1,CloseHL2,CloseHL3,CloseHL4,Day
0,2003-12-01,111.370639,110.482062,111.314347,110.676377,,,,,,...,,,,,-0.499947,,,,,1
1,2003-12-02,112.008532,110.500532,110.639316,111.823016,1.146639,,,,,...,-0.037061,,,,1.136968,-0.499947,,,,2
2,2003-12-03,112.202814,111.712284,111.832477,112.137524,0.314508,1.146639,,,,...,0.009461,-0.037061,,,0.35995,1.136968,-0.499947,,,3


In [15]:
#曜日列を数値で追加
df_train1["Date"] = pd.to_datetime(df_train1["Date"], format="%Y-%m-%d")
df_train1["Weekday"] = df_train1["Date"].dt.weekday

In [16]:
#翌日の始値の上下 (目的変数1)と翌日の終値の上下(目的変数2)
df_train1.loc[(df_train1['Next_gap']) <= 0, 'Next_OpenHL'] = 0
df_train1.loc[(df_train1['Next_gap']) > 0, 'Next_OpenHL'] = 1
df_train1['Next_OpenHL'].fillna(0.5, inplace=True)
df_train1['Next_OpenHL'] = df_train1['Next_OpenHL'].astype(int)

df_train1['NextHL'] = df_train1['HL'].shift(-1)

In [17]:
#欠損値を含む行の削除
df_train1 = df_train1.drop([0, 1, 2, 3, 4, 5, 2078], axis=0)
print(df_train1)

           Date        High         Low        Open       Close      Diff  \
6    2003-12-09  113.562415  112.831025  113.044748  113.312017  0.276929   
7    2003-12-10  113.460736  112.498941  113.303692  112.766375 -0.545642   
8    2003-12-11  113.173376  112.137524  112.766375  113.072392  0.306017   
9    2003-12-12  113.830743  112.970206  113.100037  113.580543  0.508151   
10   2003-12-15  114.006114  112.646899  112.729259  113.959768  0.379225   
...         ...         ...         ...         ...         ...       ...   
2073 2011-12-14  120.796608  119.824855  120.508620  120.427021 -1.405340   
2074 2011-12-15  120.703596  119.907182  120.132990  120.139231 -0.287790   
2075 2011-12-16  121.027367  120.314249  120.527453  120.524311  0.385080   
2076 2011-12-19  120.629617  120.111145  120.582444  120.591872  0.067562   
2077 2011-12-20  121.470779  120.221987  120.323633  120.322078 -0.269794   

         Diff1     Diff2     Diff3     Diff4  ...      gap3   CloseHL  \
6 

In [18]:
#分析する要素を選択
df_train2 = df_train1[["Diff", "HL1", "HL2", "HL3", "HL4", "CloseHL", "gap0", "OtoC", "Weekday", "NextHL"]]
df_train2.isnull().sum()

Diff       0
HL1        0
HL2        0
HL3        0
HL4        0
CloseHL    0
gap0       0
OtoC       0
Weekday    0
NextHL     0
dtype: int64

## 検証データの編集

In [19]:
#テストデータに同じ列の作成
df_test1.loc[(df_test1['Low']) < 90, 'Low'] = df_test1['High'] - (np.average(df_test1['High']) - np.average(df_test1['Low']))

#df_test1[['High', 'Low', 'Open', 'Close']] =  scaler.fit_transform(df_test1[['High', 'Low', 'Open', 'Close']])

df_test1['Diff'] = df_test1['Close'].diff(1)
for num in range(1, 5):
    df_test1['Diff%d' % num] = df_test1['Diff'].shift(num)
    
df_test1['Change'] = df_test1['Close'].pct_change(1)

df_test1['OpenAve5'] = df_test1['Open'].rolling(window=5).mean()
df_test1['CloseAve5'] = df_test1['Close'].rolling(window=5).mean()
df_test1['Max5'] = df_test1['High'].rolling(window=5).max()
df_test1['Min5'] = df_test1['Low'].rolling(window=5).min()
df_test1.loc[(df_test1['High']) > (df_test1['Max5'].shift()), 'Trend'] = 1
df_test1.loc[(df_test1['Low']) < (df_test1['Min5'].shift()), 'Trend'] = -1
df_test1['Trend'].fillna(0, inplace=True)

df_test1.loc[(df_test1['Diff']) <= 0, 'HL'] = 0
df_test1.loc[(df_test1['Diff']) > 0, 'HL'] = 1


for num in range(1, 5):
    df_test1['HL%d' % num] = df_test1['HL'].shift(num)
df_test1['HL_5'] = (df_test1['HL']+df_test1['HL1']+df_test1['HL2']+df_test1['HL3']+df_test1['HL4'])
df_test1.head()

df_test1['H-L'] = df_test1['High'] - df_test1['Low']
for num in range(1, 5):
    df_test1['H-L%d' % num] = df_test1['H-L'].shift(num)
    
df_test1['OtoC'] = df_test1['Close'] - df_test1['Open']
for num in range(1, 5):
    df_test1['OtoC%d' % num] = df_test1['OtoC'].shift(num)

df_test1['Next_gap'] = df_test1['Open'].shift(-1) - df_test1['Close']
for num in range(0, 4):
    df_test1['gap%d' % num] = df_test1['Next_gap'].shift(num+1)

df_test1['CloseHL'] = df_test1['Close']*2-(df_test1['High'] + df_test1['Low'])
for num in range(1, 5):
    df_test1['CloseHL%d' % num] = df_test1['CloseHL'].shift(num)

df_test1['Day'] =  df_test1["Date"].str[-2:].astype(int)
#df_test1.loc[(df_test1['Day'] == 1)|
#              (df_test1['Day'] == 2), 'Day'] = 1
#df_test1.loc[(df_test1['Day'] == 29)|
#              (df_test1['Day'] == 30)|
#              (df_test1['Day'] == 31), 'Day'] = -1
#df_test1.loc[df_test1['Day'] > 2, 'Day'] = 0

df_test1["Date"] = pd.to_datetime(df_test1["Date"], format="%Y-%m-%d")
df_test1["Weekday"] = df_test1["Date"].dt.weekday

df_test1.loc[(df_test1['Next_gap']) <= 0, 'Next_OpenHL'] = 0
df_test1.loc[(df_test1['Next_gap']) > 0, 'Next_OpenHL'] = 1
df_test1['NextHL'] = df_test1['HL'].shift(-1)

In [20]:
#訓練用データに結合用のデータを抽出(2種類) 欠損値を含む行の削除
for num in range(0, 1242, 6):
    df_testT = df_test1.drop([num], axis=0)

df_testT1 = df_testT[["NextHL", "Diff", "HL1", "HL2", "HL3", "HL4", "gap0", "CloseHL", 'Trend', 'Next_OpenHL']]
df_testT1 = df_testT1.drop([0,1,2,3,4,1245],axis=0)

df_testT = df_testT[["Diff", "HL1", "HL2", "HL3", "HL4", "CloseHL", "gap0", "OtoC", "Weekday", "NextHL"]]
df_testT = df_testT.drop([0,1,2,3,4,1245],axis=0)
print(df_testT)

          Diff  HL1  HL2  HL3  HL4   CloseHL      gap0      OtoC  Weekday  \
5    -0.443611  1.0  0.0  1.0  1.0 -0.578701 -0.462478  0.018867        3   
6    -1.043152  0.0  1.0  0.0  1.0  0.455784 -1.057068  0.013916        3   
7     0.266464  0.0  0.0  1.0  0.0  0.093342  0.230759  0.035705        4   
8     0.037271  1.0  0.0  0.0  1.0  0.311255  0.037271  0.000000        0   
9    -0.176849  1.0  1.0  0.0  0.0 -1.112456 -0.220217  0.043369        1   
...        ...  ...  ...  ...  ...       ...       ...       ...      ...   
1240  0.133260  1.0  0.0  1.0  1.0  0.369058  0.134406 -0.001147        2   
1241 -0.327939  1.0  1.0  0.0  1.0 -0.189210 -0.327939  0.000000        3   
1242 -0.189375  0.0  1.0  1.0  0.0  0.029409 -0.192794  0.003418        3   
1243  0.052400  0.0  0.0  1.0  1.0 -0.725567  0.053546 -0.001147        4   
1244  0.750962  1.0  0.0  0.0  1.0 -0.348217  0.744026  0.006936        0   

      NextHL  
5        0.0  
6        1.0  
7        1.0  
8        0.0  


In [21]:
#予測が必要な行のみに削除し、予測用データの作成1 (始値用)
for num in range(1242, 1246):
    df_test1 = df_test1.drop([num], axis=0)
for num in range(0, 1242, 6):
    df_test1 = df_test1.drop([num, (num+1), (num+2), (num+3), (num+4)], axis=0)
print(df_test1)

           Date        High         Low        Open       Close      Diff  \
5    2011-12-22  121.351259  120.467798  120.601311  120.620178 -0.443611   
11   2012-01-05  119.722504  118.263761  119.611044  119.615676 -1.152538   
17   2012-01-19  119.519852  118.816713  118.983307  119.010863  1.110383   
23   2012-02-02  122.063870  121.109760  121.838779  121.822734  0.860260   
29   2012-02-16  120.788713  120.064380  120.788713  120.787136 -0.699599   
...         ...         ...         ...         ...         ...       ...   
1217 2019-10-23  103.282714  102.618147  102.976862  102.974568  0.028648   
1223 2019-11-07  102.595387  102.101638  102.403443  102.401171 -0.035154   
1229 2019-11-21  102.642064  102.339983  102.480587  102.479451 -0.010564   
1235 2019-12-05  102.720708  102.489662  102.514660  102.518067 -0.020455   
1241 2019-12-19  103.067513  102.766359  102.822331  102.822331 -0.327939   

         Diff1     Diff2     Diff3     Diff4  ...      gap3   CloseHL  \
5 

In [22]:
#予測用データの作成２ (終値用)
df_test2 = df_test1[["Diff", "HL1", "HL2", "HL3", "HL4", "CloseHL", "gap0", "OtoC", "Weekday"]]
df_test2.isnull().sum()

Diff       0
HL1        0
HL2        0
HL3        0
HL4        0
CloseHL    0
gap0       0
OtoC       0
Weekday    0
dtype: int64

# 結合データの作成と検証

In [23]:
#訓練用データとテストデータの結合
df_trainC = pd.concat([df_train2,df_testT],ignore_index=True)
df_trainC

Unnamed: 0,Diff,HL1,HL2,HL3,HL4,CloseHL,gap0,OtoC,Weekday,NextHL
0,0.276929,1.0,1.0,0.0,1.0,0.230594,0.009660,0.267269,1,0.0
1,-0.545642,1.0,1.0,1.0,0.0,-0.426927,-0.008325,-0.537317,2,1.0
2,0.306017,0.0,1.0,1.0,1.0,0.833884,0.000000,0.306017,3,1.0
3,0.508151,1.0,0.0,1.0,1.0,0.360137,0.027644,0.480506,4,1.0
4,0.379225,1.0,1.0,0.0,1.0,1.266523,-0.851285,1.230509,0,1.0
...,...,...,...,...,...,...,...,...,...,...
3306,0.133260,1.0,0.0,1.0,1.0,0.369058,0.134406,-0.001147,2,0.0
3307,-0.327939,1.0,1.0,0.0,1.0,-0.189210,-0.327939,0.000000,3,0.0
3308,-0.189375,0.0,1.0,1.0,0.0,0.029409,-0.192794,0.003418,3,1.0
3309,0.052400,0.0,0.0,1.0,1.0,-0.725567,0.053546,-0.001147,4,1.0


In [24]:
#結合データを古い順に1行ずつ削除し、accuracyスコアが高くなる行数の探索 (翌日終値)　【ロジスティック回帰】

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

drop = ['NextHL']
train_X = df_trainC.drop(columns=drop,axis=1) 
train_y = df_trainC['NextHL']
dtc = LogisticRegression()
max_score = 0

for i in range(1,3000):
    train_X = train_X.drop(i, axis=0)
    train_y = train_y.drop(i, axis=0)
    
    X_train, X_test, y_train, y_test = train_test_split(train_X,train_y,test_size=0.3,random_state=0)
    dtc.fit(X_train, y_train)
    train_pred = dtc.predict(X_train)
    test_pred = dtc.predict(X_test)
    score = accuracy_score(y_test, test_pred)
    #score = (accuracy_score(y_train, train_pred) + accuracy_score(y_test, test_pred)) - abs(accuracy_score(y_train, train_pred)-accuracy_score(y_test, test_pred))

    if max_score + (0.0000001*i) < score:
        max_score = score
        best_i = i

print("削除行数:", best_i)
print("MAX:",max_score)

削除行数: 2984
MAX: 0.8585858585858586


In [25]:
#予測に使用するデータの作成 (終値)
df_trainC = df_trainC.drop(range(0,best_i),axis=0)
#df_trainC = df_trainC.drop(range(0,2984),axis=0)

# 翌日終値予測

In [26]:
#【ロジスティック回帰】
y = df_trainC["NextHL"]
x = df_trainC.drop(["NextHL"], axis=1)
dtc.fit(x, y)

LogisticRegression()

In [28]:
#終値予測値を始値予測用データに入力
pred_test = dtc.predict_proba(df_test2)[:, 1] 
df_test1['NextHL'] = np.round(pred_test)
df_submit1 = df_submit
df_submit1['y'] = np.round(pred_test)
df_submit1.to_csv('submit_finance_23close.csv')

# 始値予測

In [29]:
#特徴量を選択
df_train1 = df_train1[["NextHL", "Diff", "HL1", "HL2", "HL3", "HL4", "gap0", "CloseHL", 'Trend', "Next_OpenHL"]]
df_train1.isnull().sum()

NextHL         0
Diff           0
HL1            0
HL2            0
HL3            0
HL4            0
gap0           0
CloseHL        0
Trend          0
Next_OpenHL    0
dtype: int64

In [30]:
#特徴量を選択
df_test1 = df_test1[["NextHL", "Diff", "HL1", "HL2", "HL3", "HL4", "gap0", "CloseHL", 'Trend']]
df_test1.isnull().sum()

NextHL     0
Diff       0
HL1        0
HL2        0
HL3        0
HL4        0
gap0       0
CloseHL    0
Trend      0
dtype: int64

In [31]:
#結合
df_trainC1 = pd.concat([df_train1,df_testT1],ignore_index=True)
df_trainC1

Unnamed: 0,NextHL,Diff,HL1,HL2,HL3,HL4,gap0,CloseHL,Trend,Next_OpenHL
0,0.0,0.276929,1.0,1.0,0.0,1.0,0.009660,0.230594,1.0,0.0
1,1.0,-0.545642,1.0,1.0,1.0,0.0,-0.008325,-0.426927,0.0,0.0
2,1.0,0.306017,0.0,1.0,1.0,1.0,0.000000,0.833884,0.0,1.0
3,1.0,0.508151,1.0,0.0,1.0,1.0,0.027644,0.360137,1.0,0.0
4,1.0,0.379225,1.0,1.0,0.0,1.0,-0.851285,1.266523,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
3306,0.0,0.133260,1.0,0.0,1.0,1.0,0.134406,0.369058,0.0,0.0
3307,0.0,-0.327939,1.0,1.0,0.0,1.0,-0.327939,-0.189210,-1.0,0.0
3308,1.0,-0.189375,0.0,1.0,1.0,0.0,-0.192794,0.029409,-1.0,1.0
3309,1.0,0.052400,0.0,0.0,1.0,1.0,0.053546,-0.725567,1.0,1.0


In [32]:
#結合データを古い順に1行ずつ削除し、accuracyスコアが高くなる行数の探索 (翌日始値)　【ロジスティック回帰】

drop = ['Next_OpenHL', 'NextHL']
train_X = df_trainC1.drop(columns=drop,axis=1) 
train_y = df_trainC1['Next_OpenHL']
max_score = 0

for ii in range(1,3200):
    train_X = train_X.drop(ii, axis=0)
    train_y = train_y.drop(ii, axis=0)
    
    X_train, X_test, y_train, y_test = train_test_split(train_X,train_y,test_size=0.3,random_state=3)
    dtc.fit(X_train, y_train)
    train_pred = dtc.predict(X_train)
    test_pred = dtc.predict(X_test)
    score = accuracy_score(y_test, test_pred)
    #score = (accuracy_score(y_train, train_pred) + accuracy_score(y_test, test_pred)) - abs(accuracy_score(y_train, train_pred)-accuracy_score(y_test, test_pred))

    if max_score + (0.0000001*ii) < score:
        max_score = score
        best_ii = ii

print("削除行数:", best_ii)
print("MAX:",max_score)

削除行数: 2985
MAX: 0.8571428571428571


In [None]:
df_trainC1 = df_trainC1.drop(range(0,best_ii),axis=0)
df_trainC1

drop = ['Next_OpenHL']
train_X = df_trainC1.drop(columns=drop,axis=1) 
train_y = df_trainC1['Next_OpenHL']
X_train, X_test, y_train, y_test = train_test_split(train_X,train_y,test_size=0.3,random_state=0)
dtc.fit(X_train, y_train)
train_pred = dtc.predict(X_train)
test_pred = dtc.predict(X_test)
print("train:",accuracy_score(y_train, train_pred))
print("test",accuracy_score(y_test, test_pred))

# モデリング2

In [None]:
y2 = df_trainC1["Next_OpenHL"]
x2 = df_trainC1.drop(["Next_OpenHL"], axis=1)
dtc.fit(x2, y2)

In [None]:
pred = dtc.predict_proba(df_test1)[:, 1] 

from sklearn import tree

target = df_train1["Next_OpenHL"].values

features = df_train1[["NextHL", "Diff", "HL1", "HL2", "HL3", "HL4", "gap1", "CloseHL1", "Weekday", 'Trend']].values

max_depth = 9
min_samples_split = 4
my_tree = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree = my_tree.fit(features, target)

test_features = df_test1[["NextHL", "Diff", "HL1", "HL2", "HL3", "HL4", "gap1", "CloseHL1", "Weekday", 'Trend']].values
my_prediction_tree = my_tree.predict_proba(df_test1)[:, 1] 

my_prediction_tree.shape

my_prediction_tree

## 提出用データの出力

In [None]:
df_submit['y'] = np.round(pred)

In [None]:
df_submit

In [None]:
df_submit.to_csv('submit_finance_23.csv')