In [1]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from os import path
import glob
import pathlib

In [2]:
# Folder the data was downloaded into.
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escapes (e.g. "\k", "\d") being passed through unchanged,
# which newer Python versions warn about.
DATA_DIR = r'K:\kabu.plus\csv'

# Stock price tables (daily files).
PRICE_DIR = path.join(DATA_DIR, r'japan-all-stock-prices\daily')

# Investment indicator data (daily files).
STOCK_DATA_DIR = path.join(DATA_DIR, r'japan-all-stock-data\daily')
# Earnings / financial / business-results data (monthly files).
FIN_RESULTS_DIR = path.join(DATA_DIR, r'japan-all-stock-financial-results\monthly')

# Folder the combined data is saved to.
OUT_DIR = r'D:\DEV\workspace\stock_trade\data'


In [3]:
import warnings

# NOTE: this installs an "ignore" filter for every warning category, so all
# warnings raised after this cell runs are hidden for the rest of the session.
warnings.simplefilter('ignore')


def input_file(file, li):
    """Read one dated CSV file and append it to ``li`` as a DataFrame.

    The first run of eight digits in the file name (without extension) is
    parsed as the snapshot date and stored in a new ``日時`` column.

    Parameters
    ----------
    file : pathlib.Path
        CSV file to read (Shift-JIS encoded; ``-`` cells are read as NaN).
    li : list
        Accumulator that receives the resulting DataFrame. It is mutated in
        place; nothing is appended when the file name carries no usable date.

    Returns
    -------
    None
    """
    # Resolve the date first so that undated files are skipped without
    # paying for a read. IndexError: no 8-digit run in the name;
    # ValueError: the digits do not form a parseable date.
    try:
        timestamp = pd.Timestamp(re.findall(r'\d{8}', file.stem)[0])
    except (IndexError, ValueError):
        print(file.stem + 'ファイル名に日時がない')
        return

    data_ = pd.read_csv(
        f'file:{file}', encoding='sjis', na_values='-'
    )

    # Column names sometimes carry stray spaces; strip them. This must assign
    # to ``columns`` -- a misspelled attribute name silently does nothing.
    data_.columns = [column.strip() for column in data_.columns]

    li.append(data_.assign(日時=timestamp))


# 株価一覧表のデータを読み込む

In [4]:
# Paths of every CSV file directly inside PRICE_DIR.
file_list = [*pathlib.Path(PRICE_DIR).glob('*.csv')]

# input_file appends one DataFrame per dated file to this list.
price_data = []
for file in tqdm(file_list):
    input_file(file, price_data)
    


  0%|          | 0/268 [00:00<?, ?it/s]

japan-all-stock-pricesファイル名に日時がない


In [5]:
# Stack the per-file frames and snap every timestamp to 00:00:00.
# ``.dt.normalize()`` does this for the whole column at once instead of
# calling a Python lambda for each row.
price_data = pd.concat(price_data).assign(
    日時=lambda frame: pd.to_datetime(frame['日時']).dt.normalize()
)

# 投資指標データを読み込む

In [6]:
# Paths of every CSV file directly inside STOCK_DATA_DIR.
file_list = [*pathlib.Path(STOCK_DATA_DIR).glob('*.csv')]

# input_file appends one DataFrame per dated file to this list.
stock_data = []
for file in tqdm(file_list):
    input_file(file, stock_data)
    


  0%|          | 0/268 [00:00<?, ?it/s]

japan-all-stock-dataファイル名に日時がない


In [7]:
def _parse_numeric_date(value):
    """Turn a numeric date (e.g. 20210119.0) into a midnight Timestamp.

    NaN is returned unchanged so missing dates stay missing.
    """
    if np.isnan(value):
        return value
    return pd.Timestamp(pd.to_datetime(str(int(float(value)))).date())


stock_data = pd.concat(stock_data).assign(
    # Snap every timestamp to 00:00:00 for the whole column at once.
    日時=lambda frame: pd.to_datetime(frame['日時']).dt.normalize(),
    # Date of the low / high: parsed per element so that NaN passes through
    # and malformed values still raise.
    安値日付=lambda frame: frame['安値日付'].map(_parse_numeric_date),
    高値日付=lambda frame: frame['高値日付'].map(_parse_numeric_date),
)

In [8]:
# Display the concatenated investment-indicator frame for a visual check.
stock_data

Unnamed: 0,SC,名称,市場,業種,時価総額（百万円）,発行済株式数,配当利回り（予想）,1株配当（予想）,PER（予想）,PBR（実績）,EPS（予想）,BPS（実績）,最低購入額,単元株,高値日付,年初来高値,安値日付,年初来安値,日時
0,1,日経平均株価（日経225）,東証,株価指数,,,,,,,,,,,NaT,,NaT,,2021-02-03
1,2,TOPIX（東証株価指数）,東証,株価指数,,,,,,,,,,,NaT,,NaT,,2021-02-03
2,1301,極洋,東証一部,水産・農林,33386.0,10928283.0,2.29,70.0,10.95,0.94,279.08,3256.77,305500.0,100.0,2021-01-19,3145.0,2020-03-13,2202.0,2021-02-03
3,1332,日本水産,東証一部,水産・農林,141531.0,312430277.0,1.88,8.5,12.26,0.89,36.96,510.15,45300.0,100.0,2020-01-07,639.0,2020-03-13,398.0,2021-02-03
4,1333,マルハニチロ,東証一部,水産・農林,126113.0,52656910.0,1.67,40.0,19.69,0.92,121.62,2603.98,239500.0,100.0,2020-01-17,2780.0,2020-03-13,1846.0,2021-02-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3817,9993,ヤマザワ,東証一部,小売,19039.0,10960825.0,1.55,27.0,27.05,0.65,64.22,2659.67,173700.0,100.0,2020-10-09,1892.0,2020-03-13,1121.0,2021-02-02
3818,9994,やまや,東証一部,小売,23551.0,10847870.0,2.21,48.0,523.13,0.74,4.15,2917.68,217100.0,100.0,2020-06-23,2450.0,2020-03-17,1821.0,2021-02-02
3819,9995,グローセル,東証一部,卸売,13213.0,26426800.0,2.40,12.0,,0.55,-10.13,915.74,50000.0,100.0,2021-01-27,550.0,2020-04-06,317.0,2021-02-02
3820,9996,サトー商会,JQS,卸売,13720.0,9152640.0,2.00,30.0,31.91,0.57,46.98,2648.72,149900.0,100.0,2020-01-10,1736.0,2020-04-06,1310.0,2021-02-02


# 株価一覧表と投資指標データを結合する

In [9]:
# Left join: every price row is kept, and indicator columns are attached
# where all key columns match.
# NOTE(review): '時価総額（百万円）' is a float column used as a join key;
# presumably included to avoid duplicate _x/_y columns -- confirm the two
# sources always agree on it.
daily_data = pd.merge(
    left=price_data,
    right=stock_data,
    how='left',
    on=['SC', '時価総額（百万円）', '名称', '市場', '業種', '日時'],
)

In [10]:
# Display the merged price + investment-indicator frame.
daily_data

Unnamed: 0,SC,名称,市場,業種,日付,株価,前日比,前日比（％）,前日終値,始値,...,PER（予想）,PBR（実績）,EPS（予想）,BPS（実績）,最低購入額,単元株,高値日付,年初来高値,安値日付,年初来安値
0,1,日経平均株価（日経225）,東証,株価指数,20210203,28646.50,284.33,1.00,28362.17,28482.71,...,,,,,,,NaT,,NaT,
1,2,TOPIX（東証株価指数）,東証,株価指数,20210203,1871.09,24.07,1.30,1847.02,1858.50,...,,,,,,,NaT,,NaT,
2,1301,極洋,東証一部,水産・農林,20210203,3055.00,30.00,0.99,3025.00,3030.00,...,10.95,0.94,279.08,3256.77,305500.0,100.0,2021-01-19,3145.0,2020-03-13,2202.0
3,1332,日本水産,東証一部,水産・農林,20210203,453.00,11.00,2.49,442.00,442.00,...,12.26,0.89,36.96,510.15,45300.0,100.0,2020-01-07,639.0,2020-03-13,398.0
4,1333,マルハニチロ,東証一部,水産・農林,20210203,2395.00,47.00,2.00,2348.00,2351.00,...,19.69,0.92,121.62,2603.98,239500.0,100.0,2020-01-17,2780.0,2020-03-13,1846.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1024557,9993,ヤマザワ,東証一部,小売,20210202,1737.00,-5.00,-0.29,1742.00,1740.00,...,27.05,0.65,64.22,2659.67,173700.0,100.0,2020-10-09,1892.0,2020-03-13,1121.0
1024558,9994,やまや,東証一部,小売,20210202,2171.00,8.00,0.37,2163.00,2169.00,...,523.13,0.74,4.15,2917.68,217100.0,100.0,2020-06-23,2450.0,2020-03-17,1821.0
1024559,9995,グローセル,東証一部,卸売,20210202,500.00,-4.00,-0.79,504.00,502.00,...,,0.55,-10.13,915.74,50000.0,100.0,2021-01-27,550.0,2020-04-06,317.0
1024560,9996,サトー商会,JQS,卸売,20210202,1499.00,-10.00,-0.66,1509.00,1500.00,...,31.91,0.57,46.98,2648.72,149900.0,100.0,2020-01-10,1736.0,2020-04-06,1310.0


# 決算・財務・業績データを読み込む

In [11]:
# Paths of every CSV file directly inside FIN_RESULTS_DIR.
file_list = [*pathlib.Path(FIN_RESULTS_DIR).glob('*.csv')]

# input_file appends one DataFrame per dated file to this list.
financial_data = []
for file in tqdm(file_list):
    input_file(file, financial_data)
    


  0%|          | 0/14 [00:00<?, ?it/s]

japan-all-stock-financial-resultsファイル名に日時がない


In [12]:
financial_data = pd.concat(financial_data).assign(
    # Snap every timestamp to 00:00:00 for the whole column at once.
    日時=lambda frame: pd.to_datetime(frame['日時']).dt.normalize(),
    # Parse the full-year earnings announcement date into a temporary
    # column; NaN entries are passed through unchanged.
    kessan_tmp=lambda frame: frame['決算発表日（本決算）'].map(
        lambda value: value if np.isnan(value)
        else pd.Timestamp(pd.to_datetime(str(int(float(value)))).date())
    ),
)


In [13]:
# Replace the raw announcement-date column with its parsed counterpart.
# Guarded on 'kessan_tmp' so that re-running this cell is a no-op: after the
# first run the parsed column already carries the original name, and an
# unguarded delete would throw that parsed data away.
if 'kessan_tmp' in financial_data.columns:
    del financial_data['決算発表日（本決算）']
    financial_data.rename(columns={'kessan_tmp': '決算発表日（本決算）'}, inplace=True)

# pickleで保存

In [14]:
# Persist both frames as pickle files under OUT_DIR.
pd.to_pickle(daily_data, path.join(OUT_DIR, 'daily_data.pickle'))
pd.to_pickle(financial_data, path.join(OUT_DIR, 'financial_data_all.pickle'))

In [15]:
# Display daily_data once more, now that it has been written to disk.
daily_data

Unnamed: 0,SC,名称,市場,業種,日付,株価,前日比,前日比（％）,前日終値,始値,...,PER（予想）,PBR（実績）,EPS（予想）,BPS（実績）,最低購入額,単元株,高値日付,年初来高値,安値日付,年初来安値
0,1,日経平均株価（日経225）,東証,株価指数,20210203,28646.50,284.33,1.00,28362.17,28482.71,...,,,,,,,NaT,,NaT,
1,2,TOPIX（東証株価指数）,東証,株価指数,20210203,1871.09,24.07,1.30,1847.02,1858.50,...,,,,,,,NaT,,NaT,
2,1301,極洋,東証一部,水産・農林,20210203,3055.00,30.00,0.99,3025.00,3030.00,...,10.95,0.94,279.08,3256.77,305500.0,100.0,2021-01-19,3145.0,2020-03-13,2202.0
3,1332,日本水産,東証一部,水産・農林,20210203,453.00,11.00,2.49,442.00,442.00,...,12.26,0.89,36.96,510.15,45300.0,100.0,2020-01-07,639.0,2020-03-13,398.0
4,1333,マルハニチロ,東証一部,水産・農林,20210203,2395.00,47.00,2.00,2348.00,2351.00,...,19.69,0.92,121.62,2603.98,239500.0,100.0,2020-01-17,2780.0,2020-03-13,1846.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1024557,9993,ヤマザワ,東証一部,小売,20210202,1737.00,-5.00,-0.29,1742.00,1740.00,...,27.05,0.65,64.22,2659.67,173700.0,100.0,2020-10-09,1892.0,2020-03-13,1121.0
1024558,9994,やまや,東証一部,小売,20210202,2171.00,8.00,0.37,2163.00,2169.00,...,523.13,0.74,4.15,2917.68,217100.0,100.0,2020-06-23,2450.0,2020-03-17,1821.0
1024559,9995,グローセル,東証一部,卸売,20210202,500.00,-4.00,-0.79,504.00,502.00,...,,0.55,-10.13,915.74,50000.0,100.0,2021-01-27,550.0,2020-04-06,317.0
1024560,9996,サトー商会,JQS,卸売,20210202,1499.00,-10.00,-0.66,1509.00,1500.00,...,31.91,0.57,46.98,2648.72,149900.0,100.0,2020-01-10,1736.0,2020-04-06,1310.0


In [16]:
# Display the financial-results frame after the announcement-date column swap.
financial_data

Unnamed: 0,SC,名称,決算期,売上高（百万円）,営業利益（百万円）,経常利益（百万円）,当期利益（百万円）,総資産（百万円）,自己資本（百万円）,資本金（百万円）,有利子負債（百万円）,自己資本比率,ROE,ROA,発行済株式数,日時,決算発表日（本決算）
0,1301,極洋,202003.0,262519.0,2918.0,3608.0,2037.0,111184.0,32718.0,5664.0,54718.0,29.4,6.31,1.80,10928283.0,2020-12-07,2020-05-12
1,1332,日本水産,202003.0,690016.0,22834.0,25807.0,14768.0,491533.0,153152.0,30685.0,221238.0,31.2,9.86,3.05,312430277.0,2020-12-07,2020-05-20
2,1333,マルハニチロ,202003.0,905204.0,17079.0,19901.0,12537.0,528063.0,132628.0,20000.0,261714.0,25.1,9.72,2.39,52656910.0,2020-12-07,2020-05-14
3,1352,ホウスイ,202003.0,80492.0,839.0,757.0,454.0,40128.0,6368.0,2485.0,24275.0,15.9,7.29,1.11,8379000.0,2020-12-07,2020-05-15
4,1375,雪国まいたけ,,,,,,,,,,13.9,120.57,11.85,39850000.0,2020-12-07,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3819,9993,ヤマザワ,202002.0,109709.0,627.0,698.0,-220.0,50284.0,28147.0,2388.0,5116.0,56.0,-0.77,-0.44,10960825.0,2021-01-04,2020-04-10
3820,9994,やまや,202003.0,168168.0,4163.0,4227.0,205.0,63320.0,31925.0,3247.0,6361.0,50.4,0.64,0.30,10847870.0,2021-01-04,2020-06-24
3821,9995,グローセル,202003.0,68664.0,-61.0,5.0,65.0,32061.0,22550.0,5042.0,1935.0,70.3,0.28,0.20,26426800.0,2021-01-04,2020-05-14
3822,9996,サトー商会,202003.0,49562.0,1437.0,1659.0,1037.0,32271.0,23710.0,1405.0,683.0,73.5,4.44,3.12,9152640.0,2021-01-04,2020-05-13
