In [1]:
import os
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

warnings.filterwarnings('ignore')

import tushare as ts
# Create the tushare pro API client.
# The token is read from the TUSHARE_TOKEN environment variable so that the
# credential is never stored in (or shared with) the notebook itself.
# NOTE(review): a token was previously hardcoded on this line; it should be
# treated as leaked and rotated.
_tushare_token = os.environ.get('TUSHARE_TOKEN')
if not _tushare_token:
    raise RuntimeError(
        'TUSHARE_TOKEN environment variable is not set; '
        'export your tushare API token before running this notebook.'
    )
pro = ts.pro_api(_tushare_token)

In [2]:
# Load the daily stock quotes.
data = pd.read_feather('../../data/stk_daily.feather')
# Drop Beijing Stock Exchange listings (stock codes ending in 'BJ').
# The vectorized .str accessor replaces a per-row Python lambda; na=False
# treats a missing stk_id as "does not end with BJ" instead of raising.
data = data[~data['stk_id'].str.endswith('BJ', na=False)]
data

Unnamed: 0,stk_id,date,open,high,low,close,volume,amount,cumadj
0,000001.SZ,2020-01-02,16.65,16.95,16.55,16.87,153023000.0,2.571200e+09,98.0986
1,000001.SZ,2020-01-03,16.94,17.31,16.92,17.18,111619000.0,1.914500e+09,98.0986
2,000001.SZ,2020-01-06,17.01,17.34,16.91,17.07,86208400.0,1.477930e+09,98.0986
3,000001.SZ,2020-01-07,17.13,17.28,16.95,17.15,72860800.0,1.247050e+09,98.0986
4,000001.SZ,2020-01-08,17.00,17.05,16.63,16.66,84782400.0,1.423610e+09,98.0986
...,...,...,...,...,...,...,...,...,...
3168349,689009.SH,2022-12-26,32.55,32.98,32.03,32.25,2579640.0,8.356230e+07,1.0000
3168350,689009.SH,2022-12-27,32.75,32.88,32.11,32.79,2272600.0,7.412430e+07,1.0000
3168351,689009.SH,2022-12-28,32.81,32.90,32.05,32.16,1846270.0,5.959620e+07,1.0000
3168352,689009.SH,2022-12-29,32.28,32.28,30.78,30.84,4140600.0,1.300030e+08,1.0000


In [3]:
# Split the quotes into one (stk_id, sub-frame) pair per stock.
# Note: this rebinds `data` from a DataFrame to a list of tuples.
data = [(stk_id, quotes) for stk_id, quotes in data.groupby('stk_id')]

In [4]:
# Assemble the daily closing prices into a wide DataFrame:
# index = date, columns = stock code.
close_by_stock = [
    quotes.set_index('date')['close'].rename(stk_id)
    for stk_id, quotes in data
]
close = pd.concat(close_by_stock, axis = 1)
close

Unnamed: 0_level_0,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,688787.SH,688788.SH,688789.SH,688793.SH,688798.SH,688799.SH,688800.SH,688819.SH,688981.SH,689009.SH
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,16.87,32.56,22.33,3.14,5.40,9.55,3.71,6.66,3.41,9.51,...,,,,,,,,,,
2020-01-03,17.18,32.05,22.35,3.15,5.48,9.51,3.75,6.40,3.41,9.49,...,,,,,,,,,,
2020-01-06,17.07,31.51,21.75,3.15,5.46,9.48,3.73,7.04,3.43,9.39,...,,,,,,,,,,
2020-01-07,17.15,31.76,22.10,3.20,5.49,9.51,3.79,7.16,3.54,9.55,...,,,,,,,,,,
2020-01-08,16.66,31.68,21.78,3.15,5.39,9.40,3.72,7.11,3.51,9.29,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,12.77,18.37,9.46,1.84,6.04,7.87,2.32,12.62,3.84,11.68,...,60.73,36.28,166.58,51.00,95.70,32.75,108.70,36.18,41.43,32.25
2022-12-27,13.11,18.51,9.70,1.89,6.64,7.84,2.34,12.26,3.86,11.93,...,60.01,36.65,165.91,52.49,98.09,31.93,111.00,36.76,41.48,32.79
2022-12-28,13.14,18.58,9.55,1.86,6.81,7.84,2.31,12.05,3.76,12.06,...,58.74,35.77,162.92,51.90,96.61,31.63,107.08,36.68,40.90,32.16
2022-12-29,13.03,18.16,9.56,1.85,6.13,7.78,2.29,11.98,3.62,11.47,...,59.02,35.99,162.89,49.77,96.65,31.94,113.18,36.52,40.89,30.84


In [5]:
# Download the CSI 300 (399300.SZ) constituent weights from the tushare API.
# The 2020-2022 window is requested in sub-periods
# (presumably to stay under a per-request row limit of the API - TODO confirm).
HS300_INDEX_CODE = '399300.SZ'
weight_periods = [
    ('20200101', '20200731'),
    ('20200801', '20201231'),
    ('20210101', '20210630'),
    ('20210701', '20211231'),
    ('20220101', '20220630'),
    ('20220701', '20221231'),
]
# Each response is reversed row-wise with .iloc[::-1] before concatenation
# (the displayed result starts at trade_date 20200102, so this appears to put
# the rows in ascending date order).
weight_chunks = [
    pro.index_weight(index_code=HS300_INDEX_CODE, start_date=start, end_date=end).iloc[::-1]
    for start, end in weight_periods
]
df = pd.concat(weight_chunks, axis = 0).set_index('index_code')
df

Unnamed: 0_level_0,con_code,trade_date,weight
index_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
399300.SZ,600299.SH,20200102,0.0236
399300.SZ,601298.SH,20200102,0.0241
399300.SZ,600928.SH,20200102,0.0270
399300.SZ,601162.SH,20200102,0.0270
399300.SZ,002958.SZ,20200102,0.0283
...,...,...,...
399300.SZ,000858.SZ,20221230,1.8800
399300.SZ,600036.SH,20221230,2.4720
399300.SZ,601318.SH,20221230,2.7300
399300.SZ,300750.SZ,20221230,3.0910


In [6]:
# Split the constituent weights into one (trade_date, sub-frame) pair per date.
# Note: this rebinds `df` from a DataFrame to a list of tuples.
df = [(trade_date, weights) for trade_date, weights in df.groupby('trade_date')]

In [7]:
# Build a boolean membership table: index = snapshot date, columns = stock code.
# A cell is True when that stock code appears in the CSI 300 constituent list
# of that snapshot date.
membership_by_date = [
    pd.Series(close.columns.isin(weights['con_code'].values),
              index = close.columns,
              name = trade_date)
    for trade_date, weights in df
]
hs300 = pd.concat(membership_by_date, axis = 1).T
# The group keys become the row labels; convert them to datetimes.
hs300.index = pd.to_datetime(hs300.index)
hs300

Unnamed: 0,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,688787.SH,688788.SH,688789.SH,688793.SH,688798.SH,688799.SH,688800.SH,688819.SH,688981.SH,689009.SH
2020-01-02,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-01-23,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-02-03,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-02-28,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-03-02,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-31,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2022-11-01,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2022-11-30,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2022-12-01,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [8]:
# Check when the CSI 300 membership changes: compare every snapshot row with
# the one before it and keep the dates where at least one column differs.
earlier_rows = hs300.iloc[:-1].values
later_rows = hs300.iloc[1:].values
idx = list((earlier_rows != later_rows).any(axis = 1))
hs300.index[1:][idx]

DatetimeIndex(['2020-06-30', '2020-12-31', '2021-06-30', '2021-12-31',
               '2022-06-30', '2022-12-30'],
              dtype='datetime64[ns]', freq=None)

结论：从上面的输出可以看出，沪深300成分股每半年调整一次（变动出现在每年6月末和12月末的权重数据中）。

In [9]:
# Convert the membership table to daily frequency.
# Step 1: upsample to business-day frequency; days without a snapshot become NaN.
business_daily = hs300.resample('B').first()
# Step 2: carry the most recent snapshot forward over the gaps.
business_daily = business_daily.ffill()
# Step 3: keep exactly the dates present in the close-price table.
# (In the displayed result the values appear as 1.0 / 0.0 rather than True / False.)
hs300 = business_daily.loc[close.index]
hs300

Unnamed: 0_level_0,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,688787.SH,688788.SH,688789.SH,688793.SH,688798.SH,688799.SH,688800.SH,688819.SH,688981.SH,689009.SH
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-06,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-07,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-08,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2022-12-27,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2022-12-28,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2022-12-29,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Save the CSI 300 membership table as 'hs300.csv' in the current directory.
# NOTE(review): the original comment mentioned a ./newdata/ folder, but the
# path used here is './hs300.csv' - confirm the intended location.
# index_label=False writes the header row without a label for the index column.
hs300.to_csv(path_or_buf = './hs300.csv', index_label = False)