## 3_板块分类

1. DataApi(连接数据源)
2. query(获取需要的数据)
3. classify(数据格式的转换)
4. append_df(添加数据保存)

## 1_连接数据源

In [1]:
import os

from jaqs.data.dataapi import DataApi
from jaqs.data import DataView

# Connect to the remote data server and authenticate.
# Credentials are read from the environment so that the account phone number
# and API token are never stored in (or leaked through) the notebook file:
#   JAQS_USERNAME - account phone number
#   JAQS_TOKEN    - API token issued for that account
# A missing variable raises KeyError naming the variable that must be set.
api = DataApi(addr='tcp://data.tushare.org:8910')
api.login(os.environ["JAQS_USERNAME"], os.environ["JAQS_TOKEN"])

('username: 13662241013', '0,')

## 2_获取需要的数据

### step1_指数成分

In [2]:
# Sample window as YYYYMMDD integers; the trade-calendar query further down
# reuses both bounds.
start = 20120104
end = 20171222

# Request the constituent records of index 000300.SH for that window.
# The call is unpacked into a result object and an accompanying message.
df_index, msg_index = api.query(
    view="lb.indexCons",
    fields="",
    filter="index_code={}&start_date={}&end_date={}".format('000300.SH', start, end),
    data_format='pandas',
)

### step2_分类信息

In [3]:
# Ask for the industry classification of every symbol returned by the
# constituent query, restricted to the provider named in `industry_src`.
symbol_list = ",".join(df_index.symbol)
df_classify, msg_classify = api.query(
    view="lb.secIndustry",
    fields="",
    filter="industry_src=中证指数有限公司&symbol=%s" % symbol_list,
    data_format='pandas',
)

In [4]:
# List each distinct level-1 industry (code, name) pair present in the result.
industry_pairs = df_classify.loc[:, ['industry1_code', 'industry1_name']].drop_duplicates()
print(industry_pairs)

    industry1_code    industry1_name
0                J               金融业
1                K              房地产业
2                S                综合
3                C               制造业
6                D  电力、热力、燃气及水生产和供应业
11               L          租赁和商务服务业
13               N     水利、环境和公共设施管理业
15               R         文化、体育和娱乐业
39               B               采矿业
68               I   信息传输、软件和信息技术服务业
82               E               建筑业
83               F            批发和零售业
93               G       交通运输、仓储和邮政业
121              A          农、林、牧、渔业
227              H            住宿和餐饮业
361              Q           卫生和社会工作


## 3_数据格式的转换

### step1_改成英文名

In [5]:
# Columns needed to build the per-symbol industry series.
columns = ["in_date", "industry1_code", "out_date", "symbol"]

# Work on an explicit copy: assigning into a bare column selection of
# df_classify triggers pandas' SettingWithCopyWarning and is not guaranteed
# to update the object you intended.
classify = df_classify[columns].copy()

# Industry code letter -> English label. Codes that are absent from this
# mapping are left unchanged by Series.replace.
# NOTE(review): 'Commerical' is kept exactly as originally spelled because the
# label is written into the saved dataview; confirm before correcting it.
classify_dict = {'A':"Agriculture", 'B':'Mining', 'C':'Manufacturing', 'D':'Energy', 'E': 'Construction','F':'Wholesale',
            'G':'Transportation','H':'Accommodation_Restaurants','I': 'Information_Technology', 'J':'Finance',
            'K':'Real_Estate','L':'Leasing_and_Commerical_Service','M':'Science_Technology','N':'Public_Facilities_Management',
            'P':'Education','Q':'Health_And_Social_Work','R':'Culture_Sports_Entertainment','S':'Synthesise_Industry'}

# Bracket assignment on the copy replaces the column without the warning.
classify["industry1_code"] = classify["industry1_code"].replace(classify_dict)
print(classify.head())

    in_date       industry1_code out_date     symbol
0  20130219              Finance           000001.SZ
1  20130219          Real_Estate           000002.SZ
2  20130219  Synthesise_Industry           000009.SZ
3  20130219        Manufacturing           000012.SZ
4  20130219        Manufacturing           000021.SZ


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### step2_获取交易时间

In [6]:
import pandas as pd
from datetime import datetime

# Fetch the exchange calendar for the sample window.
trade_date, msg_date = api.query(
    view="jz.secTradeCal",
    fields="trade_date,istradeday",
    filter="start_date={}&end_date={}".format(start, end),
    data_format='pandas',
)

# Keep only the dates whose istradeday flag is "T", then parse each YYYYMMDD
# value into a datetime to form the index used by the following cells.
trade_date = trade_date.loc[trade_date["istradeday"] == "T", "trade_date"]
date_index = pd.Index([datetime.strptime(str(day), "%Y%m%d") for day in trade_date])

In [7]:
# Bare last expression: the cell displays the trading-day index for inspection.
date_index

DatetimeIndex(['2012-01-04', '2012-01-05', '2012-01-06', '2012-01-09',
               '2012-01-10', '2012-01-11', '2012-01-12', '2012-01-13',
               '2012-01-16', '2012-01-17',
               ...
               '2017-12-11', '2017-12-12', '2017-12-13', '2017-12-14',
               '2017-12-15', '2017-12-18', '2017-12-19', '2017-12-20',
               '2017-12-21', '2017-12-22'],
              dtype='datetime64[ns]', length=1453, freq=None)

### step3_判断outdate

In [8]:
def convert(time):
    """Parse a ``YYYYMMDD`` string into a ``datetime``.

    Any error raised by ``datetime.strptime`` for unparsable input is
    propagated to the caller unchanged.
    """
    date_format = "%Y%m%d"
    parsed = datetime.strptime(time, date_format)
    return parsed

def classify_df(select, index):
    """Build the industry series of one classification record.

    Parameters
    ----------
    select : object exposing ``symbol``, ``industry1_code`` and ``out_date``
        One classification record (for example a row yielded by
        ``DataFrame.iterrows``).
    index : index-like supporting ``index < datetime`` and boolean masking
        Candidate dates for the series.

    Returns
    -------
    tuple
        ``(select.symbol, series)`` where ``series`` holds
        ``select.industry1_code`` for every date of ``index`` strictly before
        ``out_date``. When ``out_date`` cannot be parsed as ``YYYYMMDD``
        (empty string, missing value, non-string), the whole ``index`` is used.
    """
    try:
        out_date = convert(select.out_date)
    except (TypeError, ValueError):
        # Unparsable out_date means "no exit date": keep every date.
        # Only parsing failures are caught; the previous bare ``except`` plus
        # ``return`` inside ``finally`` silently discarded every other error.
        idx = index
    else:
        idx = index[index < out_date]
    return select.symbol, pd.Series(select.industry1_code, idx)

### step4_生成新的DataFrame

In [9]:
# Each row of `classify` yields a (symbol, industry Series) pair; the pairs
# are collected into a mapping that becomes one column per symbol.
# If a symbol occurs in several rows, the pair from the last such row wins.
series_by_symbol = dict(classify_df(row, date_index) for _, row in classify.iterrows())
classify_group = pd.DataFrame(series_by_symbol)

In [10]:
# Name both axes of the frame.
classify_group.columns.name = "symbol"
classify_group.index.name = "trade_date"

# Turn every datetime row label into a YYYYMMDD integer.
# DataFrame.rename is the supported way to transform labels with a function;
# passing a function to rename_axis for this purpose was deprecated and raises
# ValueError in current pandas. rename keeps the index name set above.
group = classify_group.rename(index=lambda s: s.year*10000 + s.month*100 + s.day)

In [11]:
# Preview the first five rows of the integer-dated industry table.
print(group.head(5))

symbol     000001.SZ    000002.SZ                  000008.SZ  \
trade_date                                                     
20120104     Finance  Real_Estate  Accommodation_Restaurants   
20120105     Finance  Real_Estate  Accommodation_Restaurants   
20120106     Finance  Real_Estate  Accommodation_Restaurants   
20120109     Finance  Real_Estate  Accommodation_Restaurants   
20120110     Finance  Real_Estate  Accommodation_Restaurants   

symbol                000009.SZ      000012.SZ      000021.SZ    000024.SZ  \
trade_date                                                                   
20120104    Synthesise_Industry  Manufacturing  Manufacturing  Real_Estate   
20120105    Synthesise_Industry  Manufacturing  Manufacturing  Real_Estate   
20120106    Synthesise_Industry  Manufacturing  Manufacturing  Real_Estate   
20120109    Synthesise_Industry  Manufacturing  Manufacturing  Real_Estate   
20120110    Synthesise_Industry  Manufacturing  Manufacturing  Real_Estate   

symb

## 4_添加数据保存

In [12]:
# Load the dataview stored in `dataview_folder` and attach the industry table
# to it under the name 'group'.
dataview_folder = 'JAQS_Data/hs300'
dv = DataView()
dv.load_dataview(dataview_folder)
dv.append_df(group, 'group')

Dataview loaded successfully.


In [13]:
# Write the augmented dataview back to the folder it was loaded from.
# Reusing dataview_folder (same value as the former literal) keeps the load
# and save locations from drifting apart.
dv.save_dataview(dataview_folder)


Store data...
Dataview has been successfully saved to:
C:\Users\small\OneDrive\notebook\Internet_Course\JAQS\JAQS_Data\hs300

You can load it with load_dataview('C:\Users\small\OneDrive\notebook\Internet_Course\JAQS\JAQS_Data\hs300')
