In [1]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'interval'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df    

def build_query(start_date=None, end_date=None, index_id=None):
    query = {}
    def parse_date(x):
        if type(x) == int:
            return x
        elif type(x) == str:
            if len(x) != 8:
                raise Exception("date must be YYYYMMDD format")
            return int(x)
        elif type(x) == datetime.datetime or type(x) == datetime.date:
            return x.strftime("%Y%m%d").astype(int)
        else:
            raise Exception("invalid date type: " + str(type(x)))
    if start_date is not None or end_date is not None:
        query['date'] = {}
        if start_date is not None:
            query['date']['$gte'] = parse_date(start_date)
        if end_date is not None:
            query['date']['$lte'] = parse_date(end_date)
    def parse_symbol(x):
        if type(x) == int:
            return x
        else:
            return int(x)
    if index_id:
        if type(index_id) == list or type(index_id) == tuple:
            query['index_id'] = {'$in': [parse_symbol(x) for x in index_id]}
        else:
            query['index_id'] = parse_symbol(index_id)
    return query

def write_memb_data(db, name, df):
    collection = db[name]
    df1 = []
    for symbol in df['index_id'].unique():
        if symbol in collection.distinct('index_id'):
            symbol = int(symbol)
            m_ax = pd.DataFrame.from_records(collection.find({'index_id':{'$in':[symbol]}}).sort([('date',-1)]).skip(0).limit(1))['date'].values[0]
            for date in df['date'].unique():
                if date <= m_ax:
                    query = build_query(int(date), int(date), symbol)
                    print(query)
                    collection.delete_many(query)
            df2 = df[(df['index_id'] == symbol)]
            print(df2)
            df1 += [df2]
        else:
            print(symbol)
            df2 = df[(df['index_id'] == symbol)]
            print(df2)
            df1 += [df2]
    df1 = pd.concat(df1).reset_index(drop=True)
    df1 = df1.to_dict('records')
    collection.insert_many(df1) 

def delete_memb_data(db, name, start_date=None, end_date=None, index_id=None):
    collection = db[name]
    query = build_query(start_date, end_date, index_id)
    if not query:
        print('cannot delete the whole table')
        return None
    collection.delete_many(query)  

def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df    

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

In [6]:
li_st = read_memb_daily(db1, 'index_memb', 20201119, 20201119)
li_st = li_st[~li_st['index_id'].isin([1000300, 1000852, 1000905, 1000985])]['index_id'].unique()
delete_memb_data(db1, 'index_memb', 20201113, 20201119, index_id=list(li_st))

In [147]:
read_memb_daily(db1, 'index_memb', 20200801, 20200831, skey=[1600519])['index_name'].unique()

array(['IF', 'AMAC 饮料'], dtype=object)

In [143]:
read_stock_daily(db1, 'mdbar1d_tr', 20201112, 20201113, skey=[1000016])

Unnamed: 0,skey,date,time,name,trade_status,listed_days,open,high,low,close,closeL1,yclose,ztClose,dtClose,dayReturn,volume,amount,buy_volume,sell_volume,TORate,allZT,hasZT,isZT,allDT,hasDT,isDT,isST,VWAP,TWAP,marketValue,marketShares,totalShares,SW1_name,SW1_code,SW2_code,SW3_code,index_name,index_weight,exchange
0,1000016,20201112,180000000000,上证50,1,4097,3413.2074,3420.6475,3387.33,3397.3544,3411.2295,3411.2295,3752.35,3070.11,-0.004067,2709848200,63549020000.0,5920467.0,21178015.0,0.0,0,0,0,0,0,0,0,2345.1136,3397.868545,0.0,0.0,0.0,0,0,0,0,,0.0,SSE
1,1000016,20201113,180000000000,上证50,1,4098,3376.9114,3377.1596,3318.4516,3338.3101,3397.3544,3397.3544,3737.09,3057.62,-0.017379,3451369600,73716430000.0,0.0,34513696.0,0.0,0,0,0,0,0,0,0,2135.860271,3333.627404,0.0,0.0,0.0,0,0,0,0,,0.0,SSE


## 1. IC, IF, CSI1000, CSIRest stocks

In [2]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import TSLPy3

startDate = '20201217'
endDate = '20201217'
def download_index(startDate, endDate, indexCode):
    tsstr = """
               indexTicker:= '{}';
               BegT:= {};
               EndT:= {} + 0.99;
               dateArr:=MarketTradeDayQk(BegT,EndT);
               r:=array();
               for nI:=0 to length(dateArr)-1 do
               begin
                 GetBKWeightByDate(indexTicker,dateArr[nI],t);
                 t := t[:,array("截止日","代码","比例(%)")]; 
                 r:=r union t;
               end;
               return r;  
            """.format(indexCode, startDate + 'T', endDate + 'T')
    weight_table = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,[],{})[1])
    weight_table.columns=['date','weight','ID']
    weight_table['date'] = pd.to_datetime(weight_table.date.astype(str))
    return weight_table
IF_weight = download_index(startDate, endDate, 'SH000300')
IC_weight = download_index(startDate, endDate, 'SH000905')
CSI1000_weight = download_index(startDate, endDate, 'SH000852')

In [7]:
IF_weight[IF_weight['ID'] == b'SZ000709']

Unnamed: 0,date,weight,ID
279,2020-12-17,0.052205,b'SZ000709'


In [5]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import TSLPy3

startDate = '20201106'
endDate = '20201112'
def download_index(startDate, endDate, indexCode):
    tsstr = """
               indexTicker:= '{}';
               BegT:= {};
               EndT:= {} + 0.99;
               dateArr:=MarketTradeDayQk(BegT,EndT);
               r:=array();
               for nI:=0 to length(dateArr)-1 do
               begin
                 GetBKWeightByDate(indexTicker,dateArr[nI],t);
                 t := t[:,array("截止日","代码","比例(%)")]; 
                 r:=r union t;
               end;
               return r;  
            """.format(indexCode, startDate + 'T', endDate + 'T')
    weight_table = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,[],{})[1])
    weight_table.columns=['date','weight','ID']
    weight_table['ID'] = weight_table['ID'].str.decode('GBK')
    weight_table['date'] = pd.to_datetime(weight_table.date.astype(str))
    return weight_table
weight_table = download_index(startDate, endDate, 'SH000985')

CSIRest_weight = []
for day in weight_table.date.unique():
    IC_stock = list(IC_weight[IC_weight.date == day].ID.unique())
    IF_stock = list(IF_weight[IF_weight.date == day].ID.unique())
    CSI1000_stock = list(CSI1000_weight[CSI1000_weight.date == day].ID.unique())
    ex_stock = list(set(IC_stock + IF_stock + CSI1000_stock))
    assert len(ex_stock) == 1800
    CSIRest_weight_day = weight_table[(weight_table.date == day) & (~weight_table.ID.isin(ex_stock))]
    CSIRest_weight += [CSIRest_weight_day]
CSIRest_weight = pd.concat(CSIRest_weight).reset_index(drop=True)
sumWeightToday = CSIRest_weight.groupby('date')['weight'].sum().reset_index()
sumWeightToday.rename(columns = {'weight':'sumWeightDay'}, inplace = True)
weight_table = CSIRest_weight.merge(sumWeightToday, on = 'date', how = 'left')
weight_table['weight'] = weight_table['weight'] / weight_table['sumWeightDay'] * 100
weight_table = weight_table.drop(columns = {'sumWeightDay'})

In [6]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import TSLPy3
# IF_weight = pd.read_pickle(r'G:\IF_weight.pkl')
# IC_weight = pd.read_pickle(r'G:\IC_weight.pkl')
# CSI1000_weight = pd.read_pickle(r'G:\CSI1000_weight.pkl')
# weight_table = pd.read_pickle(r'G:\CSIRest_weight.pkl')

IF_weight['date'] = IF_weight['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
IF_weight['ID'] = np.where(IF_weight['ID'].str[:2] =='SZ', IF_weight['ID'].str[2:].astype(int) + 2000000, IF_weight['ID'].str[2:].astype(int) + 1000000)
IF_weight = IF_weight.rename(columns={'ID':'skey'})
IF_weight['index_id'] = 1000300
IF_weight['index_name'] = 'IF'
IF_weight = IF_weight.sort_values(by=['date', 'skey']).reset_index(drop=True)
k = IF_weight.groupby('date')['weight'].sum().reset_index()
assert(k[k['weight'] - 100 > 0.02].shape[0] == 0)
write_memb_data(db1, 'index_memb', IF_weight)

IC_weight['date'] = IC_weight['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
IC_weight['ID'] = np.where(IC_weight['ID'].str[:2] =='SZ', IC_weight['ID'].str[2:].astype(int) + 2000000, IC_weight['ID'].str[2:].astype(int) + 1000000)
IC_weight = IC_weight.rename(columns={'ID':'skey'})
IC_weight['index_id'] = 1000905
IC_weight['index_name'] = 'IC'
IC_weight = IC_weight.sort_values(by=['date', 'skey']).reset_index(drop=True)
k = IC_weight.groupby('date')['weight'].sum().reset_index()
assert(k[k['weight'] - 100 > 0.02].shape[0] == 0)
write_memb_data(db1, 'index_memb', IC_weight)

CSI1000_weight['date'] = CSI1000_weight['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
CSI1000_weight['ID'] = np.where(CSI1000_weight['ID'].str[:2] =='SZ', CSI1000_weight['ID'].str[2:].astype(int) + 2000000, CSI1000_weight['ID'].str[2:].astype(int) + 1000000)
CSI1000_weight = CSI1000_weight.rename(columns={'ID':'skey'})
CSI1000_weight['index_id'] = 1000852
CSI1000_weight['index_name'] = 'CSI1000'
CSI1000_weight = CSI1000_weight.sort_values(by=['date', 'skey']).reset_index(drop=True)
k = CSI1000_weight.groupby('date')['weight'].sum().reset_index()
assert(k[k['weight'] - 100 > 0.02].shape[0] == 0)
write_memb_data(db1, 'index_memb', CSI1000_weight)

weight_table['date'] = weight_table['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
weight_table['ID'] = np.where(weight_table['ID'].str[:2] =='SZ', weight_table['ID'].str[2:].astype(int) + 2000000, weight_table['ID'].str[2:].astype(int) + 1000000)
weight_table = weight_table.rename(columns={'ID':'skey'})
weight_table['index_id'] = 1000985
weight_table['index_name'] = 'CSIRest'
weight_table = weight_table.sort_values(by=['date', 'skey']).reset_index(drop=True)
k = weight_table.groupby('date')['weight'].sum().reset_index()
assert(k[k['weight'] - 100 > 0.02].shape[0] == 0)
write_memb_data(db1, 'index_memb', weight_table)

          date    weight     skey  index_id index_name
0     20201106  0.632398  1600000   1000300         IF
1     20201106  0.077341  1600004   1000300         IF
2     20201106  0.369904  1600009   1000300         IF
3     20201106  0.150251  1600010   1000300         IF
4     20201106  0.129552  1600011   1000300         IF
...        ...       ...      ...       ...        ...
1495  20201112  0.291354  2300413   1000300         IF
1496  20201112  0.247766  2300433   1000300         IF
1497  20201112  0.482410  2300498   1000300         IF
1498  20201112  0.312795  2300601   1000300         IF
1499  20201112  0.106436  2300628   1000300         IF

[1500 rows x 5 columns]
          date    weight     skey  index_id index_name
0     20201106  0.107086  1600006   1000905         IC
1     20201106  0.252643  1600008   1000905         IC
2     20201106  0.096796  1600017   1000905         IC
3     20201106  0.178000  1600021   1000905         IC
4     20201106  0.143801  1600022   1000

In [14]:
kk = read_memb_daily(db1, 'index_memb', 20200918, 20200921, index_id=[1000300, 3011046])
kk

Unnamed: 0,date,weight,skey,index_id,index_name
0,20200918,0.692216,1600000,1000300,IF
1,20200918,0.090169,1600004,1000300,IF
2,20200918,0.428193,1600009,1000300,IF
3,20200918,0.159185,1600010,1000300,IF
4,20200918,0.150818,1600011,1000300,IF
...,...,...,...,...,...
831,20200921,0.000000,2300023,3011046,AMAC 金融
832,20200921,0.640000,2300033,3011046,AMAC 金融
833,20200921,3.200000,2300059,3011046,AMAC 金融
834,20200921,0.040000,2300309,3011046,AMAC 金融


In [43]:
startDate = '20200813'
endDate = '20200814'

import glob
import os
readPath = r'F:\Download\StockFactors'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'FactorData***.csv')))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
total_stock = []
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath, encoding='GBK')
    total_stock += [data]
total_stock = pd.concat(total_stock, sort=False)
total_stock['skey'] = np.where(total_stock['Symbol'].str[:2] == 'SH', total_stock['Symbol'].str[2:].astype(int) + 1000000, 
                              total_stock['Symbol'].str[2:].astype(int) + 2000000)
total_stock

Unnamed: 0,Date,Symbol,Name,ListDays,SWIndustry1,Totalshares,Marketshares,TotalValue,MarketValue,close,open,ret,volume,amt,trade_stats,margin_balance,margin_ratio,lgt_holding,lgt_ratio,roe,roa,sale_margin,g_net_profit,g_sales,log_size,pe_ttm,pb_ttm,ps_ttm,pcf_ttm,st_mom,mt_mom,lt_mom,st_liq,mt_liq,lt_liq,res_vol_300,res_vol_500,st_vol,mt_vol,lt_vol,beta_300,beta_500,current_ratio,quick_ratio,asset_lib_ratio,skey
0,20200813,SH600000,浦发银行,4913,申万银行,2.935211e+10,2.810379e+10,3.084907e+07,2.953709e+07,10.51,10.60,-0.473485,25798477,2.716254e+08,1,<NIL>,0,<NIL>,0,10.394144,0.836164,30.512478,5.480284,10.662088,17.201157,5.157586,0.536087,1.573707,-1.907997,1.057692,-1.051497,7.074708,0.160823,0.207229,0.177866,0.965671,1.190233,1.406097,1.685491,1.507056,0.774898,0.504654,0.000000,0.000000,92.016972,1600000
1,20200813,SH600004,白云机场,4154,申万交通运输,2.069321e+09,2.069321e+09,2.986030e+06,2.986030e+06,14.43,14.85,-2.434077,33592114,4.897567e+08,1,<NIL>,0,<NIL>,0,4.186098,2.713447,9.663436,-125.879905,-34.577881,14.909455,43.064485,1.802721,4.161509,9.282456,3.797543,-3.168272,-3.934245,1.431626,1.294118,1.192130,2.135742,2.259992,1.768164,2.219221,2.810349,1.038641,0.765897,0.376225,0.370041,35.833969,1600004
2,20200813,SH600006,东风汽车,4996,申万汽车,2.000000e+09,2.000000e+09,9.880000e+05,9.880000e+05,4.94,4.88,1.022495,66689546,3.316491e+08,1,<NIL>,0,<NIL>,0,4.047998,1.652536,2.395763,-123.827945,-33.716895,13.803438,33.072978,1.338794,0.792350,-66.440711,14.087760,18.181818,25.882889,3.632977,1.781493,1.257318,1.831950,1.708661,2.352114,1.965526,2.403914,0.846309,0.841528,1.413754,1.185651,55.820917,1600006
3,20200813,SH600007,中国国贸,5141,申万房地产,1.007283e+09,1.007283e+09,1.359831e+06,1.359831e+06,13.50,13.37,1.047904,2679002,3.617269e+07,1,<NIL>,0,<NIL>,0,11.979316,7.836050,27.355766,-16.778890,-14.689142,14.122871,14.601240,1.749129,3.994281,9.325639,1.809955,-0.735294,3.377443,0.387054,0.344011,0.339282,1.468679,1.583946,1.880130,1.839811,2.023222,0.858458,0.639685,1.305017,1.283430,35.016117,1600007
4,20200813,SH600008,首创股份,4833,申万公用事业,5.685448e+09,5.685448e+09,1.944423e+06,1.944423e+06,3.42,3.31,3.636364,122515052,4.209226e+08,1,<NIL>,0,<NIL>,0,4.481721,1.176583,6.242299,1.757424,19.545533,14.480476,20.245457,0.907345,1.263782,5.050677,3.951368,3.636364,7.201835,0.827007,0.955513,0.664849,1.098049,1.042690,2.077455,1.718766,1.627673,0.792802,0.718293,0.832597,0.813347,65.962243,1600008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,20200813,SZ300855,图南股份,16,申万有色金属,2.000000e+08,5.000000e+07,1.084000e+06,2.710000e+05,54.20,59.10,-5.640669,25147989,1.462970e+09,1,<NIL>,0,<NIL>,0,20.504617,16.568147,20.933378,3.047359,4.882131,12.509874,105.914418,21.717346,22.171465,139.854580,122.587269,415.699334,415.699334,27.529657,8.931271,8.931271,3.958023,3.951541,3.959501,3.959501,3.959501,0.070169,-0.132437,3.213426,1.235385,18.373866,2300855
3916,20200813,SZ300856,科思股份,17,申万化工,1.128800e+08,2.822000e+07,9.205364e+05,2.301341e+05,81.55,85.56,-5.798776,5915878,4.960089e+08,1,<NIL>,0,<NIL>,0,28.197609,16.089359,13.968061,77.164600,13.225587,12.346417,59.892399,16.888224,8.365807,58.423679,4.591510,166.852094,166.852094,24.779804,22.094196,22.094196,7.256647,7.246757,7.256886,7.256886,7.256886,0.039577,-0.209382,1.108400,0.489765,46.608919,2300856
3917,20200813,SZ300857,协创数据,14,申万电子,2.065578e+08,5.163945e+07,9.547101e+05,2.386775e+05,46.22,46.22,9.995240,989455,4.573261e+07,1,<NIL>,0,<NIL>,0,18.348987,10.168458,7.236839,321.240197,81.507138,12.382869,76.607229,14.056651,5.543942,128.655738,159.371493,396.989247,396.989247,1.370195,0.598111,0.598111,0.008686,0.008532,0.009087,0.009087,0.009087,-0.002603,-0.002314,2.258530,1.278173,45.054449,2300857
3918,20200813,SZ300858,科拓生物,14,申万食品饮料,8.251769e+07,2.063000e+07,7.033808e+05,1.758501e+05,85.24,87.00,-1.125159,6022680,5.250647e+08,1,<NIL>,0,<NIL>,0,22.989126,20.499869,30.278606,16.167434,13.180310,12.077387,73.744151,16.953136,22.328701,76.436641,87.670630,259.662447,259.662447,40.539337,14.676885,14.676885,5.373346,5.429348,5.631420,5.631420,5.631420,1.643208,1.105968,4.995882,4.441833,10.695181,2300858


In [45]:
set(total_stock['skey'].unique()) - set(test[test['index_id'] > 3000000]['skey'].unique())

{1600091,
 1600145,
 1600228,
 1600234,
 1600247,
 1600265,
 1600275,
 1600301,
 1600423,
 1600608,
 1600654,
 1600725,
 1600767,
 1600817,
 1600877,
 1601456,
 1605066,
 1605100,
 1605158,
 1605222,
 1605318,
 1605366,
 1605399,
 1688001,
 1688002,
 1688003,
 1688004,
 1688005,
 1688006,
 1688007,
 1688008,
 1688009,
 1688010,
 1688011,
 1688012,
 1688015,
 1688016,
 1688018,
 1688019,
 1688020,
 1688021,
 1688022,
 1688023,
 1688025,
 1688026,
 1688027,
 1688028,
 1688029,
 1688030,
 1688033,
 1688036,
 1688037,
 1688039,
 1688050,
 1688051,
 1688058,
 1688060,
 1688065,
 1688066,
 1688068,
 1688069,
 1688077,
 1688078,
 1688080,
 1688081,
 1688085,
 1688086,
 1688088,
 1688089,
 1688090,
 1688096,
 1688098,
 1688099,
 1688100,
 1688101,
 1688106,
 1688108,
 1688111,
 1688116,
 1688118,
 1688122,
 1688123,
 1688126,
 1688128,
 1688138,
 1688139,
 1688155,
 1688157,
 1688158,
 1688159,
 1688165,
 1688166,
 1688168,
 1688169,
 1688177,
 1688178,
 1688180,
 1688181,
 1688185,
 1688186,


In [45]:
try:
    test = w.wset("indexconstituent","date=%s; windcode=%s"%(d, i))
    assert(int(test.Data[0][0].strftime('%Y%m%d')) == d)
except:
    print('Wind data not ready')
    break

SyntaxError: 'break' outside loop (<ipython-input-45-0a345f4569ea>, line 9)

## 2. AMAC stocks

In [6]:
from WindPy import *
w.start()
il = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
il['StockID'] = il['StockID'].str[3:] + '.CSI'
il = il['StockID'].values
d = 20201229
test = w.wset("indexconstituent","date=%s; windcode=%s"%(d, il[0]))
assert(int(test.Data[0][0].strftime('%Y%m%d')) == d)

In [15]:
np.sum(w.wset("indexconstituent","date=%s; windcode=%s"%(d, 'H11043.CSI')).Data[3])

100.02000000000001

In [3]:
from WindPy import *
w.start()


import os
import glob
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',200)
pd.options.mode.chained_assignment = None
import numpy as np

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

il = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
il['StockID'] = il['StockID'].str[3:] + '.CSI'
il = il['StockID'].values

import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import TSLPy3

def updateAShare(date):
    TRDate = str(date)
    tsstr = """
           BegT:=%s;
           EndT:=%s;
           SetSysParam(pn_stock(),'SH000001');
           SetSysParam(PN_Cycle(),cy_day());
           dateArr:=MarketTradeDayQk(BegT,EndT);
           r:=array();
           for nI:=0 to length(dateArr)-1 do
           begin
             echo dateArr[nI];
             t:= getabkbydate('A股',dateArr[nI]);
             r:=r union2 t;
           end;
           r:= select [0] as 'StockID' from `r end;
           r := select * from r order by ['StockID'] end;
           return r;
            """%(TRDate + 'T', TRDate + 'T + 0.99')
    stockList = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,[],{})[1])
    stockList.columns = list(pd.Series(stockList.columns).str.decode('GBK'))
    stockList['StockID'] = stockList['StockID'].str.decode('GBK')
    stockList['skey'] = np.where(stockList['StockID'].str[:2] == 'SH', 1000000 + stockList['StockID'].str[2:].astype(int),
                                2000000 + stockList['StockID'].str[2:].astype(int))
    stockList['date'] = int(TRDate)
    return stockList

pd.set_option('max_columns', 200)
db = DB("192.168.10.178", database_name, user, password)
# dateLs = read_memb_daily(db, 'index_memb', index_id=[1000300], start_date=20200813, end_date=20201101)['date'].astype(str).\
# apply(lambda x: datetime.datetime.strptime(x,"%Y%m%d")).unique()
# dateLs1 = [datetime.datetime.fromtimestamp((i-np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')).weekday() for i in dateLs]
# dateLs = np.array(dateLs)[np.array(dateLs1) == 3]
# dateLs = [int(datetime.datetime.fromtimestamp((i-np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')).strftime('%Y%m%d')) for i in dateLs]
dateLs = [20201215]
for d in dateLs:
    dl = [d]
    total_stock = []
    for d in dl:
        data = updateAShare(d)
        total_stock += [data]
    total_stock = pd.concat(total_stock, sort=False)


    try:
        test = w.wset("indexconstituent","date=%s; windcode=%s"%(d, il[0]))
        assert(int(test.Data[0][0].strftime('%Y%m%d')) == d)

        data2 = []
        add = []
        save = []
        startTm = datetime.datetime.now()
        for d in dl:
            data1 = []
            for i in il:
                data = w.wset("indexconstituent","date=%s; windcode=%s"%(d, i))
                df = pd.DataFrame(data=np.array(data.Data).T, columns=data.Fields)
                df['index_id'] = 3000000 + int(i[1:6])
                try:
                    assert(abs(df['i_weight'].sum() - 100) < 0.1)
                except:
                    print('sum of weight far from 100')
                    print(df['i_weight'].sum())
                    print(df)
                    save += [df]
                df['skey'] = df['wind_code'].str[:-3].astype(int)
                df['skey'] = np.where(df['skey'] < 600000, df['skey'] + 2000000, df['skey'] + 1000000)
                df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
                corrections = {3011030: 'AMAC 农林',    
                       3011031: 'AMAC 采矿', 
                       3011041: 'AMAC 公用', 
                       3011042: 'AMAC 建筑', 
                       3011043: 'AMAC 交运',    
                       3011044: 'AMAC 信息', 
                       3011045: 'AMAC 批零', 
                       3011046: 'AMAC 金融', 
                       3011047: 'AMAC 地产',    
                       3011049: 'AMAC 文体', 
                       3011050: 'AMAC 综企', 
                       3030036: 'AMAC 餐饮',  
                       3030037: 'AMAC 商务',    
                       3030038: 'AMAC 科技', 
                       3030039: 'AMAC 公共', 
                       3030040: 'AMAC 社会',
                       3030041: 'AMAC 农副', 
                       3030042: 'AMAC 食品', 
                       3030043: 'AMAC 饮料', 
                       3030044: 'AMAC 纺织',    
                       3030045: 'AMAC 服装', 
                       3030046: 'AMAC 皮革', 
                       3030047: 'AMAC 木材', 
                       3030048: 'AMAC 家具',    
                       3030049: 'AMAC 造纸', 
                       3030050: 'AMAC 印刷', 
                       3030051: 'AMAC 文教',  
                       3030052: 'AMAC 石化',    
                       3030053: 'AMAC 化学', 
                       3030054: 'AMAC 医药', 
                       3030055: 'AMAC 化纤',  
                       3030056: 'AMAC 橡胶', 
                       3030057: 'AMAC 矿物', 
                       3030058: 'AMAC 钢铁', 
                       3030059: 'AMAC 有色',    
                       3030060: 'AMAC 金属', 
                       3030061: 'AMAC 通用', 
                       3030062: 'AMAC 专用', 
                       3030063: 'AMAC 汽车',    
                       3030064: 'AMAC 运输', 
                       3030065: 'AMAC 电气', 
                       3030066: 'AMAC 电子',  
                       3030067: 'AMAC 仪表'}
                df['index_name'] = df['index_id']
                df.index_name = df.index_name.map(corrections)
                data1 += [df]
            data1 = pd.concat(data1).reset_index(drop=True)
            stock_list = list(set(total_stock[total_stock['date'] == d]['skey'].unique()) - set(data1['skey'].unique()))
            stock_list =[str(i - 1000000).rjust(6, "0") + '.SH' if i < 2000000 else str(i - 2000000).rjust(6, "0") + '.SZ' for i in stock_list]
            dd = str(d)[:4] + '-' + str(d)[4:6] + '-' + str(d)[6:]
            add1 = pd.DataFrame(columns=['date', 'stock_list'])
            add1['stock_list'] = stock_list
            add1['date'] = dd
            add += [add1]
            data1 = data1.rename(columns={'i_weight': 'weight'})
            data1 = data1[['date', 'skey', 'index_id', 'index_name', 'weight']]
            data2 += [data1]
        data2 = pd.concat(data2).reset_index(drop=True)
        data2 = data2.drop_duplicates(keep='first')
        add = pd.concat(add).reset_index(drop=True)
        print('get index composition weight')
        print(datetime.datetime.now() - startTm)

        data3 = []
        startTm = datetime.datetime.now()
        stock_list = add['stock_list'].unique()    
        for s in stock_list:
            start_date = add[add['stock_list'] == s]['date'].min()
            end_date = add[add['stock_list'] == s]['date'].max()
            add_data = w.wsd(s, "industry_CSRCcode12", start_date, end_date, "industryType=3;PriceAdj=F")
            if add_data.ErrorCode != 0:
                continue
            nd = pd.DataFrame(data=np.array(add_data.Data).T, columns=['Ind'])
            nd1 = pd.DataFrame(data=np.array(add_data.Times).T, columns=['DateTime'])
            nd = pd.concat([nd1, nd], axis=1)
            nd = nd[~nd['Ind'].isnull()]
            if nd.empty:
                continue
            else:
                nd['index_id'] = nd['Ind'].str[1:].astype(int)
                nd['date'] = nd['DateTime'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)
                if s[-2:] == 'SZ':
                    nd['skey'] = int(s[:-3]) + 2000000
                else:
                    nd['skey'] = int(s[:-3]) + 1000000
                nd['weight'] = 0
                data3 += [nd]
        data3 = pd.concat(data3).reset_index(drop=True)
        print(datetime.datetime.now() - startTm)
        print('get extra data')

        data2 = pd.concat([data2, data3])
        data2 = data2.sort_values(by=['date', 'skey', 'weight']).reset_index(drop=True)
        if data2[data2.duplicated(['date', 'skey'], keep=False)].shape[0] != 0:
            display(data2[data2.duplicated(['date', 'skey'], keep=False)])
            data2 = data2.drop_duplicates(['date', 'skey'], keep='last').reset_index(drop=True)
        assert(data2[data2['index_id'] < 100]['weight'].unique() == [0])

        data2['index_id'] = np.where(data2.index_id <= 5, 3011030,np.where(data2.index_id <= 12, 3011031,                    np.where(data2.index_id == 13, 3030041,np.where(data2.index_id == 14, 3030042,                    np.where(data2.index_id == 15, 3030043,np.where(data2.index_id == 17, 3030044,                   np.where(data2.index_id == 18, 3030045,np.where(data2.index_id == 19, 3030046,                   np.where(data2.index_id == 20, 3030047,np.where(data2.index_id == 21, 3030048,                   np.where(data2.index_id == 22, 3030049,np.where(data2.index_id == 23, 3030050,                   np.where(data2.index_id == 24, 3030051,np.where(data2.index_id == 25, 3030052,                   np.where(data2.index_id == 26, 3030053,np.where(data2.index_id == 27, 3030054,                   np.where(data2.index_id == 28, 3030055,np.where(data2.index_id == 29, 3030056,                   np.where(data2.index_id == 30, 3030057,np.where(data2.index_id == 31, 3030058,                   np.where(data2.index_id == 32, 3030059,np.where(data2.index_id == 33, 3030060,                   np.where(data2.index_id == 34, 3030061,np.where(data2.index_id == 35, 3030062,                   np.where(data2.index_id == 36, 3030063,np.where(data2.index_id == 37, 3030064,                   np.where(data2.index_id == 38, 3030065,np.where(data2.index_id == 39, 3030066,                   np.where(data2.index_id == 40, 3030067,np.where(data2.index_id <= 43, 3011050,                   np.where(data2.index_id <= 46, 3011041,np.where(data2.index_id <= 50, 3011042,                   np.where(data2.index_id <= 52, 3011045,np.where(data2.index_id <= 60, 3011043,                   np.where(data2.index_id <= 62, 3030036,np.where(data2.index_id <= 65, 3011044,                   np.where(data2.index_id <= 69, 3011046,np.where(data2.index_id == 70, 3011047,                   np.where(data2.index_id <= 72, 3030037,np.where(data2.index_id <= 75, 3030038,                   np.where(data2.index_id <= 78, 3030039,np.where(data2.index_id <= 81, 3030040,                   np.where(data2.index_id == 82, 3011049,np.where(data2.index_id <= 84, 3030040,                   np.where(data2.index_id <= 89, 3011049,np.where(data2.index_id == 90, 3011050, data2['index_id']))))))))))))))))))))))))))))))))))))))))))))))
        assert(data2['index_id'].min() > 100)
        corrections = {3011030: 'AMAC 农林',    
                       3011031: 'AMAC 采矿', 
                       3011041: 'AMAC 公用', 
                       3011042: 'AMAC 建筑', 
                       3011043: 'AMAC 交运',    
                       3011044: 'AMAC 信息', 
                       3011045: 'AMAC 批零', 
                       3011046: 'AMAC 金融', 
                       3011047: 'AMAC 地产',    
                       3011049: 'AMAC 文体', 
                       3011050: 'AMAC 综企', 
                       3030036: 'AMAC 餐饮',  
                       3030037: 'AMAC 商务',    
                       3030038: 'AMAC 科技', 
                       3030039: 'AMAC 公共', 
                       3030040: 'AMAC 社会',
                       3030041: 'AMAC 农副', 
                       3030042: 'AMAC 食品', 
                       3030043: 'AMAC 饮料', 
                       3030044: 'AMAC 纺织',    
                       3030045: 'AMAC 服装', 
                       3030046: 'AMAC 皮革', 
                       3030047: 'AMAC 木材', 
                       3030048: 'AMAC 家具',    
                       3030049: 'AMAC 造纸', 
                       3030050: 'AMAC 印刷', 
                       3030051: 'AMAC 文教',  
                       3030052: 'AMAC 石化',    
                       3030053: 'AMAC 化学', 
                       3030054: 'AMAC 医药', 
                       3030055: 'AMAC 化纤',  
                       3030056: 'AMAC 橡胶', 
                       3030057: 'AMAC 矿物', 
                       3030058: 'AMAC 钢铁', 
                       3030059: 'AMAC 有色',    
                       3030060: 'AMAC 金属', 
                       3030061: 'AMAC 通用', 
                       3030062: 'AMAC 专用', 
                       3030063: 'AMAC 汽车',    
                       3030064: 'AMAC 运输', 
                       3030065: 'AMAC 电气', 
                       3030066: 'AMAC 电子',  
                       3030067: 'AMAC 仪表'}
        data2['index_name'] = data2['index_id']
        data2.index_name = data2.index_name.map(corrections)
        data2 = data2[['date', 'skey', 'index_id', 'index_name', 'weight']]
        try:
            assert(abs(data2.groupby(['date', 'index_id'])['weight'].sum() - 100).max() < 0.1)
        except:
            print(data2.groupby(['date', 'index_id'])['weight'].sum())
        write_memb_data(db1, 'index_memb', data2)
        data2.to_csv('E:\\AMAC_weight\\' + str(dl[0]) + '_check.csv')

    except:
        print('Wind data not ready')

sum of weight far from 100
99.77999999999994
          date  wind_code sec_name i_weight industry  index_id
0   2020-12-15  000002.SZ      万科A    20.19      房地产   3011047
1   2020-12-15  000006.SZ     深振业A     0.54      房地产   3011047
2   2020-12-15  000007.SZ      全新好     0.21      房地产   3011047
3   2020-12-15  000011.SZ     深物业A     0.28      房地产   3011047
4   2020-12-15  000014.SZ     沙河股份     0.12      房地产   3011047
..         ...        ...      ...      ...      ...       ...
97  2020-12-15  600895.SH     张江高科     1.45      房地产   3011047
98  2020-12-15  601155.SH     新城控股     3.31      房地产   3011047
99  2020-12-15  601512.SH     中新集团     0.19       工业   3011047
100 2020-12-15  601588.SH     北辰实业     0.38      房地产   3011047
101 2020-12-15  603506.SH     南都物业     0.09      房地产   3011047

[102 rows x 6 columns]
get index composition weight
0:00:18.559784
Wind data not ready


In [4]:
il

array(['H11030.CSI', 'H11031.CSI', 'H11041.CSI', 'H11042.CSI',
       'H11043.CSI', 'H11044.CSI', 'H11045.CSI', 'H11046.CSI',
       'H11047.CSI', 'H11049.CSI', 'H11050.CSI', 'H30036.CSI',
       'H30037.CSI', 'H30038.CSI', 'H30039.CSI', 'H30040.CSI',
       'H30041.CSI', 'H30042.CSI', 'H30043.CSI', 'H30044.CSI',
       'H30045.CSI', 'H30046.CSI', 'H30047.CSI', 'H30048.CSI',
       'H30049.CSI', 'H30050.CSI', 'H30051.CSI', 'H30052.CSI',
       'H30053.CSI', 'H30054.CSI', 'H30055.CSI', 'H30056.CSI',
       'H30057.CSI', 'H30058.CSI', 'H30059.CSI', 'H30060.CSI',
       'H30061.CSI', 'H30062.CSI', 'H30063.CSI', 'H30064.CSI',
       'H30065.CSI', 'H30066.CSI', 'H30067.CSI'], dtype=object)

In [11]:
test = w.wset("indexconstituent","date=%s; windcode=%s"%(d, 'H11047.CSI'))
np.sort(test.Data[1])

array(['000002.SZ', '000006.SZ', '000007.SZ', '000011.SZ', '000014.SZ',
       '000031.SZ', '000036.SZ', '000069.SZ', '000402.SZ', '000514.SZ',
       '000517.SZ', '000537.SZ', '000540.SZ', '000560.SZ', '000573.SZ',
       '000615.SZ', '000616.SZ', '000620.SZ', '000656.SZ', '000667.SZ',
       '000668.SZ', '000671.SZ', '000691.SZ', '000718.SZ', '000720.SZ',
       '000732.SZ', '000736.SZ', '000797.SZ', '000838.SZ', '000863.SZ',
       '000886.SZ', '000897.SZ', '000909.SZ', '000918.SZ', '000926.SZ',
       '000961.SZ', '000965.SZ', '001914.SZ', '001979.SZ', '002016.SZ',
       '002146.SZ', '002208.SZ', '002244.SZ', '002285.SZ', '002305.SZ',
       '002314.SZ', '002377.SZ', '002968.SZ', '600007.SH', '600048.SH',
       '600064.SH', '600067.SH', '600082.SH', '600094.SH', '600158.SH',
       '600162.SH', '600173.SH', '600185.SH', '600208.SH', '600223.SH',
       '600239.SH', '600246.SH', '600266.SH', '600322.SH', '600325.SH',
       '600340.SH', '600376.SH', '600383.SH', '600393.SH', '6004

In [None]:
from urllib.request import urlretrieve
url = 'http://www.csindex.com.cn/uploads/indices/amac/files/csrccwf.zip'
fileName = savePath + '\\AMAC_' + updateDate + '.zip'
urlretrieve(url, fileName)

In [17]:
data3 = []
startTm = datetime.datetime.now()
stock_list = add['stock_list'].unique()    
for s in stock_list:
    start_date = add[add['stock_list'] == s]['date'].min()
    end_date = add[add['stock_list'] == s]['date'].max()
    add_data = w.wsd(s, "industry_CSRCcode12", start_date, end_date, "industryType=3;PriceAdj=F")
    if add_data.ErrorCode != 0:
        continue
    nd = pd.DataFrame(data=np.array(add_data.Data).T, columns=['Ind'])
    nd1 = pd.DataFrame(data=np.array(add_data.Times).T, columns=['DateTime'])
    nd = pd.concat([nd1, nd], axis=1)
    nd = nd[~nd['Ind'].isnull()]
    if nd.empty:
        continue
    else:
        nd['index_id'] = nd['Ind'].str[1:].astype(int)
        nd['date'] = nd['DateTime'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)
        if s[-2:] == 'SZ':
            nd['skey'] = int(s[:-3]) + 2000000
        else:
            nd['skey'] = int(s[:-3]) + 1000000
        nd['weight'] = 0
        data3 += [nd]
data3 = pd.concat(data3).reset_index(drop=True)
print(datetime.datetime.now() - startTm)
print('get extra data')

KeyboardInterrupt: 

In [13]:
data2 = pd.concat([data2, data3])
data2 = data2.sort_values(by=['date', 'skey', 'weight']).reset_index(drop=True)
if data2[data2.duplicated(['date', 'skey'], keep=False)].shape[0] != 0:
    display(data2[data2.duplicated(['date', 'skey'], keep=False)])
    data2 = data2.drop_duplicates(['date', 'skey'], keep='last').reset_index(drop=True)
assert(data2[data2['index_id'] < 100]['weight'].unique() == [0])

data2['index_id'] = np.where(data2.index_id <= 5, 3011030,np.where(data2.index_id <= 12, 3011031, \
                   np.where(data2.index_id == 13, 3030041,np.where(data2.index_id == 14, 3030042, \
                   np.where(data2.index_id == 15, 3030043,np.where(data2.index_id == 17, 3030044,\
                   np.where(data2.index_id == 18, 3030045,np.where(data2.index_id == 19, 3030046,\
                   np.where(data2.index_id == 20, 3030047,np.where(data2.index_id == 21, 3030048,\
                   np.where(data2.index_id == 22, 3030049,np.where(data2.index_id == 23, 3030050,\
                   np.where(data2.index_id == 24, 3030051,np.where(data2.index_id == 25, 3030052,\
                   np.where(data2.index_id == 26, 3030053,np.where(data2.index_id == 27, 3030054,\
                   np.where(data2.index_id == 28, 3030055,np.where(data2.index_id == 29, 3030056,\
                   np.where(data2.index_id == 30, 3030057,np.where(data2.index_id == 31, 3030058,\
                   np.where(data2.index_id == 32, 3030059,np.where(data2.index_id == 33, 3030060,\
                   np.where(data2.index_id == 34, 3030061,np.where(data2.index_id == 35, 3030062,\
                   np.where(data2.index_id == 36, 3030063,np.where(data2.index_id == 37, 3030064,\
                   np.where(data2.index_id == 38, 3030065,np.where(data2.index_id == 39, 3030066,\
                   np.where(data2.index_id == 40, 3030067,np.where(data2.index_id <= 43, 3011050,\
                   np.where(data2.index_id <= 46, 3011041,np.where(data2.index_id <= 50, 3011042,\
                   np.where(data2.index_id <= 52, 3011045,np.where(data2.index_id <= 60, 3011043,\
                   np.where(data2.index_id <= 62, 3030036,np.where(data2.index_id <= 65, 3011044,\
                   np.where(data2.index_id <= 69, 3011046,np.where(data2.index_id == 70, 3011047,\
                   np.where(data2.index_id <= 72, 3030037,np.where(data2.index_id <= 75, 3030038,\
                   np.where(data2.index_id <= 78, 3030039,np.where(data2.index_id <= 81, 3030040,\
                   np.where(data2.index_id == 82, 3011049,np.where(data2.index_id <= 84, 3030040,\
                   np.where(data2.index_id <= 89, 3011049,np.where(data2.index_id == 90, 3011050, data2['index_id']))))))))))))))))))))))))))))))))))))))))))))))
assert(data2['index_id'].min() > 100)
corrections = {3011030: 'AMAC 农林',    
               3011031: 'AMAC 采矿', 
               3011041: 'AMAC 公用', 
               3011042: 'AMAC 建筑', 
               3011043: 'AMAC 交运',    
               3011044: 'AMAC 信息', 
               3011045: 'AMAC 批零', 
               3011046: 'AMAC 金融', 
               3011047: 'AMAC 地产',    
               3011049: 'AMAC 文体', 
               3011050: 'AMAC 综企', 
               3030036: 'AMAC 餐饮',  
               3030037: 'AMAC 商务',    
               3030038: 'AMAC 科技', 
               3030039: 'AMAC 公共', 
               3030040: 'AMAC 社会',
               3030041: 'AMAC 农副', 
               3030042: 'AMAC 食品', 
               3030043: 'AMAC 饮料', 
               3030044: 'AMAC 纺织',    
               3030045: 'AMAC 服装', 
               3030046: 'AMAC 皮革', 
               3030047: 'AMAC 木材', 
               3030048: 'AMAC 家具',    
               3030049: 'AMAC 造纸', 
               3030050: 'AMAC 印刷', 
               3030051: 'AMAC 文教',  
               3030052: 'AMAC 石化',    
               3030053: 'AMAC 化学', 
               3030054: 'AMAC 医药', 
               3030055: 'AMAC 化纤',  
               3030056: 'AMAC 橡胶', 
               3030057: 'AMAC 矿物', 
               3030058: 'AMAC 钢铁', 
               3030059: 'AMAC 有色',    
               3030060: 'AMAC 金属', 
               3030061: 'AMAC 通用', 
               3030062: 'AMAC 专用', 
               3030063: 'AMAC 汽车',    
               3030064: 'AMAC 运输', 
               3030065: 'AMAC 电气', 
               3030066: 'AMAC 电子',  
               3030067: 'AMAC 仪表'}
data2['index_name'] = data2['index_id']
data2.index_name = data2.index_name.map(corrections)
data2 = data2[['date', 'skey', 'index_id', 'index_name', 'weight']]
try:
    assert(abs(data2.groupby(['date', 'index_id'])['weight'].sum() - 100).max() < 0.1)
except:
    print(data2.groupby(['date', 'index_id'])['weight'].sum())

Unnamed: 0,date,skey,index_id,index_name,weight
0,20201202,2000592,3011030,AMAC 农林,1.3
1,20201202,2000713,3011030,AMAC 农林,0.77
2,20201202,2000735,3011030,AMAC 农林,2.61
3,20201202,2000998,3011030,AMAC 农林,4.49
4,20201202,2002041,3011030,AMAC 农林,1.75
...,...,...,...,...,...
3463,20201202,1603297,3030067,AMAC 仪表,0.95
3464,20201202,1603416,3030067,AMAC 仪表,3.41
3465,20201202,1603556,3030067,AMAC 仪表,2.26
3466,20201202,1603662,3030067,AMAC 仪表,1.09


In [1]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def dailyDB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def read_filter_daily(db, name, start_date=None, end_date=None, skey=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date','skey'])
    return df  

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'index_id'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df    


def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'skey'])
    return df   

In [10]:
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

pd.set_option('max_columns', 200)
db1 = dailyDB("192.168.10.178", database_name, user, password)
kk = read_stock_daily(db1, 'mktbeta', 20201102, 20201102)

In [11]:
kk.dtypes

skey                 float64
date                 float64
time                 float64
beta_10d_IF          float64
beta_60d_IF          float64
beta_10d_IC          float64
beta_60d_IC          float64
beta_10d_CSI1000     float64
beta_60d_CSI1000     float64
alpha_10d_IF         float64
alpha_60d_IF         float64
alpha_10d_IC         float64
alpha_60d_IC         float64
alpha_10d_CSI1000    float64
alpha_60d_CSI1000    float64
dtype: object