In [1]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def build_query(start_date=None, end_date=None, index_id=None):
    query = {}

    def parse_date(x):
        if type(x) == int:
            return x
        elif type(x) == str:
            if len(x) != 8:
                raise Exception("`date` must be YYYYMMDD format")
            return int(x)
        elif type(x) == datetime.datetime or type(x) == datetime.date:
            return x.strftime("%Y%m%d").astype(int)
        else:
            raise Exception("invalid `date` type: " + str(type(x)))

    if start_date is not None or end_date is not None:
        query['date'] = {}
        if start_date is not None:
            query['date']['$gte'] = parse_date(start_date)
        if end_date is not None:
            query['date']['$lte'] = parse_date(end_date)

    def parse_symbol(x):
        if type(x) == int:
            return x
        else:
            return int(x)

    if index_id:
        if type(index_id) == list or type(index_id) == tuple:
            query['index_id'] = {'$in': [parse_symbol(x) for x in index_id]}
        else:
            query['index_id'] = parse_symbol(index_id)
    
    return query

def write_data(db, name, df):
    collection = db[name]
    df1 = []
    for symbol in df['index_id'].unique():
        if symbol in collection.distinct('index_id'):
            symbol = int(symbol)
            print(symbol)
            m_ax = pd.DataFrame.from_records(collection.find({'index_id':{'$in':[symbol]}}).sort([('date',-1)]).skip(0).limit(1))['date'].values[0]
            df2 = df[(df['index_id'] == symbol) & (df['date'] > m_ax)]
            print(df2)
            df1 += [df2]
    df1 = pd.concat(df1).reset_index(drop=True)
    df1 = df1.to_dict('records')
    collection.insert_many(df1) 

def write_data1(db, name, df):
    collection = db[name]
    assert(len(set(df['skey'].unique()) & set(pd.DataFrame.from_records(collection.find({'date' : {'$in': [int(df['date'].iloc[0])]}}))['skey'].unique())) == 0)
    df = df.to_dict('records')
    collection.insert_many(df) 
    
def write_weight_data(db, name, df, index_id):
    collection = db[name]
    df = df[df['index_id'] == index_id]
    for (date, skey), sub_df in df.groupby(['date', 'skey']):
        date = int(date)
        skey = int(skey)
        weight = sub_df['weight'].values[0]
        print(weight)
        collection.update({'skey': skey, 'date':date, 'index_id':index_id}, {'$set':{'weight':float(weight)}})

def delete_data(db, name, start_date=None, end_date=None, index_id=None):
    collection = db[name]
    query = build_query(start_date, end_date, index_id)
    if not query:
        print('cannot delete the whole table')
        return None
    collection.delete_many(query)    
 
def read_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'interval'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df    




database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

In [None]:
from WindPy import *
w.start()


import os
import glob
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',200)
pd.options.mode.chained_assignment = None
import numpy as np

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

start_date = '2017-02-10'
end_date = '2017-02-18'

def get_date_list(begin_date,end_date):
    date_list = [int(x.strftime('%Y%m%d')) for x in list(pd.date_range(start=begin_date, end=end_date))]
    return date_list
dl = get_date_list(start_date,end_date)
il = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
il['StockID'] = il['StockID'].str[3:] + '.CSI'
il = il['StockID'].values

data2 = []
add = []
for d in dl:
    data1 = []
    for i in il:
        data = w.wset("indexconstituent","date=%s; windcode=%s"%(d, i))
        df = pd.DataFrame(data=np.array(data.Data).T, columns=data.Fields)
        df['index_id'] = 3000000 + int(i[1:6])
        assert(abs(df['i_weight'].sum() - 100) < 0.2)
        df['skey'] = df['wind_code'].str[:-3].astype(int)
        df['skey'] = np.where(df['skey'] < 600000, df['skey'] + 2000000, df['skey'] + 1000000)
        df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
        corrections = {3011030: 'AMAC 农林',    
               3011031: 'AMAC 采矿', 
               3011041: 'AMAC 公用', 
               3011042: 'AMAC 建筑', 
               3011043: 'AMAC 交运',    
               3011044: 'AMAC 信息', 
               3011045: 'AMAC 批零', 
               3011046: 'AMAC 金融', 
               3011047: 'AMAC 地产',    
               3011049: 'AMAC 文体', 
               3011050: 'AMAC 综企', 
               3030036: 'AMAC 餐饮',  
               3030037: 'AMAC 商务',    
               3030038: 'AMAC 科技', 
               3030039: 'AMAC 公共', 
               3030040: 'AMAC 社会',
               3030041: 'AMAC 农副', 
               3030042: 'AMAC 食品', 
               3030043: 'AMAC 饮料', 
               3030044: 'AMAC 纺织',    
               3030045: 'AMAC 服装', 
               3030046: 'AMAC 皮革', 
               3030047: 'AMAC 木材', 
               3030048: 'AMAC 家具',    
               3030049: 'AMAC 造纸', 
               3030050: 'AMAC 印刷', 
               3030051: 'AMAC 文教',  
               3030052: 'AMAC 石化',    
               3030053: 'AMAC 化学', 
               3030054: 'AMAC 医药', 
               3030055: 'AMAC 化纤',  
               3030056: 'AMAC 橡胶', 
               3030057: 'AMAC 矿物', 
               3030058: 'AMAC 钢铁', 
               3030059: 'AMAC 有色',    
               3030060: 'AMAC 金属', 
               3030061: 'AMAC 通用', 
               3030062: 'AMAC 专用', 
               3030063: 'AMAC 汽车',    
               3030064: 'AMAC 运输', 
               3030065: 'AMAC 电气', 
               3030066: 'AMAC 电子',  
               3030067: 'AMAC 仪表'}
        df['index_name'] = df['index_id']
        df.index_name = df.index_name.map(corrections)
        data1 += [df]
    data1 = pd.concat(data1).reset_index(drop=True)
    sl = pd.read_csv(r'D:\work\project 17 AMAC\AShare20160101_20200903.csv')
    sl['skey'] = np.where(sl['StockID'].str[:2] == 'SH', sl['StockID'].str[2:].astype(int) + 1000000, 
                        sl['StockID'].str[2:].astype(int) + 2000000)
    stock_list = list(set(sl['skey'].unique()) - set(data1['skey'].unique()))
    stock_list =[str(i - 1000000).rjust(6, "0") + '.SH' if i < 2000000 else str(i - 2000000).rjust(6, "0") + '.SZ' for i in stock_list]
    dd = str(d)[:4] + '-' + str(d)[4:6] + '-' + str(d)[6:]
    add1 = pd.DataFrame(columns=['date', 'stock_list'])
    add1['stock_list'] = stock_list
    add1['date'] = dd
    add += [add1]
    data1 = data1.rename(columns={'i_weight': 'weight'})
    data1 = data1[['date', 'skey', 'index_id', 'index_name', 'weight']]
    data2 += [data1]
data2 = pd.concat(data2).reset_index(drop=True)
add = pd.concat(add).reset_index(drop=True)
    
stock_list = add['stock_list'].unique() 
data3 = []
startTm = datetime.datetime.now()
for s in stock_list:
    for da_te in add[add['stock_list'] == s]['date'].unique():
        add_data = w.wsd(s, "industry_CSRCcode12", da_te, da_te, "industryType=3;PriceAdj=F")
        if add_data.ErrorCode != 0:
            continue
        nd = pd.DataFrame(data=np.array(add_data.Data).T, columns=['Ind'])
        nd1 = pd.DataFrame(data=np.array(add_data.Times).T, columns=['DateTime'])
        nd = pd.concat([nd1, nd], axis=1)
        nd = nd[~nd['Ind'].isnull()]
        if nd.empty:
            continue
        else:
            nd['index_id'] = nd['Ind'].str[1:].astype(int)
            nd['date'] = nd['DateTime'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)
            if s[-2:] == 'SZ':
                nd['skey'] = int(s[:-3]) + 2000000
            else:
                nd['skey'] = int(s[:-3]) + 1000000
            nd['weight'] = 0
            data3 += [nd]
data3 = pd.concat(data3).reset_index(drop=True)
print(datetime.datetime.now() - startTm)
data2 = pd.concat([data2, data3])
data3

In [1]:
from WindPy import *
w.start()


import os
import glob
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',200)
pd.options.mode.chained_assignment = None
import numpy as np

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

il = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
il['StockID'] = il['StockID'].str[3:] + '.CSI'
il = il['StockID'].values

startDate = '20200814'
endDate = '20200912'

readPath = r'F:\Download\StockFactors'
dataPathLs = np.array(glob.glob(os.path.join(readPath, 'FactorData***.csv')))
dateLs = np.array([os.path.basename(i).split('_')[1] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
total_stock = []
for thisDate, thisPath in zip(dateLs, dataPathLs):
    data = pd.read_csv(thisPath, encoding='GBK')
    total_stock += [data]
total_stock = pd.concat(total_stock, sort=False)
total_stock['skey'] = np.where(total_stock['Symbol'].str[:2] == 'SH', total_stock['Symbol'].str[2:].astype(int) + 1000000, 
                              total_stock['Symbol'].str[2:].astype(int) + 2000000)
dl = total_stock['Date'].unique()

data2 = []
add = []
startTm = datetime.datetime.now()
for d in dl:
    data1 = []
    for i in il:
        data = w.wset("indexconstituent","date=%s; windcode=%s"%(d, i))
        df = pd.DataFrame(data=np.array(data.Data).T, columns=data.Fields)
        df['index_id'] = 3000000 + int(i[1:6])
        assert(abs(df['i_weight'].sum() - 100) < 0.2)
        df['skey'] = df['wind_code'].str[:-3].astype(int)
        df['skey'] = np.where(df['skey'] < 600000, df['skey'] + 2000000, df['skey'] + 1000000)
        df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
        corrections = {3011030: 'AMAC 农林',    
               3011031: 'AMAC 采矿', 
               3011041: 'AMAC 公用', 
               3011042: 'AMAC 建筑', 
               3011043: 'AMAC 交运',    
               3011044: 'AMAC 信息', 
               3011045: 'AMAC 批零', 
               3011046: 'AMAC 金融', 
               3011047: 'AMAC 地产',    
               3011049: 'AMAC 文体', 
               3011050: 'AMAC 综企', 
               3030036: 'AMAC 餐饮',  
               3030037: 'AMAC 商务',    
               3030038: 'AMAC 科技', 
               3030039: 'AMAC 公共', 
               3030040: 'AMAC 社会',
               3030041: 'AMAC 农副', 
               3030042: 'AMAC 食品', 
               3030043: 'AMAC 饮料', 
               3030044: 'AMAC 纺织',    
               3030045: 'AMAC 服装', 
               3030046: 'AMAC 皮革', 
               3030047: 'AMAC 木材', 
               3030048: 'AMAC 家具',    
               3030049: 'AMAC 造纸', 
               3030050: 'AMAC 印刷', 
               3030051: 'AMAC 文教',  
               3030052: 'AMAC 石化',    
               3030053: 'AMAC 化学', 
               3030054: 'AMAC 医药', 
               3030055: 'AMAC 化纤',  
               3030056: 'AMAC 橡胶', 
               3030057: 'AMAC 矿物', 
               3030058: 'AMAC 钢铁', 
               3030059: 'AMAC 有色',    
               3030060: 'AMAC 金属', 
               3030061: 'AMAC 通用', 
               3030062: 'AMAC 专用', 
               3030063: 'AMAC 汽车',    
               3030064: 'AMAC 运输', 
               3030065: 'AMAC 电气', 
               3030066: 'AMAC 电子',  
               3030067: 'AMAC 仪表'}
        df['index_name'] = df['index_id']
        df.index_name = df.index_name.map(corrections)
        data1 += [df]
    data1 = pd.concat(data1).reset_index(drop=True)
    stock_list = list(set(total_stock[total_stock['Date'] == d]['skey'].unique()) - set(data1['skey'].unique()))
    stock_list =[str(i - 1000000).rjust(6, "0") + '.SH' if i < 2000000 else str(i - 2000000).rjust(6, "0") + '.SZ' for i in stock_list]
    dd = str(d)[:4] + '-' + str(d)[4:6] + '-' + str(d)[6:]
    add1 = pd.DataFrame(columns=['date', 'stock_list'])
    add1['stock_list'] = stock_list
    add1['date'] = dd
    add += [add1]
    data1 = data1.rename(columns={'i_weight': 'weight'})
    data1 = data1[['date', 'skey', 'index_id', 'index_name', 'weight']]
    data2 += [data1]
data2 = pd.concat(data2).reset_index(drop=True)
data2 = data2.drop_duplicates(keep='first')
add = pd.concat(add).reset_index(drop=True)
print('get index composition weight')
print(datetime.datetime.now() - startTm)

data3 = []
startTm = datetime.datetime.now()
stock_list = add['stock_list'].unique()    
for s in stock_list:
    start_date = add[add['stock_list'] == s]['date'].min()
    end_date = add[add['stock_list'] == s]['date'].max()
    add_data = w.wsd(s, "industry_CSRCcode12", start_date, end_date, "industryType=3;PriceAdj=F")
    if add_data.ErrorCode != 0:
        continue
    nd = pd.DataFrame(data=np.array(add_data.Data).T, columns=['Ind'])
    nd1 = pd.DataFrame(data=np.array(add_data.Times).T, columns=['DateTime'])
    nd = pd.concat([nd1, nd], axis=1)
    nd = nd[~nd['Ind'].isnull()]
    if nd.empty:
        continue
    else:
        nd['index_id'] = nd['Ind'].str[1:].astype(int)
        nd['date'] = nd['DateTime'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)
        if s[-2:] == 'SZ':
            nd['skey'] = int(s[:-3]) + 2000000
        else:
            nd['skey'] = int(s[:-3]) + 1000000
        nd['weight'] = 0
        data3 += [nd]
data3 = pd.concat(data3).reset_index(drop=True)
print(datetime.datetime.now() - startTm)
print('get extra data')

data2 = pd.concat([data2, data3])
data2 = data2.sort_values(by=['date', 'skey', 'weight']).reset_index(drop=True)
if data2[data2.duplicated(['date', 'skey'], keep=False)].shape[0] != 0:
    display(data2[data2.duplicated(['date', 'skey'], keep=False)])
    data2 = data2.drop_duplicates(['date', 'skey'], keep='last').reset_index(drop=True)
assert(data2[data2['index_id'] < 100]['weight'].unique() == [0])

data2['index_id'] = np.where(data2.index_id <= 5, 3011030,np.where(data2.index_id <= 12, 3011031, \
                   np.where(data2.index_id == 13, 3030041,np.where(data2.index_id == 14, 3030042, \
                   np.where(data2.index_id == 15, 3030043,np.where(data2.index_id == 17, 3030044,\
                   np.where(data2.index_id == 18, 3030045,np.where(data2.index_id == 19, 3030046,\
                   np.where(data2.index_id == 20, 3030047,np.where(data2.index_id == 21, 3030048,\
                   np.where(data2.index_id == 22, 3030049,np.where(data2.index_id == 23, 3030050,\
                   np.where(data2.index_id == 24, 3030051,np.where(data2.index_id == 25, 3030052,\
                   np.where(data2.index_id == 26, 3030053,np.where(data2.index_id == 27, 3030054,\
                   np.where(data2.index_id == 28, 3030055,np.where(data2.index_id == 29, 3030056,\
                   np.where(data2.index_id == 30, 3030057,np.where(data2.index_id == 31, 3030058,\
                   np.where(data2.index_id == 32, 3030059,np.where(data2.index_id == 33, 3030060,\
                   np.where(data2.index_id == 34, 3030061,np.where(data2.index_id == 35, 3030062,\
                   np.where(data2.index_id == 36, 3030063,np.where(data2.index_id == 37, 3030064,\
                   np.where(data2.index_id == 38, 3030065,np.where(data2.index_id == 39, 3030066,\
                   np.where(data2.index_id == 40, 3030067,np.where(data2.index_id <= 43, 3011050,\
                   np.where(data2.index_id <= 46, 3011041,np.where(data2.index_id <= 50, 3011042,\
                   np.where(data2.index_id <= 52, 3011045,np.where(data2.index_id <= 60, 3011043,\
                   np.where(data2.index_id <= 62, 3030036,np.where(data2.index_id <= 65, 3011044,\
                   np.where(data2.index_id <= 69, 3011046,np.where(data2.index_id == 70, 3011047,\
                   np.where(data2.index_id <= 72, 3030037,np.where(data2.index_id <= 75, 3030038,\
                   np.where(data2.index_id <= 78, 3030039,np.where(data2.index_id <= 81, 3030040,\
                   np.where(data2.index_id == 82, 3011049,np.where(data2.index_id <= 84, 3030040,\
                   np.where(data2.index_id <= 89, 3011049,np.where(data2.index_id == 90, 3011050, data2['index_id']))))))))))))))))))))))))))))))))))))))))))))))
assert(data2['index_id'].min() > 100)
corrections = {3011030: 'AMAC 农林',    
               3011031: 'AMAC 采矿', 
               3011041: 'AMAC 公用', 
               3011042: 'AMAC 建筑', 
               3011043: 'AMAC 交运',    
               3011044: 'AMAC 信息', 
               3011045: 'AMAC 批零', 
               3011046: 'AMAC 金融', 
               3011047: 'AMAC 地产',    
               3011049: 'AMAC 文体', 
               3011050: 'AMAC 综企', 
               3030036: 'AMAC 餐饮',  
               3030037: 'AMAC 商务',    
               3030038: 'AMAC 科技', 
               3030039: 'AMAC 公共', 
               3030040: 'AMAC 社会',
               3030041: 'AMAC 农副', 
               3030042: 'AMAC 食品', 
               3030043: 'AMAC 饮料', 
               3030044: 'AMAC 纺织',    
               3030045: 'AMAC 服装', 
               3030046: 'AMAC 皮革', 
               3030047: 'AMAC 木材', 
               3030048: 'AMAC 家具',    
               3030049: 'AMAC 造纸', 
               3030050: 'AMAC 印刷', 
               3030051: 'AMAC 文教',  
               3030052: 'AMAC 石化',    
               3030053: 'AMAC 化学', 
               3030054: 'AMAC 医药', 
               3030055: 'AMAC 化纤',  
               3030056: 'AMAC 橡胶', 
               3030057: 'AMAC 矿物', 
               3030058: 'AMAC 钢铁', 
               3030059: 'AMAC 有色',    
               3030060: 'AMAC 金属', 
               3030061: 'AMAC 通用', 
               3030062: 'AMAC 专用', 
               3030063: 'AMAC 汽车',    
               3030064: 'AMAC 运输', 
               3030065: 'AMAC 电气', 
               3030066: 'AMAC 电子',  
               3030067: 'AMAC 仪表'}
data2['index_name'] = data2['index_id']
data2.index_name = data2.index_name.map(corrections)
data2 = data2[['date', 'skey', 'index_id', 'index_name', 'weight']]
assert(abs(data2.groupby(['date', 'index_id'])['weight'].sum() - 100).max() < 0.2)

Welcome to use Wind Quant API for Python (WindPy)!

COPYRIGHT (C) 2020 WIND INFORMATION CO., LTD. ALL RIGHTS RESERVED.
IN NO CIRCUMSTANCE SHALL WIND BE RESPONSIBLE FOR ANY DAMAGES OR LOSSES CAUSED BY USING WIND QUANT API FOR Python.
get index composition weight
0:06:35.820384
0:04:17.571555
get extra data


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [34]:
a = add.groupby('stock_list')['date'].unique().reset_index()
a['date'] = a['date'].apply(lambda x: tuple(i for i in x))
a.groupby('date')['stock_list'].unique()

date
(2020-08-01, 2020-08-02)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          [300848.SZ]
(2020-08-01, 2020-08-02, 2020-08-03)                                                                                                                                                                                                                                                                                                                                                                                                                 

In [2]:
his_data = pd.read_pickle(r'G:\AMAC_weight.pkl')
write_data(db1, 'index_memb', his_data)

3011046
           date     skey  index_id index_name weight
0      20200814  1600000   3011046    AMAC 金融   2.37
9      20200814  1600015   3011046    AMAC 金融   0.76
10     20200814  1600016   3011046    AMAC 金融   2.28
23     20200814  1600030   3011046    AMAC 金融   5.09
27     20200814  1600036   3011046    AMAC 金融   7.47
...         ...      ...       ...        ...    ...
82225  20200911  2300023   3011046    AMAC 金融      0
82234  20200911  2300033   3011046    AMAC 金融   0.63
82260  20200911  2300059   3011046    AMAC 金融   3.17
82505  20200911  2300309   3011046    AMAC 金融   0.04
82980  20200911  2300803   3011046    AMAC 金融   0.06

[2478 rows x 5 columns]
3011043
           date     skey  index_id index_name weight
1      20200814  1600004   3011043    AMAC 交运   2.01
5      20200814  1600009   3011043    AMAC 交运   9.36
8      20200814  1600012   3011043    AMAC 交运   0.24
11     20200814  1600017   3011043    AMAC 交运   0.67
12     20200814  1600018   3011043    AMAC 交运   2.03
...  

3030039
           date     skey  index_id index_name weight
36     20200814  1600054   3030039    AMAC 公共   1.15
244    20200814  1600292   3030039    AMAC 公共   1.12
271    20200814  1600323   3030039    AMAC 公共   6.32
480    20200814  1600593   3030039    AMAC 公共   2.12
576    20200814  1600706   3030039    AMAC 公共   0.44
...         ...      ...       ...        ...    ...
82990  20200911  2300815   3030039    AMAC 公共   2.38
82991  20200911  2300816   3030039    AMAC 公共   1.05
83031  20200911  2300859   3030039    AMAC 公共    0.8
83036  20200911  2300864   3030039    AMAC 公共   0.77
83039  20200911  2300867   3030039    AMAC 公共   1.36

[1332 rows x 5 columns]
3030054
           date     skey  index_id index_name weight
38     20200814  1600056   3030054    AMAC 医药   0.36
44     20200814  1600062   3030054    AMAC 医药    0.3
59     20200814  1600079   3030054    AMAC 医药   1.44
60     20200814  1600080   3030054    AMAC 医药      0
65     20200814  1600085   3030054    AMAC 医药   0.69
...  

3030041
           date     skey  index_id index_name weight
73     20200814  1600095   3030041    AMAC 农副   0.86
103    20200814  1600127   3030041    AMAC 农副   1.34
158    20200814  1600191   3030041    AMAC 农副   0.29
210    20200814  1600251   3030041    AMAC 农副   0.69
228    20200814  1600275   3030041    AMAC 农副      0
...         ...      ...       ...        ...    ...
82195  20200911  2002991   3030041    AMAC 农副   0.69
82336  20200911  2300138   3030041    AMAC 农副   1.19
82372  20200911  2300175   3030041    AMAC 农副   0.57
82464  20200911  2300268   3030041    AMAC 农副   0.28
82860  20200911  2300673   3030041    AMAC 农副   0.34

[1008 rows x 5 columns]
3011030
Empty DataFrame
Columns: [date, skey, index_id, index_name, weight]
Index: []
3030045
           date     skey  index_id index_name weight
84     20200814  1600107   3030045    AMAC 服装   3.21
112    20200814  1600137   3030045    AMAC 服装      0
118    20200814  1600146   3030045    AMAC 服装      0
145    20200814  1600177 

3030050
           date     skey  index_id index_name weight
695    20200814  1600836   3030050    AMAC 印刷      0
902    20200814  1601515   3030050    AMAC 印刷    9.5
1049   20200814  1603058   3030050    AMAC 印刷      0
1234   20200814  1603429   3030050    AMAC 印刷   17.3
1245   20200814  1603499   3030050    AMAC 印刷   2.43
...         ...      ...       ...        ...    ...
81429  20200911  2002191   3030050    AMAC 印刷  29.94
81466  20200911  2002229   3030050    AMAC 印刷   8.84
81829  20200911  2002599   3030050    AMAC 印刷    4.3
82052  20200911  2002836   3030050    AMAC 印刷   1.91
82158  20200911  2002951   3030050    AMAC 印刷   2.53

[273 rows x 5 columns]
3030048
           date     skey  index_id index_name weight
784    20200814  1600978   3030048    AMAC 家具      0
1011   20200814  1603008   3030048    AMAC 家具   3.81
1114   20200814  1603180   3030048    AMAC 家具   2.01
1130   20200814  1603208   3030048    AMAC 家具   6.55
1176   20200814  1603313   3030048    AMAC 家具    3.8
...   

In [10]:
read_daily(db1, 'index_memb', 20200911, 20200911, index_id=[3030049])

Unnamed: 0,date,skey,index_id,index_name,weight
0,20200911,1600103,3030049,AMAC 造纸,3.15
1,20200911,1600235,3030049,AMAC 造纸,1.02
2,20200911,1600308,3030049,AMAC 造纸,3.27
3,20200911,1600356,3030049,AMAC 造纸,1.49
4,20200911,1600433,3030049,AMAC 造纸,2.71
5,20200911,1600567,3030049,AMAC 造纸,8.31
6,20200911,1600793,3030049,AMAC 造纸,0.65
7,20200911,1600963,3030049,AMAC 造纸,3.04
8,20200911,1600966,3030049,AMAC 造纸,7.66
9,20200911,1603022,3030049,AMAC 造纸,1.07


### Write historical data

In [53]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    db = client[db_name]
    return db

def build_query(start_date=None, end_date=None, index_id=None):
    query = {}

    def parse_date(x):
        if type(x) == int:
            return x
        elif type(x) == str:
            if len(x) != 8:
                raise Exception("`date` must be YYYYMMDD format")
            return int(x)
        elif type(x) == datetime.datetime or type(x) == datetime.date:
            return x.strftime("%Y%m%d").astype(int)
        else:
            raise Exception("invalid `date` type: " + str(type(x)))

    if start_date is not None or end_date is not None:
        query['date'] = {}
        if start_date is not None:
            query['date']['$gte'] = parse_date(start_date)
        if end_date is not None:
            query['date']['$lte'] = parse_date(end_date)

    def parse_symbol(x):
        if type(x) == int:
            return x
        else:
            return int(x)

    if index_id:
        if type(index_id) == list or type(index_id) == tuple:
            query['index_id'] = {'$in': [parse_symbol(x) for x in index_id]}
        else:
            query['index_id'] = parse_symbol(index_id)
    
    return query

def write_data(db, name, df):
    collection = db[name]
    df1 = []
    for symbol in df['index_id'].unique():
        if symbol in collection.distinct('index_id'):
            symbol = int(symbol)
            print(symbol)
            m_ax = pd.DataFrame.from_records(collection.find({'index_id':{'$in':[symbol]}}).sort([('date',-1)]).skip(0).limit(1))['date'].values[0]
            df2 = df[(df['index_id'] == symbol) & (df['date'] > m_ax)]
            print(df2)
            df1 += [df2]
    df1 = pd.concat(df1).reset_index(drop=True)
    df1 = df1.to_dict('records')
    collection.insert_many(df1) 

def write_data1(db, name, df):
    collection = db[name]
    assert(len(set(df['skey'].unique()) & set(pd.DataFrame.from_records(collection.find({'date' : {'$in': [int(df['date'].iloc[0])]}, 'index_id':{'$gte': 3000000}}))['skey'].unique())) == 0)
    df = df.to_dict('records')
    collection.insert_many(df) 
    
def write_weight_data(db, name, df, index_id):
    collection = db[name]
    df = df[df['index_id'] == index_id]
    for (date, skey), sub_df in df.groupby(['date', 'skey']):
        date = int(date)
        skey = int(skey)
        weight = sub_df['weight'].values[0]
        collection.update({'skey': skey, 'date':date, 'index_id':index_id}, {'$set':{'weight':float(weight)}})

def remove_wrong_index_data(db, name, df):
    collection = db[name]
    for (date, skey), sub_df in df.groupby(['date', 'skey']):
        date = int(date)
        skey = int(skey)
        collection.remove({'skey': skey, 'date':date, 'index_id':{'$gte':3000000}})

def delete_data(db, name, start_date=None, end_date=None, index_id=None):
    collection = db[name]
    query = build_query(start_date, end_date, index_id)
    if not query:
        print('cannot delete the whole table')
        return None
    collection.delete_many(query)    
 
def read_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        if return_sdi:
            col = ['skey', 'date', 'interval'] + col
        for col_name in col:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    if start_date is not None:
        if end_date is not None:
            query['date'] = {'$gte': start_date, '$lte': end_date}
        else:
            query['date'] = {'$gte': start_date}
    elif end_date is not None:
        query['date'] = {'$lte': end_date}

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        df = pd.DataFrame()
    else:
        df = df.sort_values(by=['date', 'index_id', 'skey'])
    return df    


from WindPy import *
w.start()


import os
import glob
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',200)
pd.options.mode.chained_assignment = None
import numpy as np

database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

readPath = 'L:\ShareWithServer\day_stock_20200820\***'
dataPathLs = np.array(glob.glob(readPath))
db = []
for p in dataPathLs:
    dayData = pd.read_csv(p, compression='gzip')
    db += [dayData]
db = pd.concat(db).reset_index(drop=True)
db['skey'] = np.where(db['ID'].str[:2] == 'SH', db['ID'].str[2:].astype(int) + 1000000, db['ID'].str[2:].astype(int) + 2000000)


pd.set_option('max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)
kk = read_daily(db1, 'index_memb', 20170901, 20190522, index_id = [3011030])
date_list = np.sort(kk['date'].unique())
date_list = date_list[::-1]
index_list = db1['index_memb'].distinct('index_id')
index_list = ['H' + str(i)[2:] + '.CSI' for i in index_list if i > 3000000]
for d in date_list:
    data2 = []
    df1 = []
    add = []
    startTm = datetime.datetime.now()
    for i in index_list:
        data = w.wset("indexconstituent","date=%s; windcode=%s"%(d, i))
        df = pd.DataFrame(data=np.array(data.Data).T, columns=data.Fields)
        df['index_id'] = 3000000 + int(i[1:6])
        try:
            assert(abs(df['i_weight'].sum() - 100) < 0.2)
        except:
            print(abs(df['i_weight'].sum() - 100))
        df['skey'] = df['wind_code'].str[:-3].astype(int)
        df['skey'] = np.where(df['skey'] < 600000, df['skey'] + 2000000, df['skey'] + 1000000)
        df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
        df = df.rename(columns={'i_weight':"weight"})
        try:
            assert(len(set(df['skey'].unique()) - set(read_daily(db1, 'index_memb', int(d), int(d), 
                                                             index_id=[int(df['index_id'].iloc[0])])['skey'].unique())) == 0)
        except:
            add += [df[df['skey'].isin(set(df['skey'].unique()) - set(read_daily(db1, 'index_memb', int(d), int(d), 
                                                             index_id=[int(df['index_id'].iloc[0])])['skey'].unique()))][['date', 'skey', 'index_id', 'weight']]]
        try:
            assert(df[(df['weight'].isnull()) | (df['weight'] == 0)].shape[0] == 0)
        except:
            print(df[(df['weight'].isnull()) | (df['weight'] == 0)])
        df = df.fillna(0)
        write_weight_data(db1, 'index_memb', df, 3000000 + int(i[1:6]))
    if len(add) != 0:
        add = pd.concat(add).reset_index(drop=True)
        print(add)
    
    oridb = read_daily(db1, 'index_memb', int(d), int(d))
    da_te = str(d)[:4] + '-' + str(d)[4:6] + '-' + str(d)[6:8]
    stock_list = set(db[db['date'] == da_te]['skey'].unique()) - set(oridb[oridb['index_id'] > 3000000]['skey'].unique())
    stock_list =[str(i - 1000000).rjust(6, "0") + '.SH' if i < 2000000 else str(i - 2000000).rjust(6, "0") + '.SZ' for i in stock_list]
    if len(add) != 0:
        remove_wrong_index_data(db1, 'index_memb', add)
    
    for s in stock_list:
        add_data = w.wsd(s, "industry_CSRCcode12", da_te, da_te, "industryType=3;PriceAdj=F")
        assert(add_data.ErrorCode == 0)
        nd = pd.DataFrame(data=np.array(add_data.Data).T, columns=['Ind'])
        nd1 = pd.DataFrame(data=np.array(add_data.Times).T, columns=['DateTime'])
        nd = pd.concat([nd1, nd], axis=1)
        nd = nd[~nd['Ind'].isnull()]
        assert(nd.empty == False)
        nd['index_id'] = nd['Ind'].str[1:].astype(int)
        nd['date'] = nd['DateTime'].astype(str).apply(lambda x: x.replace('-', '')).astype(int)
        if s[-2:] == 'SZ':
            nd['skey'] = int(s[:-3]) + 2000000
        else:
            nd['skey'] = int(s[:-3]) + 1000000
        nd['weight'] = 0
        data2 += [nd]
    data2 = pd.concat(data2).reset_index(drop=True)
    assert(data2['weight'].unique() == [0])
    if len(add) != 0:
        assert(add['weight'].min() > 0)
        data2 = pd.concat([data2, add]).reset_index(drop=True)

    data2['index_id'] = np.where(data2.index_id <= 5, 3011030,np.where(data2.index_id <= 12, 3011031, \
                       np.where(data2.index_id == 13, 3030041,np.where(data2.index_id == 14, 3030042, \
                       np.where(data2.index_id == 15, 3030043,np.where(data2.index_id == 17, 3030044,\
                       np.where(data2.index_id == 18, 3030045,np.where(data2.index_id == 19, 3030046,\
                       np.where(data2.index_id == 20, 3030047,np.where(data2.index_id == 21, 3030048,\
                       np.where(data2.index_id == 22, 3030049,np.where(data2.index_id == 23, 3030050,\
                       np.where(data2.index_id == 24, 3030051,np.where(data2.index_id == 25, 3030052,\
                       np.where(data2.index_id == 26, 3030053,np.where(data2.index_id == 27, 3030054,\
                       np.where(data2.index_id == 28, 3030055,np.where(data2.index_id == 29, 3030056,\
                       np.where(data2.index_id == 30, 3030057,np.where(data2.index_id == 31, 3030058,\
                       np.where(data2.index_id == 32, 3030059,np.where(data2.index_id == 33, 3030060,\
                       np.where(data2.index_id == 34, 3030061,np.where(data2.index_id == 35, 3030062,\
                       np.where(data2.index_id == 36, 3030063,np.where(data2.index_id == 37, 3030064,\
                       np.where(data2.index_id == 38, 3030065,np.where(data2.index_id == 39, 3030066,\
                       np.where(data2.index_id == 40, 3030067,np.where(data2.index_id <= 43, 3011050,\
                       np.where(data2.index_id <= 46, 3011041,np.where(data2.index_id <= 50, 3011042,\
                       np.where(data2.index_id <= 52, 3011045,np.where(data2.index_id <= 60, 3011043,\
                       np.where(data2.index_id <= 62, 3030036,np.where(data2.index_id <= 65, 3011044,\
                       np.where(data2.index_id <= 69, 3011046,np.where(data2.index_id == 70, 3011047,\
                       np.where(data2.index_id <= 72, 3030037,np.where(data2.index_id <= 75, 3030038,\
                       np.where(data2.index_id <= 78, 3030039,np.where(data2.index_id <= 81, 3030040,\
                       np.where(data2.index_id == 82, 3011049,np.where(data2.index_id <= 84, 3030040,\
                       np.where(data2.index_id <= 89, 3011049,np.where(data2.index_id == 90, 3011050, data2['index_id']))))))))))))))))))))))))))))))))))))))))))))))
    assert(data2['index_id'].min() > 100)
    corrections = {3011030: 'AMAC 农林',    
                   3011031: 'AMAC 采矿', 
                   3011041: 'AMAC 公用', 
                   3011042: 'AMAC 建筑', 
                   3011043: 'AMAC 交运',    
                   3011044: 'AMAC 信息', 
                   3011045: 'AMAC 批零', 
                   3011046: 'AMAC 金融', 
                   3011047: 'AMAC 地产',    
                   3011049: 'AMAC 文体', 
                   3011050: 'AMAC 综企', 
                   3030036: 'AMAC 餐饮',  
                   3030037: 'AMAC 商务',    
                   3030038: 'AMAC 科技', 
                   3030039: 'AMAC 公共', 
                   3030040: 'AMAC 社会',
                   3030041: 'AMAC 农副', 
                   3030042: 'AMAC 食品', 
                   3030043: 'AMAC 饮料', 
                   3030044: 'AMAC 纺织',    
                   3030045: 'AMAC 服装', 
                   3030046: 'AMAC 皮革', 
                   3030047: 'AMAC 木材', 
                   3030048: 'AMAC 家具',    
                   3030049: 'AMAC 造纸', 
                   3030050: 'AMAC 印刷', 
                   3030051: 'AMAC 文教',  
                   3030052: 'AMAC 石化',    
                   3030053: 'AMAC 化学', 
                   3030054: 'AMAC 医药', 
                   3030055: 'AMAC 化纤',  
                   3030056: 'AMAC 橡胶', 
                   3030057: 'AMAC 矿物', 
                   3030058: 'AMAC 钢铁', 
                   3030059: 'AMAC 有色',    
                   3030060: 'AMAC 金属', 
                   3030061: 'AMAC 通用', 
                   3030062: 'AMAC 专用', 
                   3030063: 'AMAC 汽车',    
                   3030064: 'AMAC 运输', 
                   3030065: 'AMAC 电气', 
                   3030066: 'AMAC 电子',  
                   3030067: 'AMAC 仪表'}
    data2['index_name'] = data2['index_id']
    data2.index_name = data2.index_name.map(corrections)
    data2 = data2[['date', 'skey', 'index_id', 'index_name', 'weight']].sort_values(by=['date', 'index_id', 'skey'])

    write_data1(db1, 'index_memb', data2)
    print(str(d) + ' finished')
    print(datetime.datetime.now() - startTm)



0.3900000000000148
20190522 finished
0:00:35.878751
20190521 finished
0:00:39.388103
20190520 finished
0:00:32.619234
20190517 finished
0:00:35.935723
20190516 finished
0:00:35.809540
20190515 finished
0:00:34.666145
20190514 finished
0:00:35.766539
20190513 finished
0:00:38.326913
20190510 finished
0:00:33.189897
20190509 finished
0:00:36.757618
20190508 finished
0:00:32.572244
20190507 finished
0:00:33.399833
20190506 finished
0:00:32.536880
20190430 finished
0:00:29.456119
20190429 finished
0:00:33.976400
       date     skey  index_id weight
0  20190426  2300483   3011031   0.25
1  20190426  2300464   3011045   0.13
2  20190426  2002607   3011049   2.66
3  20190426  2000779   3030038   1.36
4  20190426  2000967   3030039   2.13
5  20190426  1603299   3030042   0.46
6  20190426  2000534   3030054    0.2
7  20190426  2002755   3030054   0.16
8  20190426  2000584   3030062   0.48
9  20190426  2300428   3030063   0.15


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




20190426 finished
0:00:36.394625
       date     skey  index_id weight
0  20190425  2300483   3011031   0.25
1  20190425  2300464   3011045   0.13
2  20190425  2002607   3011049   2.63
3  20190425  2000779   3030038   1.41
4  20190425  2000967   3030039   2.05
5  20190425  1603299   3030042   0.46
6  20190425  2000534   3030054   0.21
7  20190425  2002755   3030054   0.16
8  20190425  2000584   3030062   0.47
9  20190425  2300428   3030063   0.15
20190425 finished
0:00:33.776691
       date     skey  index_id weight
0  20190424  2300483   3011031   0.24
1  20190424  2300464   3011045   0.13
2  20190424  2002607   3011049   2.58
3  20190424  2000779   3030038   1.37
4  20190424  2000967   3030039   2.05
5  20190424  1603299   3030042   0.48
6  20190424  2000534   3030054   0.21
7  20190424  2002755   3030054   0.16
8  20190424  2000584   3030062   0.49
9  20190424  2300428   3030063   0.15
20190424 finished
0:00:32.751948
       date     skey  index_id weight
0  20190423  2300483   3011

20180905 finished
0:00:34.756820
20180904 finished
0:00:36.329032
20180903 finished
0:00:34.735782
20180831 finished
0:00:33.711684
20180830 finished
0:00:35.555470
20180829 finished
0:00:32.657170
20180828 finished
0:00:35.138014
20180827 finished
0:00:32.006620
20180824 finished
0:00:29.999748
20180823 finished
0:00:30.339208
20180822 finished
0:00:34.539798
20180821 finished
0:00:35.925002
20180820 finished
0:00:38.876627
20180817 finished
0:00:34.479121
20180816 finished
0:00:35.208523
20180815 finished
0:00:35.112488
20180814 finished
0:00:31.048646
20180813 finished
0:00:30.018371
       date     skey  index_id weight
0  20180810  1600293   3011045   0.28
20180810 finished
0:00:33.563878
       date     skey  index_id weight
0  20180809  1600293   3011045   0.28
20180809 finished
0:00:33.361401
20180808 finished
0:00:31.905668
20180807 finished
0:00:32.630957
20180806 finished
0:00:35.257556
        date     skey  index_id weight
0   20180803  1603619   3011031   0.25
1   2018080

20180305 finished
0:00:30.933368
       date     skey  index_id weight
0  20180302  1601360   3011044   1.71
20180302 finished
0:00:35.403519
       date     skey  index_id weight
0  20180301  1601360   3011044   1.67
20180301 finished
0:00:32.221581
       date     skey  index_id weight
0  20180228  1601360   3011044   1.87
20180228 finished
0:00:31.867119
       date     skey  index_id weight
0  20180227  1601360   3011044   1.04
20180227 finished
0:00:40.603818
1.080000000000041
20180226 finished
0:00:36.780545
20180223 finished
0:00:31.811543
20180222 finished
0:00:29.819219
20180214 finished
0:00:34.600084
20180213 finished
0:00:30.656802
20180212 finished
0:00:35.194588
        date     skey  index_id weight
0   20180209  2000923   3011031   0.41
1   20180209  2000711   3011042   0.54
2   20180209  2002061   3011042   0.81
3   20180209  2000885   3011043   0.35
4   20180209  2002447   3011044   0.48
5   20180209  2000567   3011046   0.07
6   20180209  2000069   3011047   2.86
7  

20170928 finished
0:00:31.016095
20170927 finished
0:00:29.680403
20170926 finished
0:00:36.138074
20170925 finished
0:00:32.495696
       date     skey  index_id weight
0  20170922  2002259   3011041   0.48
20170922 finished
0:00:30.339477
       date     skey  index_id weight
0  20170921  2002259   3011041   0.47
20170921 finished
0:00:36.049720
       date     skey  index_id weight
0  20170920  2002259   3011041   0.48
20170920 finished
0:00:30.732923
20170919 finished
0:00:33.502795
20170918 finished
0:00:34.426776
        date     skey  index_id weight
0   20170915  2000593   3011041    0.3
1   20170915  2000040   3011042   0.73
2   20170915  2002374   3011042    0.3
3   20170915  1600970   3011042   1.15
4   20170915  2002120   3011043   0.74
5   20170915  2002468   3011043    1.2
6   20170915  2000676   3011044   0.31
7   20170915  2000835   3011044   0.25
8   20170915  2002137   3011044   0.16
9   20170915  2002359   3011044   0.77
10  20170915  2002425   3011044   0.25
11  201

In [107]:
k1 = read_daily(db1, 'index_memb', 20170929, 20170929)
k1[k1['index_id'] > 3000000].shape[0]
k1 = k1[k1['index_id'] > 3000000]
k1

Unnamed: 0,date,skey,index_id,index_name,weight
0,20170929,1600097,3011030,AMAC 农林,0.98
1,20170929,1600108,3011030,AMAC 农林,3.38
2,20170929,1600257,3011030,AMAC 农林,1.14
6741,20170929,1600265,3011030,AMAC 农林,0.00
3,20170929,1600313,3011030,AMAC 农林,1.19
...,...,...,...,...,...
3771,20170929,2300648,3030067,AMAC 仪表,1.56
3772,20170929,2300667,3030067,AMAC 仪表,0.00
3773,20170929,2300720,3030067,AMAC 仪表,0.00
3774,20170929,2300800,3030067,AMAC 仪表,0.00


In [108]:
k2 = read_daily(db1, 'index_memb', 20170928, 20170928)
k2[k2['index_id'] > 3000000].shape[0]
k2 = k2[k2['index_id'] > 3000000]

In [110]:
re = pd.merge(k1, k2, on=['index_id', 'skey'], how='outer')
re[((re['weight_x'] == 0) & (re['weight_y'] != 0)) |
  ((re['weight_y'] == 0) & (re['weight_x'] != 0)) ]

Unnamed: 0,date_x,skey,index_id,index_name_x,weight_x,date_y,index_name_y,weight_y
228,20170929,2002893,3011041,AMAC 公用,0.15,20170928,AMAC 公用,0.0
1722,20170929,2002899,3030051,AMAC 文教,4.83,20170928,AMAC 文教,0.0
1759,20170929,1600315,3030053,AMAC 化学,0.0,20170928,AMAC 化学,1.98
2380,20170929,2002392,3030057,AMAC 矿物,0.0,20170928,AMAC 矿物,0.9
2441,20170929,1600110,3030059,AMAC 有色,2.79,20170928,AMAC 有色,0.0
2612,20170929,1603321,3030061,AMAC 通用,0.46,20170928,AMAC 通用,0.0
2685,20170929,2300091,3030061,AMAC 通用,0.0,20170928,AMAC 通用,1.43
3195,20170929,1603861,3030065,AMAC 电气,0.13,20170928,AMAC 电气,0.0
3209,20170929,2000633,3030065,AMAC 电气,0.19,20170928,AMAC 电气,0.0
3712,20170929,2300701,3030066,AMAC 电子,0.06,20170928,AMAC 电子,0.0


In [112]:
re[re['weight_x'] != re['weight_y']]

Unnamed: 0,date_x,skey,index_id,index_name_x,weight_x,date_y,index_name_y,weight_y
1,20170929,1600108,3011030,AMAC 农林,3.38,20170928,AMAC 农林,3.43
2,20170929,1600257,3011030,AMAC 农林,1.14,20170928,AMAC 农林,1.15
5,20170929,1600354,3011030,AMAC 农林,1.44,20170928,AMAC 农林,1.45
8,20170929,1600467,3011030,AMAC 农林,1.43,20170928,AMAC 农林,1.44
12,20170929,1600965,3011030,AMAC 农林,1.92,20170928,AMAC 农林,1.78
...,...,...,...,...,...,...,...,...
3784,20170929,2300515,3030067,AMAC 仪表,0.96,20170928,AMAC 仪表,0.94
3786,20170929,2300557,3030067,AMAC 仪表,0.85,20170928,AMAC 仪表,0.87
3787,20170929,2300567,3030067,AMAC 仪表,2.58,20170928,AMAC 仪表,2.55
3788,20170929,2300572,3030067,AMAC 仪表,1.09,20170928,AMAC 仪表,1.10


In [94]:
kk = read_daily(db1, 'index_memb', 20200102, 20200911, index_id = [3011030])
date_list = np.sort(kk['date'].unique())
db1 = DB("192.168.10.178", database_name, user, password)
for i in date_list[:1]:
    test = read_daily(db1, 'index_memb', int(i), int(i))
    test = test[test['index_id'] > 3000000]
    df = test.groupby(['index_id'])['weight'].sum().reset_index()
    df['date'] = i
for i in date_list[2:]:
    test = read_daily(db1, 'index_memb', int(i), int(i))
    test = test[test['index_id'] > 3000000]
    df1 = test.groupby(['index_id'])['weight'].sum().reset_index()
    df1['date'] = i
    df = pd.concat([df, df1], axis=0)

In [97]:
df['weight'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])

count    7267.000000
mean       99.999162
std         0.025326
min        99.850000
10%        99.970000
20%        99.980000
30%        99.990000
40%       100.000000
50%       100.000000
60%       100.000000
70%       100.010000
80%       100.020000
90%       100.030000
95%       100.040000
99%       100.070000
max       100.130000
Name: weight, dtype: float64

In [98]:
df['weight_diff'] = (df['weight'] - 100).apply(lambda x: abs(x))
df['weight_diff'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])

count    7.267000e+03
mean     1.781340e-02
std      1.802129e-02
min      0.000000e+00
10%      1.421085e-14
20%      4.263256e-14
30%      1.000000e-02
40%      1.000000e-02
50%      1.000000e-02
60%      2.000000e-02
70%      2.000000e-02
80%      3.000000e-02
90%      4.000000e-02
95%      5.000000e-02
99%      8.000000e-02
max      1.500000e-01
Name: weight_diff, dtype: float64