In [1]:
import datetime
import gzip
import lzma
import numbers
import pickle
import time
from urllib.parse import quote_plus

import pandas as pd
import pymongo
import pytz


def tickDB(host, db_name, user, passwd):
    """Create a ``DBObj`` for the database ``db_name`` on ``host``.

    Args:
        host: MongoDB host, inserted verbatim into the connection URI.
        db_name: Database to open. It is also used as ``authSource`` unless
            ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'``
            is used.
        user: Username to authenticate with.
        passwd: Password for ``user``.

    Returns:
        A ``DBObj`` built from the resulting URI and ``db_name``.
    """
    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Percent-escape the credentials: characters such as '@', ':', '/' or '%'
    # would otherwise be parsed as URI delimiters and break the connection.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    """Reader for a MongoDB database whose documents carry serialized segments.

    Each document read by :meth:`read` is expected to have ``symbol``,
    ``date``, ``start``, ``ver`` and ``data`` fields, where ``data`` is a
    pickled payload compressed according to ``ver`` (see :meth:`ser` /
    :meth:`deser`). The decoded payloads are concatenated with ``pd.concat``.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select the database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: Stored on the instance as ``symbol_column``.
            db_name: Name of the database to select on the client.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        # The three attributes below are stored but not referenced by any
        # method of this class.
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``uri`` into tokens after removing the ``mongodb://`` prefix.

        ``'mongodb://user:password@example.com'`` becomes
        ``['user', 'password', 'example.com']``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    @staticmethod
    def _parse_date(x):
        """Normalize a date given as str, int or date/datetime to ``'YYYYMMDD'``.

        Raises:
            ValueError: a string (or integer rendered as a string) is not
                8 characters long.
            TypeError: ``x`` is of an unsupported type.
        """
        if isinstance(x, str):
            if len(x) != 8:
                raise ValueError("`date` must be YYYYMMDD format")
            return x
        # datetime.datetime is a subclass of datetime.date, so this also
        # accepts datetime subclasses.
        if isinstance(x, datetime.date):
            return x.strftime("%Y%m%d")
        # numbers.Integral also covers numpy integer scalars; bool is excluded
        # because it is an int subclass but never a meaningful date.
        if isinstance(x, numbers.Integral) and not isinstance(x, bool):
            return DBObj._parse_date(str(int(x)))
        raise TypeError("invalid `date` type: " + str(type(x)))

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound (str ``YYYYMMDD``, int, date or
                datetime), or ``None`` for no lower bound.
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol, or a list/tuple of symbols; every value
                is converted with ``int``. ``None`` and empty
                strings/lists/tuples mean "no symbol filter".

        Returns:
            A dict with an optional ``'date'`` range and an optional
            ``'symbol'`` condition; empty when no argument was supplied.
        """
        query = {}

        date_range = {}
        if start_date is not None:
            date_range['$gte'] = self._parse_date(start_date)
        if end_date is not None:
            date_range['$lte'] = self._parse_date(end_date)
        if date_range:
            query['date'] = date_range

        # Test for "not provided" explicitly instead of by truthiness, so a
        # symbol of 0 is still turned into a filter.
        no_symbol = symbol is None or (
            isinstance(symbol, (str, list, tuple)) and len(symbol) == 0)
        if not no_symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load and concatenate every segment of ``table_name`` matching the filter.

        Returns:
            The segments' decoded ``data`` payloads concatenated with
            ``ignore_index=True``, ordered by ``(symbol, date, start)``.
            ``None`` when nothing matches, or when no filter was given (in
            which case a message is printed and no query is run).
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segments = []
        for doc in collection.find(query):
            order_key = (doc['symbol'], doc['date'], doc['start'])
            segments.append((order_key, self.deser(doc['data'], doc['ver'])))
        if not segments:
            return None
        segments.sort(key=lambda item: item[0])
        return pd.concat([frame for _, frame in segments], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in the selected database."""
        try:
            return self.db.list_collection_names()
        except TypeError:
            # Drivers that predate list_collection_names resolve the unknown
            # attribute to a (non-callable) collection; fall back to the old
            # API there. Newer drivers no longer provide collection_names.
            return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of matching documents.

        Missing bounds default to ``'00000000'`` / ``'99999999'`` so the query
        is never empty.
        """
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip (level 2), version 2 uses lzma (preset 1).

        Raises:
            ValueError: ``version`` is neither 1 nor 2.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise ValueError('unknown version')

    def deser(self, s, version):
        """Inverse of :meth:`ser`: decompress ``s`` and unpickle it.

        Raises:
            ValueError: ``version`` is neither 1 nor 2.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the payload.
        # Only use this against databases whose contents are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise ValueError('unknown version')


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on old pandas.

    When the installed pandas is older than 0.24 and that module path is not
    already in ``sys.modules``, a synthetic module exposing ``BlockManager``
    is registered under it. Presumably this lets pickles that reference the
    newer module path be loaded on old pandas -- confirm against the data
    producers. On pandas 0.24 or newer this is a no-op.
    """
    def leading_int(text):
        # Integer value of the leading digits of ``text`` ('24rc1' -> 24).
        digits = ''
        for ch in text:
            if not ch.isdigit():
                break
            digits += ch
        return int(digits) if digits else 0

    # Compare (major, minor) numerically; comparing the raw version strings is
    # lexicographic and misorders e.g. '0.9' against '0.24'.
    parts = (str(pd.__version__).split('.') + ['0', '0'])[:2]
    if tuple(leading_int(part) for part in parts) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m


patch_pandas_pickle()


def dailyDB(host, db_name, user, passwd):
    """Connect to MongoDB on ``host`` and return the database ``db_name``.

    Args:
        host: MongoDB host, inserted verbatim into the connection URI.
        db_name: Database to return. It is also used as ``authSource`` unless
            ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'``
            is used.
        user: Username to authenticate with.
        passwd: Password for ``user``.

    Returns:
        The ``db_name`` database object of a client created with
        ``maxPoolSize=None``.
    """
    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Percent-escape the credentials: characters such as '@', ':', '/' or '%'
    # would otherwise be parsed as URI delimiters and break the connection.
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    return client[db_name]

def read_filter_daily(db, name, start_date=None, end_date=None, skey=None, interval=None, col=None, return_sdi=True):
    """Query collection ``name`` of ``db`` and return the rows as a DataFrame.

    Args:
        db: Mapping-like database object; ``db[name]`` must offer
            ``find(query, projection)``.
        name: Collection name.
        start_date: Inclusive lower bound on ``date``; ``None`` for no bound.
        end_date: Inclusive upper bound on ``date``; ``None`` for no bound.
        skey: Values for an ``$in`` filter on ``skey``; ``None`` for no filter.
        interval: Values for an ``$in`` filter on ``interval``; ``None`` for
            no filter.
        col: Column name or iterable of column names to project. ``None``
            returns every field except ``_id``.
        return_sdi: When ``col`` is given, also project ``skey`` and ``date``.

    Returns:
        A DataFrame sorted by whichever of ``date`` and ``skey`` are present,
        or an empty DataFrame when nothing matches.
    """
    collection = db[name]

    # Build projection (Mongo's _id is always dropped).
    prj = {'_id': 0}
    if col is not None:
        wanted = [col] if isinstance(col, str) else list(col)
        if return_sdi:
            wanted = ['skey', 'date'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()
    # With return_sdi=False the sort keys may have been projected away; sort
    # only by the keys that are actually present instead of raising KeyError.
    sort_keys = [key for key in ('date', 'skey') if key in df.columns]
    return df.sort_values(by=sort_keys) if sort_keys else df

def read_memb_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    """Query collection ``name`` of ``db`` and return the rows as a DataFrame.

    Args:
        db: Mapping-like database object; ``db[name]`` must offer
            ``find(query, projection)``.
        name: Collection name.
        start_date: Inclusive lower bound on ``date``; ``None`` for no bound.
        end_date: Inclusive upper bound on ``date``; ``None`` for no bound.
        skey: Values for an ``$in`` filter on ``skey``; ``None`` for no filter.
        index_id: Values for an ``$in`` filter on ``index_id``; ``None`` for
            no filter.
        interval: Values for an ``$in`` filter on ``interval``; ``None`` for
            no filter.
        col: Column name or iterable of column names to project. ``None``
            returns every field except ``_id``.
        return_sdi: When ``col`` is given, also project ``skey``, ``date``
            and ``index_id``.

    Returns:
        A DataFrame sorted by whichever of ``date``, ``index_id`` and ``skey``
        are present, or an empty DataFrame when nothing matches.
    """
    collection = db[name]

    # Build projection (Mongo's _id is always dropped).
    prj = {'_id': 0}
    if col is not None:
        wanted = [col] if isinstance(col, str) else list(col)
        if return_sdi:
            wanted = ['skey', 'date', 'index_id'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()
    # With return_sdi=False the sort keys may have been projected away; sort
    # only by the keys that are actually present instead of raising KeyError.
    sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
    return df.sort_values(by=sort_keys) if sort_keys else df


def read_stock_daily(db, name, start_date=None, end_date=None, skey=None, index_name=None, interval=None, col=None, return_sdi=True):
    """Query collection ``name`` of ``db`` and return the rows as a DataFrame.

    Args:
        db: Mapping-like database object; ``db[name]`` must offer
            ``find(query, projection)``.
        name: Collection name.
        start_date: Inclusive lower bound on ``date``; ``None`` for no bound.
        end_date: Inclusive upper bound on ``date``; ``None`` for no bound.
        skey: Values for an ``$in`` filter on ``skey``; ``None`` for no filter.
        index_name: Values for an ``$in`` filter on ``index_name``; ``None``
            for no filter.
        interval: Accepted but not used by this function.
            NOTE(review): confirm whether an ``interval`` filter was intended.
        col: Column name or iterable of column names to project. ``None``
            returns every field except ``_id``.
        return_sdi: When ``col`` is given, also project ``skey`` and ``date``.

    Returns:
        A DataFrame sorted by whichever of ``date`` and ``skey`` are present,
        or an empty DataFrame when nothing matches.
    """
    collection = db[name]

    # Build projection (Mongo's _id is always dropped).
    prj = {'_id': 0}
    if col is not None:
        wanted = [col] if isinstance(col, str) else list(col)
        if return_sdi:
            wanted = ['skey', 'date'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_name is not None:
        query['index_name'] = {'$in': index_name}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()
    # With return_sdi=False the sort keys may have been projected away; sort
    # only by the keys that are actually present instead of raising KeyError.
    sort_keys = [key for key in ('date', 'skey') if key in df.columns]
    return df.sort_values(by=sort_keys) if sort_keys else df


def read_beta_daily(db, name, start_date=None, end_date=None, skey=None, interval=None, col=None, return_sdi=True):
    """Query collection ``name`` of ``db`` and return the rows as a DataFrame.

    Args:
        db: Mapping-like database object; ``db[name]`` must offer
            ``find(query, projection)``.
        name: Collection name.
        start_date: Inclusive lower bound on ``date``; ``None`` for no bound.
        end_date: Inclusive upper bound on ``date``; ``None`` for no bound.
        skey: Values for an ``$in`` filter on ``skey``; ``None`` for no filter.
        interval: Values for an ``$in`` filter on ``interval``; ``None`` for
            no filter.
        col: Column name or iterable of column names to project. ``None``
            returns every field except ``_id``.
        return_sdi: When ``col`` is given, also project ``skey`` and ``date``.

    Returns:
        A DataFrame sorted by whichever of ``date`` and ``skey`` are present,
        or an empty DataFrame when nothing matches.
    """
    collection = db[name]

    # Build projection (Mongo's _id is always dropped).
    prj = {'_id': 0}
    if col is not None:
        wanted = [col] if isinstance(col, str) else list(col)
        if return_sdi:
            wanted = ['skey', 'date'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()
    # With return_sdi=False the sort keys may have been projected away; sort
    # only by the keys that are actually present instead of raising KeyError.
    sort_keys = [key for key in ('date', 'skey') if key in df.columns]
    return df.sort_values(by=sort_keys) if sort_keys else df


import getpass
import os
import sys
import zipfile

database_name = 'com_md_eq_cn'
# Credentials are taken from the environment (MONGO_USER / MONGO_PASSWORD),
# with an interactive prompt as fallback, so no password lives in the
# notebook. The password that used to be hard-coded here should be rotated.
user = os.environ.get('MONGO_USER', 'zhenyuy')
password = os.environ.get('MONGO_PASSWORD') or getpass.getpass('MongoDB password for %s: ' % user)

# Use the fully qualified option name: the bare 'max_columns' pattern can match
# more than one option on newer pandas and is then rejected.
pd.set_option('display.max_columns', 200)
db1 = tickDB("192.168.10.178", database_name, user, password)
db = dailyDB("192.168.10.178", database_name, user, password)

# Dates in the requested range on which index 1000300 has membership rows.
dateLs = read_memb_daily(db, 'index_memb', index_id=[1000300], start_date=20201228, end_date=20201229)['date'].unique()
for date in dateLs:
    # Membership weights stored in the database, excluding four index ids.
    data1 = read_memb_daily(db, 'index_memb', start_date=int(date), end_date=int(date))
    data1 = data1[~data1['index_id'].isin([1000300, 1000852, 1000905, 1000985])]

    # Weights published in the archive saved for the same date; the first
    # archive member is read as an Excel sheet. The context managers make sure
    # the archive handle is closed on every iteration.
    with zipfile.ZipFile('E:\\AMAC\\AMAC_' + str(date) + '.zip', 'r') as f:
        with f.open(f.namelist()[0]) as sheet:
            data2 = pd.read_excel(sheet)
    # Insert a space after the first four characters of the index name to build
    # the join key 'index_name'.
    data2['index_name'] = data2['指数名称\nIndex Name'].str[:4] + ' ' + data2['指数名称\nIndex Name'].str[4:]

    # Compare the per-index weight totals, rounded to two decimals.
    re2 = data2.groupby('index_name')['权重(%)\nWeight(%)'].sum().reset_index()
    re1 = data1.groupby('index_name')['weight'].sum().reset_index()
    re2['权重(%)\nWeight(%)'] = re2['权重(%)\nWeight(%)'].round(2)
    re1['weight'] = re1['weight'].round(2)
    re = pd.merge(re1, re2, on='index_name', how='outer')

    # NaN != NaN, so indexes present on only one side are reported as well.
    mismatch = re[re['weight'] != re['权重(%)\nWeight(%)']]
    if len(mismatch) != 0:
        print(date)
        display(mismatch)

In [19]:
import numpy as np
# Number of distinct constituent codes among the data1 rows whose index_name
# is 'AMAC 地产'.
len(data1.loc[data1['index_name'] == 'AMAC 地产', '成分券代码\nConstituent Code'].unique())

122

In [22]:
# Constituent codes that data2 lists under 'AMAC 地产' but that do not appear
# among the non-zero-weight 'AMAC 地产' rows of data1.
set(data2.loc[data2['index_name'] == 'AMAC 地产', '成分券代码\nConstituent Code'].unique()).difference(
    data1.loc[(data1['index_name'] == 'AMAC 地产') & (data1['weight'] != 0), '成分券代码\nConstituent Code'].unique()
)

{558}

In [23]:
# Show every data2 row whose constituent code equals 558.
data2.loc[data2['成分券代码\nConstituent Code'] == 558]

Unnamed: 0,日期\nDate,指数代码\nIndex Code,指数名称\nIndex Name,指数英文名称\nIndex Name(Eng.),成分券代码\nConstituent Code,成分券名称\nConstituent Name,成分券英文名称\nConstituent Name(Eng.),交易所\nExchange,权重(%)\nWeight(%),交易货币\nTrading Currency,index_name
958,2020-12-15,H11047,AMAC地产,AMAC Real Estate Index,558,莱茵体育,"LANDER SPORTS DEVELOPMENT CO.,LTD.",Shenzhen,0.2,CNY,AMAC 地产


In [40]:
import numpy as np
# Use the fully qualified option name: the bare 'max_columns' pattern can match
# more than one option on newer pandas and is then rejected.
pd.set_option('display.max_columns', 200)
db1 = tickDB("192.168.10.178", database_name, user, password)
db = dailyDB("192.168.10.178", database_name, user, password)

# Distinct dates (in first-seen order) on which index 1000300 has membership rows.
dateLs = read_memb_daily(db, 'index_memb', index_id=[1000300], start_date=20200813, end_date=20201101)['date'].drop_duplicates()
# Weekday of each date (Monday == 0). Parsing the YYYYMMDD value directly avoids
# the epoch-seconds round trip through datetime.fromtimestamp, whose result
# depends on the machine's local timezone, and the deprecated 'Z'-suffixed
# np.datetime64 literal.
dateLs1 = pd.to_datetime(dateLs.astype(str), format='%Y%m%d').dt.weekday.tolist()
# Keep only the Thursdays (weekday 3) as plain ints.
dateLs = [int(day) for day, weekday in zip(dateLs, dateLs1) if weekday == 3]
dateLs

  import sys
  if __name__ == '__main__':


[20200813,
 20200820,
 20200827,
 20200903,
 20200910,
 20200917,
 20200924,
 20201015,
 20201022,
 20201029]

In [4]:
import datetime
from urllib.request import urlretrieve

# Fetch the archive at `url` and store it under savePath with today's date
# (YYYYMMDD) in the file name.
url = 'http://www.csindex.com.cn/uploads/indices/amac/files/csrccwf.zip'
savePath = 'E:\\AMAC'
updateDate = datetime.date.today().strftime('%Y%m%d')
fileName = '{}\\AMAC_{}.zip'.format(savePath, updateDate)
urlretrieve(url, fileName)

('E:\\AMAC\\AMAC_20201218.zip', <http.client.HTTPMessage at 0x1d1e5654a08>)