In [39]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    """Connect to MongoDB and return a handle to the database ``db_name``.

    Parameters
    ----------
    host : str
        Host (optionally ``host:port``) placed verbatim in the connection URI.
    db_name : str
        Database to return; also used as the authentication database unless
        ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'`` is used.
    user, passwd : str
        Plain (un-escaped) credentials.

    Returns
    -------
    The ``client[db_name]`` database object.
    """
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Reserved characters (@ : / % ...) in the credentials must be
    # percent-escaped, otherwise the URI is parsed incorrectly.
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(user), quote_plus(passwd), host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    return client[db_name]

def build_query(start_date=None, end_date=None, index_id=None):
    """Build a MongoDB filter on ``date`` and ``index_id``.

    Parameters
    ----------
    start_date, end_date : int, str, datetime.date or datetime.datetime, optional
        Inclusive bounds. Integers (including numpy integers) are used as-is,
        strings must be exactly 8 characters (``YYYYMMDD``), and date/datetime
        objects are converted to a ``YYYYMMDD`` integer.
    index_id : scalar, list or tuple, optional
        One index id, or several (translated to an ``$in`` clause). Every id is
        coerced with ``int()``. A falsy value adds no ``index_id`` filter.

    Returns
    -------
    dict
        The filter; empty when no argument was supplied.

    Raises
    ------
    Exception
        On a date string that is not 8 characters long, or an unsupported date
        type.
    """
    query = {}

    def parse_date(x):
        # bool is a subclass of int but is never a meaningful date.
        if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
            # int() turns numpy integers into native ints that BSON can encode.
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise Exception("`date` must be YYYYMMDD format")
            return int(x)
        # datetime.datetime is a subclass of datetime.date.
        if isinstance(x, datetime.date):
            # strftime returns a str, so convert with int(); str has no .astype.
            return int(x.strftime("%Y%m%d"))
        raise Exception("invalid `date` type: " + str(type(x)))

    if start_date is not None or end_date is not None:
        query['date'] = {}
        if start_date is not None:
            query['date']['$gte'] = parse_date(start_date)
        if end_date is not None:
            query['date']['$lte'] = parse_date(end_date)

    if index_id:
        if isinstance(index_id, (list, tuple)):
            query['index_id'] = {'$in': [int(x) for x in index_id]}
        else:
            query['index_id'] = int(index_id)

    return query

# def write_data(db, name, df):
#     collection = db[name]
#     # drop all records with same index_id and same time
#     for symbol in df['index_id'].unique():
#         if symbol in collection.distinct('index_id'):
#             start_date = df[df['index_id'] == symbol]['date'].min()
#             end_date = df[df['index_id'] == symbol]['date'].max()
#             query = build_query(start_date, end_date, index_id=symbol)
#             collection.delete_many(query)
#     df = df.to_dict('records')
#     collection.insert_many(df) 

def write_data(db, name, df):
    """Insert the rows of ``df`` that are newer than what collection ``name`` holds.

    For every ``index_id`` in ``df`` that already exists in the collection, the
    latest stored ``date`` for that id is looked up and only rows with a
    strictly later ``date`` are kept. Rows of ids not yet in the collection are
    all kept. Nothing is inserted when no row survives.

    Parameters
    ----------
    db : mapping of collection name -> collection
    name : str
        Collection name.
    df : pandas.DataFrame
        Must contain ``index_id`` and ``date`` columns.
    """
    collection = db[name]
    # One round trip instead of one `distinct` call per symbol.
    existing_ids = set(collection.distinct('index_id'))

    # Build a row mask rather than reassigning `df` inside the loop, which
    # would throw away the rows of every other index_id.
    keep = pd.Series(True, index=df.index)
    for symbol in df['index_id'].unique():
        if symbol not in existing_ids:
            continue
        # numpy scalars cannot be BSON-encoded; convert to a native value.
        key = symbol.item() if hasattr(symbol, 'item') else symbol
        latest = collection.find_one({'index_id': key}, {'_id': 0, 'date': 1},
                                     sort=[('date', -1)])
        if latest is None:
            continue
        keep &= ~((df['index_id'] == symbol) & (df['date'] <= latest['date']))

    records = df[keep].to_dict('records')
    if records:  # insert_many rejects an empty list
        collection.insert_many(records)
    
def write_weight_data(db, name, df, index_id):
    """Set the ``weight`` field of existing documents for one index.

    ``df`` is restricted to rows whose ``index_id`` equals the argument and
    grouped by ``(date, skey)``. For each group the first ``weight`` value is
    written with ``$set`` to the document matching ``skey``, ``date`` and
    ``index_id``. Documents that do not exist are not created.

    Parameters
    ----------
    db : mapping of collection name -> collection
    name : str
        Collection name.
    df : pandas.DataFrame
        Must contain ``index_id``, ``date``, ``skey`` and ``weight`` columns.
    index_id
        Index identifier to update.
    """
    collection = db[name]
    selected = df[df['index_id'] == index_id]
    for (date, skey), sub_df in selected.groupby(['date', 'skey']):
        weight = sub_df['weight'].values[0]
        # `Collection.update` was removed in PyMongo 4; `update_one` is the
        # equivalent single-document update.
        collection.update_one(
            {'skey': int(skey), 'date': int(date), 'index_id': index_id},
            {'$set': {'weight': float(weight)}})

def delete_data(db, name, start_date=None, end_date=None, index_id=None):
    """Delete the documents of collection ``name`` selected by the given filters.

    The filter comes from ``build_query(start_date, end_date, index_id)``. If it
    is empty the call is refused with a printed message, so the whole
    collection can never be wiped by accident. Always returns ``None``.
    """
    target = db[name]
    criteria = build_query(start_date, end_date, index_id)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
    return None
 
def read_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    """Read documents from collection ``name`` into a DataFrame.

    Parameters
    ----------
    db : mapping of collection name -> collection
    name : str
        Collection name.
    start_date, end_date : optional
        Inclusive bounds on the ``date`` field, passed to the query unchanged.
    skey, index_id, interval : scalar or list-like, optional
        Values matched with ``$in``; a scalar is treated as a one-element list.
    col : str or list of str, optional
        Fields to project. ``None`` returns every field except ``_id``.
    return_sdi : bool
        When ``col`` is given, also project ``skey``, ``date`` and ``interval``.

    Returns
    -------
    pandas.DataFrame
        Sorted by whichever of ``date``, ``index_id``, ``skey`` are present in
        the result; an empty DataFrame when nothing matches.
    """
    def _as_list(value):
        # `$in` needs an array; accept scalars and pandas/numpy containers too.
        if isinstance(value, (np.ndarray, pd.Series)):
            return value.tolist()
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    collection = db[name]

    # Build projection
    prj = {'_id': 0}
    if col is not None:
        wanted = [col] if isinstance(col, str) else list(col)
        if return_sdi:
            wanted = ['skey', 'date', 'interval'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': _as_list(skey)}
    if index_id is not None:
        query['index_id'] = {'$in': _as_list(index_id)}
    if interval is not None:
        query['interval'] = {'$in': _as_list(interval)}
    date_filter = {}
    if start_date is not None:
        date_filter['$gte'] = start_date
    if end_date is not None:
        date_filter['$lte'] = end_date
    if date_filter:
        query['date'] = date_filter

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()
    # A projection may drop some sort keys (e.g. `index_id` when `col` is
    # given); sorting on a missing column would raise KeyError.
    sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df




import getpass
import os

database_name = 'com_md_eq_cn'
# The password is never stored in the notebook: it is read from the
# MONGO_PASSWORD environment variable, falling back to an interactive prompt.
# NOTE(review): a password was previously committed in this cell and should be
# rotated.
user = os.environ.get('MONGO_USER', 'zhenyuy')
password = os.environ.get('MONGO_PASSWORD') or getpass.getpass('MongoDB password for %s: ' % user)

# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)
# 1. read data
kk = read_daily(db1, 'index_memb', 20200813, 20200813)
# kk = read_daily(db1, 'index_memb', 20170502, 20170503, index_id=[1000300])

# 2. upload data
# write_data(db1, 'index_memb', IF[IF['date'].isin([20170502])])

# 3. delete data
# delete_data(db1, 'index_memb', start_date=20170502, end_date=20170502)

# 4. update weight column
# write_weight_data(db1, 'index_memb', test, 1000300)

In [40]:
# DESTRUCTIVE: removes every 'index_memb' document whose index_id is 1000300 or
# 1000905 and whose date lies in 20170502..20200827 (both bounds inclusive).
delete_data(db1, 'index_memb', start_date=20170502, end_date=20200827, index_id=[1000300, 1000905])

In [41]:
def _prepare_weight_table(path, index_id, index_name):
    """Load a pickled weight table and reshape it for the 'index_memb' collection.

    ``date`` becomes a YYYYMMDD integer. ``ID`` (two-letter exchange prefix
    followed by a numeric code) becomes the integer ``skey``: code + 2000000
    for the 'SZ' prefix, code + 1000000 otherwise. ``index_id`` and
    ``index_name`` columns are added and the rows are sorted by date and skey.
    """
    # NOTE: pd.read_pickle executes arbitrary code; only load trusted files.
    table = pd.read_pickle(path)
    table['date'] = table['date'].apply(lambda d: d.strftime('%Y%m%d')).astype(int)
    prefix = table['ID'].str[:2]
    code = table['ID'].str[2:].astype(int)
    table['ID'] = np.where(prefix == 'SZ', code + 2000000, code + 1000000)
    table = table.rename(columns={'ID': 'skey'})
    table['index_id'] = index_id
    table['index_name'] = index_name
    return table.sort_values(by=['date', 'skey'])


IF = _prepare_weight_table(r'A:\index\weight_table_IF.pkl', 1000300, 'IF')
IC = _prepare_weight_table(r'A:\index\weight_table_IC.pkl', 1000905, 'IC')
# Upload order is IC first, then IF.
write_data(db1, 'index_memb', IC)
write_data(db1, 'index_memb', IF)

In [52]:
# AMAC tick data (update weekly in the future)：
import TSLPy3
import datetime
import pandas as pd
import numpy as np
def updateAMAC(updateList, startDate, endDate):
    """Download tick data for every instrument in ``updateList`` via TSLPy3.

    Parameters
    ----------
    updateList : sequence of str
        Instrument codes (e.g. ``'CSIH30053'``).
    startDate, endDate : str
        Date expressions substituted into the TSL script as ``BegT`` and
        ``EndT`` (the caller passes strings such as ``'20200103T'``).

    Returns
    -------
    pandas.DataFrame
        One row per tick for all instruments, with columns ``intdate``,
        ``minute``, ``morning``, ``time``, ``close``, ``industry_open``,
        ``cum_volume``, ``cum_amount``, ``ID``, ``ordering`` and ``month``.
        An empty DataFrame when ``updateList`` is empty.

    Raises
    ------
    AssertionError
        If an instrument's ticks do not reach minute 900 or do not start at or
        before minute 570.
    """
    unix_epoch = datetime.datetime(1970, 1, 1)

    def _serial_to_datetime(serial, shift_seconds=0):
        # Serial day 25569 is 1970-01-01 when day 0 is 1899-12-30. This equals
        # the original utcfromtimestamp(round((x - 25569) * 86400.0)) without
        # the deprecated call.
        seconds = round((serial - 25569) * 86400.0) + shift_seconds
        return unix_epoch + datetime.timedelta(seconds=seconds)

    frames = []
    for num, stock in enumerate(updateList):
        if num % 10 == 0:
            print('Processing ' + str(num) + ' AMAC ' + stock)
        tsstr = """
               BegT :=%s;
               EndT :=%s + 0.99;
               setSysParam(pn_stock(),'%s');
               returnData := select ['date'],['close'],['sectional_open'],['sectional_vol'],['sectional_amount']
                             from tradetable datekey BegT to EndT of DefaultStockID() end;
               return returnData;
               """ % (startDate, endDate, stock)
        tick = pd.DataFrame(TSLPy3.RemoteExecute(tsstr, {})[1])
        # Column names arrive as GBK-encoded bytes.
        tick.columns = list(pd.Series(tick.columns).str.decode('GBK'))
        tick['intdate'] = tick['date'].astype(int)
        tick['time'] = tick['date'].map(_serial_to_datetime)
        # Shift by one second so a tick stamped exactly on a minute boundary is
        # counted in the minute that just ended.
        adj_time = tick['date'].map(lambda x: _serial_to_datetime(x, -1))
        tick['minute'] = adj_time.map(lambda x: x.hour * 60 + x.minute + 1)
        assert tick['minute'].max() >= 900 and tick['minute'].min() <= 570
        tick['morning'] = np.where(tick['minute'] <= 690, 1, 0)
        tick = tick.rename(columns={'sectional_open': 'industry_open',
                                    'sectional_vol': 'cum_volume',
                                    'sectional_amount': 'cum_amount'})
        tick = tick[['intdate', 'minute', 'morning', 'time', 'close',
                     'industry_open', 'cum_volume', 'cum_amount']].reset_index(drop=True)
        tick['ID'] = stock
        # Running tick number within each day for this instrument.
        tick['ordering'] = tick.groupby('intdate').cumcount()
        tick['month'] = tick['time'].dt.month + tick['time'].dt.year * 100
        # Collect and keep looping: returning here would drop every instrument
        # after the first one.
        frames.append(tick)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
#        assert (datetime.datetime(1899,12,30) + datetime.timedelta(int(Tick_Stock.intdate.max()))).strftime('%Y%m%d') == UPDATEDATE
# Example run: ticks for the single instrument 'CSIH30053', with the same date
# expression used for both the start and the end of the range.
data = updateAMAC(['CSIH30053'], '20200103T', '20200103T')
data

Processing 0 AMAC CSIH30053


Unnamed: 0,intdate,minute,morning,time,close,industry_open,cum_volume,cum_amount,ID,ordering,month
0,43833,541,1,2020-01-03 09:00:01,2079.6519,2082.8988,0.000000e+00,0.000000e+00,CSIH30053,0,202001
1,43833,566,1,2020-01-03 09:25:06,2081.2721,2082.8988,1.555597e+07,1.779575e+04,CSIH30053,1,202001
2,43833,566,1,2020-01-03 09:25:11,2082.8919,2082.8988,2.086762e+07,2.445689e+04,CSIH30053,2,202001
3,43833,566,1,2020-01-03 09:25:21,2082.8988,2082.8988,2.114212e+07,2.464953e+04,CSIH30053,3,202001
4,43833,571,1,2020-01-03 09:30:01,2082.8321,2082.8988,2.155072e+07,2.497494e+04,CSIH30053,4,202001
...,...,...,...,...,...,...,...,...,...,...,...
2851,43833,898,0,2020-01-03 14:57:05,2090.5493,2082.8988,3.122167e+09,3.088311e+06,CSIH30053,2851,202001
2852,43833,898,0,2020-01-03 14:57:10,2090.6369,2082.8988,3.122317e+09,3.088402e+06,CSIH30053,2852,202001
2853,43833,901,0,2020-01-03 15:00:05,2090.7631,2082.8988,3.148050e+09,3.113167e+06,CSIH30053,2853,202001
2854,43833,901,0,2020-01-03 15:00:10,2091.4399,2082.8988,3.165474e+09,3.126518e+06,CSIH30053,2854,202001


In [37]:
# Index weights: query the constituent weights of one index for every trading
# day in [startDate, endDate] through TSLPy3 and load them into `weight_table`.
indexCode = 'SH000300'
startDate = '20180101'
endDate = '20180120'
# The TSL script loops over the trading days returned by MarketTradeDayQk, calls
# GetBKWeightByDate for each day, keeps three fields and unions the daily tables.
# NOTE(review): the three fields presumably mean cutoff date, constituent code
# and weight in percent - confirm against the Tinysoft documentation.
tsstr = """
           indexTicker:= '{}';
           BegT:= {};
           EndT:= {} + 0.99;
           dateArr:=MarketTradeDayQk(BegT,EndT);
           r:=array();
           for nI:=0 to length(dateArr)-1 do
           begin
             GetBKWeightByDate(indexTicker,dateArr[nI],t);
             t := t[:,array("截止日","代码","比例(%)")]; 
             r:=r union t;
           end;
           return r;  
        """.format(indexCode, startDate + 'T', endDate + 'T')
# Element [1] of the RemoteExecute result carries the data.
weight_table = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,[],{})[1])
# Positional rename: this assumes the returned columns arrive in the order
# (date, weight, ID), which differs from the order requested in the script.
# TODO confirm the ordering is stable before relying on it.
weight_table.columns=['date','weight','ID']
# String values come back as GBK-encoded bytes.
weight_table['ID'] = weight_table['ID'].str.decode('GBK')
weight_table['date'] = pd.to_datetime(weight_table.date.astype(str))
weight_table

Unnamed: 0,date,weight,ID
0,2018-01-02,6.608433,SH601318
1,2018-01-02,2.972627,SH600519
2,2018-01-02,2.567800,SH600036
3,2018-01-02,2.152648,SZ000333
4,2018-01-02,1.825902,SZ000651
...,...,...,...
4195,2018-01-19,0.032009,SH600390
4196,2018-01-19,0.031682,SH603858
4197,2018-01-19,0.023580,SZ002839
4198,2018-01-19,0.021894,SZ002841


In [None]:
# Build the "rest of market" weights: for each day drop every stock that belongs
# to IC, IF or CSI1000, then renormalise the remaining weights to sum to 100.
# NOTE: relies on IC_weight, IF_weight, CSI1000_weight and CSIRest_weight being
# defined by cells that are not part of this notebook section, and it overwrites
# `weight_table`, so the cell is not idempotent.
dailyParts = []
for day in weight_table.date.unique():
    #day = '2019-07-28'
    IC_stock = list(IC_weight[IC_weight.date == day].ID.unique())
    IF_stock = list(IF_weight[IF_weight.date == day].ID.unique())
    CSI1000_stock = list(CSI1000_weight[CSI1000_weight.date == day].ID.unique())
    ex_stock = list(set(IC_stock + IF_stock + CSI1000_stock))
    # The three indices together must contain exactly 1800 distinct members.
    assert len(ex_stock) == 1800
    CSIRest_weight_day = weight_table[(weight_table.date == day) & (~weight_table.ID.isin(ex_stock))]
    dailyParts.append(CSIRest_weight_day)
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the accumulated frame on every iteration; concatenate once instead.
CSIRest_weight = pd.concat([CSIRest_weight] + dailyParts, ignore_index = True)
sumWeightToday = CSIRest_weight.groupby('date')['weight'].sum().reset_index()
sumWeightToday = sumWeightToday.rename(columns = {'weight':'sumWeightDay'})
weight_table = CSIRest_weight.merge(sumWeightToday, on = 'date', how = 'left')
weight_table['weight'] = weight_table['weight'] / weight_table['sumWeightDay'] * 100
weight_table = weight_table.drop(columns = ['sumWeightDay'])

In [None]:
import os
import sys 
sys.path.append("D:/program files/Tinysoft/Analyse.NET")# 
import TSLPy3 #导入模块 
import datetime
import pandas as pd
pd.set_option('display.max_columns',100)
pd.options.mode.chained_assignment = None
from pandas.tseries.offsets import BDay
import numpy as np
perc = [.01, .05, .1, .25, .5, .75, .9, .95, .99]

# Run date of this update as YYYYMMDD; the assert below requires it to equal the
# current calendar date, so it must be edited before every run.
TodayDate = '20200903'

# Folder holding the AMAC industry weight spreadsheets and the derived CSV files.
AMACPath = r'D:\shareWithBenny\AMAC Industry Weight'
# Despite the name, this is the date 5 business days before now: it selects the
# previous weight file to compare against.
YesterdayDate = (datetime.datetime.now() - BDay(5)).strftime('%Y%m%d')
# Despite the name, this is 2 business days after now; it is only written into
# the header comment of the generated C++ file.
tomorrow = (datetime.datetime.now() + BDay(2)).strftime('%Y-%m-%d')
# ATTENTION: temporary manual overrides go here
#tomorrow = '2019-06-10'
#YesterdayDate = '20200618'
# Guard against running with a stale TodayDate.
assert datetime.datetime.now().strftime('%Y%m%d') == TodayDate


def _prepare_amac_weights(raw):
    """Normalise one raw AMAC industry weight sheet.

    Keeps the date / index code / constituent code / exchange / weight columns
    under the names ``date``, ``AMACCode``, ``StockID``, ``exchange`` and
    ``weight``, zero-pads ``StockID`` to six digits, maps the exchange to
    'SZ'/'SH' and builds ``ID`` as exchange + StockID. Asserts that exactly two
    distinct exchange values are present and that every index's weights sum to
    100 within 0.2.
    """
    # .rename returns a new frame, so the assignments below never write into a
    # slice of `raw`.
    table = raw[['日期\nDate', '指数代码\nIndex Code', '成分券代码\nConstituent Code',
                 '交易所\nExchange', '权重(%)\nWeight(%)']].rename(
        columns = {'日期\nDate':'date', '指数代码\nIndex Code':'AMACCode',
                   '成分券代码\nConstituent Code':'StockID',
                   '交易所\nExchange':'exchange', '权重(%)\nWeight(%)':'weight'})
    table['StockID'] = table['StockID'].map(lambda x: str(int(x)).zfill(6))
    # np.where over strings turns the np.nan fallback into the text 'nan'; the
    # assert below is what rejects an unknown exchange when both are present.
    table['exchange'] = np.where(table.exchange == 'Shenzhen', 'SZ',
                                 np.where(table.exchange == 'Shanghai', 'SH', np.nan))
    assert len(table['exchange'].unique()) == 2
    table['ID'] = table['exchange'] + table['StockID']
    assert abs(table.groupby('AMACCode')['weight'].sum() - 100).max() < 0.2
    return table


## load today AMAC weight table
todayAMACRaw = pd.read_excel(AMACPath + '\\AMACIndustryWeight_' + TodayDate + '.xls')
todayAMAC = _prepare_amac_weights(todayAMACRaw)
# The sheet must be dated today and list every stock at most once.
assert todayAMAC.date.unique()[0] == datetime.datetime.now().strftime('%Y-%m-%d')
assert len(todayAMAC['ID'].unique()) == todayAMAC.shape[0]

# Normalised weight = weight divided by the total weight of its index.
sumWeightToday = todayAMAC.groupby('AMACCode')['weight'].sum().reset_index()
sumWeightToday = sumWeightToday.rename(columns = {'weight':'sumWeightIndex'})
todayAMAC = todayAMAC.merge(sumWeightToday, on = 'AMACCode', how = 'left')
todayAMAC['weightNormalized'] = todayAMAC['weight'] / todayAMAC['sumWeightIndex']
todayAMAC[['date','AMACCode','ID','weight','weightNormalized']].to_csv(AMACPath + '\\AMACNormalizedWeight_' + TodayDate + '.csv', index = False)

## load yesterday AMAC weight table
yesterdayAMACRaw = pd.read_excel(AMACPath + '\\AMACIndustryWeight_' + YesterdayDate + '.xls')
yesterdayAMAC = _prepare_amac_weights(yesterdayAMACRaw)

## make sure two days have same number of AMAC industry
assert todayAMAC.AMACCode.nunique() == yesterdayAMAC.AMACCode.nunique()
assert todayAMAC.AMACCode.nunique() == 43
## produce changes list
# Industries listed here skip the per-stock weight sanity check below.
exemptIndustries = []
changeList = []
for industry in todayAMAC.AMACCode.unique():
    induToday = todayAMAC[todayAMAC.AMACCode == industry]
    induYesterday = yesterdayAMAC[yesterdayAMAC.AMACCode == industry]
    # Stocks present on both days, with today's weight as weight_x and
    # yesterday's as weight_y.
    weightChangePerStock = induToday[['ID','weight']].merge(induYesterday[['ID','weight']], on = 'ID', how = 'inner')
    weightChangePerStock['weightDiff'] = (weightChangePerStock['weight_x'] / weightChangePerStock['weight_y'] - 1).abs()
    weightChangePerStock['absWeightDiff'] = (weightChangePerStock['weight_x'] - weightChangePerStock['weight_y']).abs()
    ## sanity check: the largest relative weight change must stay below 75%,
    ## unless the industry held a stock weighing less than 0.05 yesterday
    if industry not in exemptIndustries:
        assert (weightChangePerStock['weightDiff'].max() < 0.75) | (weightChangePerStock['weight_y'].min() < 0.05)
    todayAddList = list(set(induToday.ID.unique()).difference(induYesterday.ID.unique()))
    todayDelList = list(set(induYesterday.ID.unique()).difference(induToday.ID.unique()))
    # Both percentages are relative to yesterday's member count.
    todayAddListPerc = round((len(todayAddList) / len(induYesterday.ID.unique())),4)
    todayDelListPerc = round((len(todayDelList) / len(induYesterday.ID.unique())),4)
    todayAddListWeights = (induToday[induToday.ID.isin(todayAddList)].weight.sum() / 100).round(4)
    todayDelListWeights = (induYesterday[induYesterday.ID.isin(todayDelList)].weight.sum() / 100).round(4)

    induChange = [TodayDate, industry, len(induToday.ID.unique()), len(induYesterday.ID.unique()), weightChangePerStock['absWeightDiff'].sum() / 100, todayAddListPerc, todayDelListPerc, todayAddListWeights, todayDelListWeights]
    changeList += [induChange]
changeTable = pd.DataFrame(changeList, columns = ['date','AMACCode','todayN','yesterdayN','totalAbsWeightChange','addPerc','delPerc','addWeights','delWeights'])
## weights were divided by 100 above, so the figures below are fractions of 1
print('absolute value of the total change in weight: ',round(changeTable['totalAbsWeightChange'].mean(),4))
print('    sum of the weight for deleted members: ',round(changeTable.loc[changeTable['delWeights'] > 0,'delWeights'].mean(),4))
print('     sum of the weight for added members: ',round(changeTable.loc[changeTable['addWeights'] > 0,'addWeights'].mean(),4))
print('number of indices which has deleted members: ',changeTable[(changeTable.delPerc > 0)].shape[0])
print(' number of indices which has added members: ',changeTable[(changeTable.addPerc > 0)].shape[0])
# Put today's rows in front of the accumulated history and save under today's date.
changeTable_L1 =  pd.read_csv(AMACPath + '\\memberChangeInfo_' + YesterdayDate + '.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
changeTable = pd.concat([changeTable, changeTable_L1], ignore_index = True)
changeTable.to_csv(AMACPath + '\\memberChangeInfo_' + TodayDate + '.csv', index = False)
# =============================================================================
# =============================================================================
    
# ============================================================================= 
# =============================================================================   
## STEP 1: check if AMAC changes for all historical members
# Collect every stock that was a member of IC / IF / CSI1000 / CSIRest on or
# after 2017-09-01, query its current industry code (TSL `base(10039)`), map
# that code to an AMAC index code and require the mapping to be identical to the
# one stored by the previous run.
memberList = [] 
for bench in ['IC','IF','CSI1000','CSIRest']:
    indexPath = r'D:\\shareWithBenny\\indexInfo\\' + bench
    # NOTE: pd.read_pickle executes arbitrary code; only load trusted files.
    weightDf = pd.read_pickle(indexPath + '\\weight_table_' + bench + '.pkl')
#    newStockTrade = sorted(list(pd.read_csv(indexPath + '\\curStockList_' + bench + '.csv')['StockID']))
#    memberList += sorted(list(set(list(weightDf[weightDf.date >= '2017-09-01'].ID.unique()) + newStockTrade)))
    memberList += list(weightDf[weightDf.date >= '2017-09-01'].ID.unique())
memberList = sorted(list(set(memberList)))
# str(memberList)[1:-1] renders the Python list as a comma-separated sequence of
# quoted ids, which is pasted into the TSL array(...) literal.
tsstr = """
            StockArr := array(%s);
            r := array();
            for nI:=0 to length(StockArr)-1 do begin
                    setSysParam(pn_stock(),StockArr[nI]);
                    r[nI]['StockID'] := StockArr[nI];
                    r[nI]['ZJH2'] := base(10039);
            end;
            return r;"""%(str(memberList)[1:-1])
curAMACDict = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,[],{})[1])  
# Positional rename: assumes the returned columns arrive as (StockID, ZJH2) -
# TODO confirm.
curAMACDict.columns = ['ID','ZJH2']
# String values come back as GBK-encoded bytes.
curAMACDict['ID'] = curAMACDict['ID'].str.decode('GBK')
curAMACDict['ZJH2'] = curAMACDict['ZJH2'].str.decode('GBK')
curAMACDict['ZJH2'] = curAMACDict['ZJH2'].astype(int)
# Map the integer ZJH2 code to an AMAC index code. The ladder is evaluated in
# order, so every `<=` bound means "not matched by any earlier condition".
# NOTE(review): 16 has no equality test of its own and is therefore caught by
# the later `<= 43` branch ('CSIH11050') - confirm this is intended.
# NOTE(review): because the branches are strings, numpy presumably converts the
# np.nan fallback to the text 'nan', which pd.notnull below would not detect.
curAMACDict['AMACCode'] = np.where(curAMACDict.ZJH2 <= 5, 'CSIH11030',np.where(curAMACDict.ZJH2 <= 12, 'CSIH11031', \
                   np.where(curAMACDict.ZJH2 == 13, 'CSIH30041',np.where(curAMACDict.ZJH2 == 14, 'CSIH30042', \
                   np.where(curAMACDict.ZJH2 == 15, 'CSIH30043',np.where(curAMACDict.ZJH2 == 17, 'CSIH30044',\
                   np.where(curAMACDict.ZJH2 == 18, 'CSIH30045',np.where(curAMACDict.ZJH2 == 19, 'CSIH30046',\
                   np.where(curAMACDict.ZJH2 == 20, 'CSIH30047',np.where(curAMACDict.ZJH2 == 21, 'CSIH30048',\
                   np.where(curAMACDict.ZJH2 == 22, 'CSIH30049',np.where(curAMACDict.ZJH2 == 23, 'CSIH30050',\
                   np.where(curAMACDict.ZJH2 == 24, 'CSIH30051',np.where(curAMACDict.ZJH2 == 25, 'CSIH30052',\
                   np.where(curAMACDict.ZJH2 == 26, 'CSIH30053',np.where(curAMACDict.ZJH2 == 27, 'CSIH30054',\
                   np.where(curAMACDict.ZJH2 == 28, 'CSIH30055',np.where(curAMACDict.ZJH2 == 29, 'CSIH30056',\
                   np.where(curAMACDict.ZJH2 == 30, 'CSIH30057',np.where(curAMACDict.ZJH2 == 31, 'CSIH30058',\
                   np.where(curAMACDict.ZJH2 == 32, 'CSIH30059',np.where(curAMACDict.ZJH2 == 33, 'CSIH30060',\
                   np.where(curAMACDict.ZJH2 == 34, 'CSIH30061',np.where(curAMACDict.ZJH2 == 35, 'CSIH30062',\
                   np.where(curAMACDict.ZJH2 == 36, 'CSIH30063',np.where(curAMACDict.ZJH2 == 37, 'CSIH30064',\
                   np.where(curAMACDict.ZJH2 == 38, 'CSIH30065',np.where(curAMACDict.ZJH2 == 39, 'CSIH30066',\
                   np.where(curAMACDict.ZJH2 == 40, 'CSIH30067',np.where(curAMACDict.ZJH2 <= 43, 'CSIH11050',\
                   np.where(curAMACDict.ZJH2 <= 46, 'CSIH11041',np.where(curAMACDict.ZJH2 <= 50, 'CSIH11042',\
                   np.where(curAMACDict.ZJH2 <= 52, 'CSIH11045',np.where(curAMACDict.ZJH2 <= 60, 'CSIH11043',\
                   np.where(curAMACDict.ZJH2 <= 62, 'CSIH30036',np.where(curAMACDict.ZJH2 <= 65, 'CSIH11044',\
                   np.where(curAMACDict.ZJH2 <= 69, 'CSIH11046',np.where(curAMACDict.ZJH2 == 70, 'CSIH11047',\
                   np.where(curAMACDict.ZJH2 <= 72, 'CSIH30037',np.where(curAMACDict.ZJH2 <= 75, 'CSIH30038',\
                   np.where(curAMACDict.ZJH2 <= 78, 'CSIH30039',np.where(curAMACDict.ZJH2 <= 81, 'CSIH30040',\
                   np.where(curAMACDict.ZJH2 == 82, 'CSIH11049',np.where(curAMACDict.ZJH2 <= 84, 'CSIH30040',\
                   np.where(curAMACDict.ZJH2 <= 89, 'CSIH11049',np.where(curAMACDict.ZJH2 == 90, 'CSIH11050',np.nan))))))))))))))))))))))))))))))))))))))))))))))
# Back up the stored mapping before comparing and overwriting it.
oldAMACDict = pd.read_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\cur_AMAC_industry.pkl')
oldAMACDict.to_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\cur_AMAC_industry_old.pkl')
# Outer merge on ID: the _x columns come from the stored mapping, the _y columns
# from the fresh one. validate='one_to_one' rejects duplicated IDs on either side.
compAMACDict = oldAMACDict.merge(curAMACDict, on = ['ID'], how = 'outer', validate = 'one_to_one')
# Every stock must appear on both sides and carry the same AMAC code.
assert (pd.notnull(compAMACDict.AMACCode_x).all() & (pd.notnull(compAMACDict.AMACCode_y).all()))
assert (compAMACDict.AMACCode_x == compAMACDict.AMACCode_y).all()
curAMACDict.to_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\cur_AMAC_industry.pkl')

## STEP 2: update weekly AMAC table for all stocks
# Append today's (intdate, AMACCode, ID) rows to the stored weekly table, after
# backing it up. NOTE: running this cell twice for the same date appends the
# same rows twice; the duplicate check at the start of STEP 3 then fails.
histDf = pd.read_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\weekly_AMAC_table.pkl')
histDf.to_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\weekly_AMAC_table_old.pkl')
# .copy() so the column assignments below do not write into a slice of todayAMAC.
curDf = todayAMAC[['date','ID','AMACCode']].copy()
curDf['date'] = pd.to_datetime(curDf['date'])
curDf['AMACCode'] = curDf['AMACCode'].map(lambda x : 'CSI' + str(x))
# intdate = number of days since 1899-12-30.
curDf['intdate'] = curDf['date'].apply(lambda x: (x-datetime.datetime(1899, 12, 30)).days)
curDf = curDf[['intdate','AMACCode','ID']]
# Historical members missing from today's sheet get a row dated today, with the
# industry code taken from curAMACDict (built in STEP 1).
addList = sorted(list(set(memberList).difference(curDf.ID.unique())))
addDf = []
for addS in addList:
#    print('ADDING ', addS)
    addSL = [curDf.intdate.max(), curAMACDict[curAMACDict.ID == addS].AMACCode.values[0], addS]
    addDf += [addSL]
addDf = pd.DataFrame(addDf, columns = ['intdate','AMACCode','ID'])
# DataFrame.append was removed in pandas 2.0; use pd.concat. An empty addDf is
# skipped so its object-dtype columns cannot alter curDf's dtypes.
if not addDf.empty:
    curDf = pd.concat([curDf, addDf])
curDf = curDf.sort_values(['intdate','AMACCode']).reset_index(drop = True)
curDf = pd.concat([histDf, curDf]).sort_values(['intdate','AMACCode']).reset_index(drop = True)
curDf.to_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\weekly_AMAC_table.pkl')
#curDf.groupby('intdate')['ID'].count().tail()

## STEP 3: update daily AMAC table for all stocks
# Expand the weekly table into one row per (trading day, stock). Each trading
# day uses the industry code recorded in the most recent earlier week that
# contained a Thursday trading day.
curDf = pd.read_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\weekly_AMAC_table.pkl')
todayIntdate = curDf.intdate.max()
assert curDf.groupby(['intdate','ID'])['AMACCode'].count().max() == 1
curDf['date'] = curDf['intdate'].apply(lambda x: datetime.datetime(1899, 12, 30) + datetime.timedelta(x))
# ISO year * 100 + ISO week number of the record.
curDf['prevWeek'] = curDf['date'].apply(lambda x: x.isocalendar()[0]*100 + x.isocalendar()[1])
curDf = curDf[['prevWeek', 'AMACCode', 'ID']]
### each stock each week should only have one value
assert((curDf.groupby(['prevWeek', 'ID'])['AMACCode'].count() == 1).all())

# The IC daily index file supplies the list of trading days.
icData = pd.read_pickle(os.path.join(r'\\192.168.10.217\shareWithBenny\indexInfo\IC', 'Day_Index_IC.pkl'))
assert (datetime.datetime(1899, 12, 30) + datetime.timedelta(int(icData.intdate.max()))).strftime('%Y%m%d') == TodayDate
icData['intdate'] = icData.intdate.astype('int64')
icData = icData[icData.intdate <= todayIntdate]
icData = icData[['intdate']]
icData['date'] = icData['intdate'].apply(lambda x: datetime.datetime(1899, 12, 30) + datetime.timedelta(x))
icData['week'] = icData['date'].apply(lambda x: x.isocalendar()[0]*100 + x.isocalendar()[1])
icData['hasThu'] = np.where(icData.date.dt.weekday == 3, 1, 0)
# For every week find the latest earlier week that had a Thursday trading day.
prevWeek = icData.groupby(['week'])['hasThu'].max().reset_index()
prevWeek['prevHasThu'] = prevWeek['hasThu'].shift(1)
prevWeek['prevWeek'] = prevWeek['week'].shift(1)
prevWeek['prevWeek'] = np.where(prevWeek['prevHasThu'] == 1, prevWeek['prevWeek'], np.nan)
prevWeek['prevWeek'] = prevWeek['prevWeek'].ffill()
prevWeek = prevWeek[['week', 'prevWeek']]

icData = icData[['intdate', 'week']]
# Cross product of trading days and stocks: one copy of the calendar per stock,
# sorted by day, then the stock list repeated once per day.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
fullData = pd.concat([icData] + [icData]*(curDf['ID'].nunique() - 1)).sort_values(by=['intdate'])
fullData['ID'] = curDf['ID'].unique().tolist()*fullData['intdate'].nunique()
fullData = pd.merge(fullData, prevWeek, how='left', on=['week'], validate='many_to_one')
fullData = pd.merge(fullData, curDf, how='left', on=['prevWeek', 'ID'], validate='many_to_one')
# Fill the gaps per stock: forward first, then backward for leading gaps.
fullData['AMACCode'] = fullData.groupby(['ID'])['AMACCode'].ffill()
fullData['AMACCode'] = fullData.groupby(['ID'])['AMACCode'].bfill()
## add new coming
# Stocks that still have no code on any day get the calendar rows of 'SH600000'
# copied, carrying their latest code from the weekly table.
for miss in list(set(curDf.ID.unique()).difference(fullData[pd.notnull(fullData['AMACCode'])].ID.unique())):
    print('missing stock in daily AMAC', miss)
    miss_temp = fullData[fullData.ID == 'SH600000'].reset_index(drop = True)
    miss_temp['ID'] = miss
    miss_temp['AMACCode'] = curDf[curDf.ID == miss].AMACCode.unique()[-1]
    fullData = pd.concat([fullData, miss_temp])
# keep dates after 20170901 (intdate 42979)
fullData = fullData.loc[(fullData['intdate'] >= 42979) & (~fullData['AMACCode'].isnull()),
                        ['intdate', 'AMACCode', 'ID']].sort_values(by=['intdate', 'AMACCode', 'ID']).reset_index(drop=True)
savePath = r'\\192.168.10.217\shareWithBenny\indexInfo\AMAC'
fullData.to_pickle(os.path.join(savePath, 'daily_AMAC_table.pkl'))
mysqlPath = r'G:\data\rch\raw\secData_TR\AMAC_member'
fullData.to_csv(os.path.join(mysqlPath, 'daily_AMAC_table.csv.gz'), index = False, compression = 'gzip')

# =============================================================================    
# =============================================================================    
# Emit a C++ if / else-if ladder with one branch per AMAC index; each branch
# registers the index members with their normalised weight (6 decimals).
AMACList = list(todayAMAC.AMACCode.unique())
# Raw string: the previous literal relied on the invalid escapes '\s' and '\g'
# being passed through unchanged. The resulting path is identical.
cppPath = r'D:\shareWithBenny\generateCPPCode\industryIndexWeights_' + TodayDate + '.txt'
# The with-block closes the file even if a lookup below raises.
with open(cppPath, "w") as File:
    File.write("    // as of productionDate " + tomorrow + "\n")
    File.write('\n')
    for position, industry in enumerate(AMACList):
        secidAMAC = '30' + industry[1:]
        # Only the first branch is a plain `if`.
        if position == 0:
            File.write("    if (_indexCode == " + str(secidAMAC) + ')' + "\n")
        else:
            File.write("    else if (_indexCode == " + str(secidAMAC) + ')' + "\n")
        File.write("    {" + "\n")

        stockList = list(todayAMAC[todayAMAC.AMACCode == industry].ID.unique())
        for stock in stockList:
            print ('Processing ', industry, stock)
            # Numeric security id: '1' prefix for 'SH' ids, '2' for all others.
            if stock[:2] == 'SH':
                secidStock = '1' + stock[2:]
            else:
                secidStock = '2' + stock[2:]
            W = round(todayAMAC.loc[(todayAMAC.AMACCode == industry) & (todayAMAC.ID == stock),'weightNormalized'].values[0], 6)

            File.write("        this->indexMembers" + "[" + secidStock + "] = "  + 'new IndexMember(' + secidStock + ', ' + str(W) + ', this' + ');' + "\n")
        File.write("    }" + "\n")
    
        



























#
#
##
### ============================================================================= 
#for bench in ['IC','IF','CSI1000','CSIRest']:
#    indexPath = r'D:\\shareWithBenny\\indexInfo\\' + bench
#    AMACTable = pd.read_pickle(indexPath +'\\' + bench + '_AMAC_industry.pkl')    
#    AMACTable['indexCat'] = bench
#    AMAC_comp = todayAMAC[['date','ID','AMACCode']].merge(AMACTable[['ID','industry','indexCat']], on = 'ID', how = 'right')
#    AMAC_comp.rename(columns = {'AMACCode':'AMAC_CSI','industry':'AMAC_TR'}, inplace = True)
#    AMAC_comp['AMAC_CSI'] = AMAC_comp['AMAC_CSI'].map(lambda x : 'CSI' + str(x))
#    
#    AMAC_comp = AMAC_comp[pd.notnull(AMAC_comp.date)]
#    assert (AMAC_comp.AMAC_CSI == AMAC_comp.AMAC_TR).all()
#    
## =============================================================================    
#
#
#
### STEP 3: update daily AMAC table for all stocks    
#curDf = pd.read_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\weekly_AMAC_table.pkl')
#assert curDf.groupby(['intdate','ID'])['AMACCode'].count().max() == 1
#icData = pd.read_pickle(os.path.join(r'\\192.168.10.217\shareWithBenny\indexInfo\IC', 'Day_Index_IC.pkl'))
##icData = icData[icData.intdate <= 43748]
#icData['intdate'] = icData.intdate.astype('int64')
#icData = icData[['intdate']]
#fullData = icData.append([icData]*(curDf['ID'].nunique() - 1)).sort_values(by=['intdate'])
#fullData['ID'] = curDf['ID'].unique().tolist()*fullData['intdate'].nunique()
#fullData = pd.merge(fullData, curDf, how='left', on=['intdate', 'ID'], validate='one_to_one')
#fullData['AMACCode'] = fullData.groupby(['ID'])['AMACCode'].ffill()
#fullData['AMACCode'] = fullData.groupby(['ID'])['AMACCode'].bfill()
#fullData = fullData.sort_values(by=['intdate', 'AMACCode', 'ID'])
## keep dates after 20170901 (intdate 42979)
#fullData = fullData[(fullData['intdate'] >= 42979) & (~fullData['AMACCode'].isnull())].reset_index(drop=True)
#fullData = fullData[['intdate', 'AMACCode', 'ID']]
##savePath = r'\\192.168.10.217\shareWithBenny\indexInfo\AMAC'
##fullData.to_pickle(os.path.join(savePath, 'daily_AMAC_table_20191107v0.pkl'))  
#
#
#['SH603815', 'SZ300795', 'SZ300793']
#histDf = pd.read_pickle(r'D:\\shareWithBenny\\indexInfo\\AMAC\\daily_AMAC_table.pkl')
#com_Df = fullData.merge(histDf, on = ['intdate', 'ID'], how = 'left', validate='one_to_one')
#temp1 = com_Df[pd.isnull(com_Df.AMACCode_x) & pd.notnull(com_Df.AMACCode_y)]
#assert len(temp1) == 0
#temp2 = com_Df[pd.isnull(com_Df.AMACCode_y) & pd.notnull(com_Df.AMACCode_x)]
#newList = temp2.ID.unique()
#temp3 = com_Df[(com_Df.AMACCode_y != com_Df.AMACCode_x) & (~com_Df.ID.isin(newList))].sort_values('intdate').reset_index(drop = True)
#temp3['date'] = temp3['intdate'].apply(lambda x: datetime.datetime(1899, 12, 30) + datetime.timedelta(x))
#temp3['weekday'] = temp3['date'].dt.weekday
#temp3['chgCount'] = temp3.groupby('ID')['date'].transform('count')
#temp3 = temp3.sort_values(by = ['chgCount','ID'], ascending = [False,True]).reset_index(drop=True)
##temp3.shape
##Out[87]: (326, 7)