In [46]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import numpy as np


def DB(host, db_name, user, passwd):
    """Build a :class:`DBObj` for ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``) placed in the URI.
        db_name: Database to open; also used as ``authSource`` unless the
            user is ``admin`` or ``root``, in which case ``admin`` is used.
        user: MongoDB user name.
        passwd: MongoDB password.

    Returns:
        A ``DBObj`` bound to the assembled connection URI.
    """
    from urllib.parse import quote_plus, unquote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Credentials containing reserved characters (':', '@', '/', '%') must be
    # percent-escaped inside a MongoDB URI. Decoding first makes the escape
    # idempotent, so callers that already pass escaped values are unaffected.
    safe_user = quote_plus(unquote_plus(str(user)))
    safe_passwd = quote_plus(unquote_plus(str(passwd)))
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (safe_user, safe_passwd, host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Wrapper around one MongoDB database that stores DataFrames as segments.

    ``write`` splits a DataFrame per (date, symbol), pickles and compresses
    chunks of at most ``chunk_size`` rows and stores each chunk as one
    document with the keys ``ver``, ``data``, ``date``, ``symbol`` and
    ``start``. ``read`` reverses the process. ``read_daily`` instead reads
    collections whose documents are plain records.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column used as the symbol key in ``write``.
            db_name: Name of the database to operate on.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored segment
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@host`` into ``[user, password, host]``."""
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Store ``df`` in ``table_name``, replacing existing (date, symbol) data.

        For every (date, symbol) group present in ``df`` the matching
        documents are deleted and the group is re-inserted in chunks. An
        empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        first_date = str(df[self.date_column].iloc[0])
        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            # Group by the bare column name so the key is a scalar on every
            # pandas version, and cast to a plain int exactly like the
            # multi-date branch: BSON cannot encode numpy integers, and
            # `build_query` looks symbols up as Python ints.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': first_date, 'symbol': symbol})
                self.write_single(collection, first_date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as segments of ``chunk_size`` rows.

        Each document records the positional ``start`` offset of its slice so
        that ``read`` can restore the original row order.
        """
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df[start:end]
            version = 1
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build a MongoDB filter on ``date`` (string) and ``symbol`` (int).

        Args:
            start_date: Inclusive lower bound; ``YYYYMMDD`` string, integer
                (Python or numpy), ``datetime.date`` or ``datetime.datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; each value is
                converted with ``int()``. Falsy values add no symbol filter.

        Returns:
            The filter dict; empty when no argument was supplied.

        Raises:
            Exception: for a date of unsupported type or not 8 characters long.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # Accept numpy integers too (e.g. values taken from a DataFrame).
            if isinstance(x, (int, np.integer)):
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the segments matching the filter; refuses an empty filter."""
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load and concatenate the segments matching the filter.

        Returns:
            One DataFrame ordered by (symbol, date, start) with a fresh index,
            or ``None`` when the filter is empty or nothing matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        segs.sort(key=lambda seg: (seg['symbol'], seg['date'], seg['start']))
        return pd.concat([seg['data'] for seg in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # `collection_names()` was removed in PyMongo 4. Probe the class, not
        # the instance: Database.__getattr__ turns unknown attributes into
        # Collection objects, so an instance-level hasattr is always true.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching segments."""
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(list(dates))

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        Version 1 uses gzip, version 2 uses lzma.

        Raises:
            Exception: for any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def read_daily(self, table_name, start_date=None, end_date=None, index_id=None, skey=None, interval=None, col=None,
                   return_sdi=True):
        """Read plain-record documents from ``table_name`` into a DataFrame.

        Args:
            start_date, end_date: Optional inclusive bounds on ``date``,
                passed to MongoDB unchanged.
            index_id, skey, interval: Optional lists matched with ``$in``.
            col: Optional list of fields to project; ``None`` returns every
                field except ``_id``.
            return_sdi: When ``col`` is given, also project ``skey``,
                ``date`` and ``interval``.

        Returns:
            A DataFrame sorted by whichever of ``date``, ``index_id`` and
            ``skey`` are present; an empty DataFrame when nothing matched.
        """
        collection = self.db[table_name]
        # Build projection
        prj = {'_id': 0}
        if col is not None:
            if return_sdi:
                col = ['skey', 'date', 'interval'] + list(col)
            for col_name in col:
                prj[col_name] = 1

        # Build query
        query = {}
        if skey is not None:
            query['skey'] = {'$in': skey}
        if index_id is not None:
            query['index_id'] = {'$in': index_id}
        if interval is not None:
            query['interval'] = {'$in': interval}
        if start_date is not None:
            if end_date is not None:
                query['date'] = {'$gte': start_date, '$lte': end_date}
            else:
                query['date'] = {'$gte': start_date}
        elif end_date is not None:
            query['date'] = {'$lte': end_date}

        # Load data
        cur = collection.find(query, prj)
        df = pd.DataFrame.from_records(cur)
        if df.empty:
            return pd.DataFrame()
        # A projection (or the collection itself) may lack some sort keys;
        # sorting by an absent column would raise KeyError.
        sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
        if sort_keys:
            df = df.sort_values(by=sort_keys)
        return df

    def deser(self, s, version):
        """Decompress and unpickle a payload produced by ``ser``.

        Raises:
            Exception: for a ``version`` other than 1 or 2.
        """
        # NOTE: unpickling executes code embedded in the payload; only use
        # this against databases whose contents are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on old pandas.

    On pandas older than 0.24, and only when that module name is not already
    in ``sys.modules``, a synthetic module exposing ``BlockManager`` is
    installed under it. Presumably this lets pickles written by newer pandas
    (which reference that module path) load on the old version.
    """
    import re
    import sys
    from types import ModuleType

    # Compare (major, minor) numerically: a plain string comparison orders
    # versions lexicographically (e.g. '0.9' would sort after '0.24').
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m


patch_pandas_pickle()


def sta_sizeFilter(stockID, startDate, endDate, regWindowSize=20, weekInterval=1):
    """Compute a per-day trade-amount threshold ("size filter") for one stock.

    Level-2 snapshots of ``stockID`` are loaded from ``md_snapshot_l2`` and
    restricted to continuous-trading times. On each update Thursday the 75th
    percentile of the per-snapshot traded amount is computed over the last
    ``regWindowSize`` usable days. Snapshots with an empty 5th bid or ask
    level, or whose previous snapshot had one, are excluded. The value is
    applied to the dates from the following Monday up to the Friday after
    the next update Thursday.

    Args:
        stockID: Stock key; passed to the database query and cast with
            ``int()`` for the output.
        startDate: First date to load (``YYYYMMDD``).
        endDate: Last date to load (``YYYYMMDD``).
        regWindowSize: Number of usable days in the estimation window.
        weekInterval: Step, counted in Thursdays, between two updates.

    Returns:
        DataFrame with columns ``skey``, ``date`` and ``size_filter`` sorted
        by date; days without an estimate carry ``0``.

    Raises:
        ValueError: if no snapshot data or no index-membership data is found.
    """
    database_name = 'com_md_eq_cn'
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration or the environment instead.
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    # Use the fully qualified option name: the bare 'max_columns' pattern is
    # ambiguous on pandas versions that define more than one such option.
    pd.set_option('display.max_columns', 200)
    db = DB("192.168.10.178", database_name, user, password)

    print(' ...... Now Calculating SizeFilter for  ', stockID)
    stockData = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate, symbol=[stockID])
    if stockData is None:
        raise ValueError('no md_snapshot_l2 data for %s between %s and %s' % (stockID, startDate, endDate))

    # Keep snapshots that have at least one side of the book quoted.
    has_quote = (stockData.bid1p != 0) | (stockData.ask1p != 0)
    stockData = stockData.loc[has_quote,
                              ['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ordering',
                               'cum_amount', 'bid1p', 'bid1q', 'bid5q', 'ask5q']].reset_index(drop=True)
    # Keep the two time windows 09:30-11:30 and 13:00-15:00.
    in_morning = (stockData.time >= 93000000000) & (stockData.time <= 113000000000)
    in_afternoon = (stockData.time >= 130000000000) & (stockData.time <= 150000000000)
    stockData = stockData[in_morning | in_afternoon].reset_index(drop=True)

    # Number the dates present in the index-membership table 0, 1, 2, ...
    indexDaily = db.read_daily('index_memb', start_date=startDate, end_date=endDate, index_id=[1000905])
    if indexDaily.empty:
        raise ValueError('no index_memb data between %s and %s' % (startDate, endDate))
    indexDaily['tradeConsDay'] = indexDaily.groupby(['date']).ngroup()
    indexDaily = indexDaily.groupby('date')['tradeConsDay'].first().reset_index()
    df_train = stockData.merge(indexDaily[['date', 'tradeConsDay']], how='left', on=['date'], validate='many_to_one')

    df_train = df_train[(df_train['time'] >= 93000000000) & (df_train['time'] < 145655000000)].reset_index(drop=True)

    # Amount traded since the previous snapshot of the same day; the first
    # snapshot of a day takes the cumulative amount itself.
    prev_cum_amount = df_train.groupby(['skey', 'date'])['cum_amount'].shift(1)
    df_train['amountThisUpdate'] = df_train.cum_amount - prev_cum_amount
    df_train['amountThisUpdate'] = np.where(pd.isnull(df_train.amountThisUpdate), df_train.cum_amount,
                                            df_train.amountThisUpdate)

    ### add useful day indicator
    df_train['curNearLimit'] = np.where((df_train.ask5q == 0) | (df_train.bid5q == 0), 1.0, 0.0)
    # Re-group after each batch of new columns so the groupby always sees them.
    groupAllData = df_train.groupby(['skey', 'date'])
    df_train['curNearLimit_L1'] = groupAllData['curNearLimit'].shift(1)
    df_train['dailyCount'] = groupAllData['time'].transform('count')
    df_train['nearLimitCount'] = groupAllData['curNearLimit'].transform('sum')
    # Select several columns with a list; tuple selection was removed in pandas 2.
    dateInfo = df_train.groupby(['skey', 'date'])[
        ['dailyCount', 'nearLimitCount', 'tradeConsDay']].mean().reset_index()
    del groupAllData
    # A day is usable when fewer than half of its snapshots are flagged.
    dateInfo['useFlag'] = np.where(dateInfo['nearLimitCount'] * 2 < dateInfo['dailyCount'], 1, 0)
    dateInfo['useConsDay'] = dateInfo['useFlag'].cumsum()
    df_train = pd.merge(df_train, dateInfo[['date', 'tradeConsDay', 'useFlag', 'useConsDay']],
                        how='left', on=['date', 'tradeConsDay'], validate='many_to_one')

    df_train['weekday'] = df_train['datetime'].dt.weekday
    sizeFilterData = df_train.groupby(['date'])['tradeConsDay'].first().reset_index()
    sizeFilterData['amountFilter'] = np.nan
    ## we only update on Thursday (weekday == 3)
    regDays = sorted(list(df_train.loc[df_train.weekday == 3, 'tradeConsDay'].unique()))

    def _shift_intdate(intdate, days):
        """Return the YYYYMMDD integer ``days`` calendar days after ``intdate``."""
        shifted = datetime.datetime.strptime(str(intdate), '%Y%m%d') + datetime.timedelta(days=days)
        return int(shifted.strftime('%Y%m%d'))

    # `weekInterval` is honoured as passed; it used to be overwritten with 1
    # here, which made the parameter a no-op.
    for d in range(int(regWindowSize / 5), len(regDays), weekInterval):
        ## get current Thursday
        endTradeConsDay = regDays[d]
        endUseConsDay = dateInfo[dateInfo['tradeConsDay'] == endTradeConsDay]['useConsDay'].values[0]
        startUseConsDay = max(endUseConsDay - regWindowSize + 1, 1)

        # Without a single usable day there is no window to look up.
        if dateInfo['useConsDay'].max() < 1:
            continue
        startTradeConsDay = dateInfo[dateInfo['useConsDay'] == startUseConsDay]['tradeConsDay'].values[0]
        # Skip windows that stretch over more than 59 day indices or contain
        # fewer than 10 usable days.
        if (endTradeConsDay - startTradeConsDay > 59) or (endUseConsDay - startUseConsDay < 9):
            continue

        ## get the Monday right after current Thursday update
        end_date_value = df_train.loc[df_train.tradeConsDay == endTradeConsDay, 'date'].unique()[0]
        oss = _shift_intdate(end_date_value, 4)
        ## get the Friday right after next Thursday update
        if d >= len(regDays) - weekInterval:
            ose = df_train.date.max()
        else:
            next_date_value = df_train.loc[df_train.tradeConsDay == regDays[d + weekInterval], 'date'].unique()[0]
            ose = _shift_intdate(next_date_value, 1)
        if ose < oss:
            print('out of sample end day < start day, skip')
            continue

        inSampleSlice = df_train[(df_train.useConsDay >= startUseConsDay) &
                                 (df_train.useConsDay <= endUseConsDay) &
                                 (df_train.useFlag == 1)].reset_index(drop=True)
        amountFilter = inSampleSlice[(inSampleSlice['curNearLimit'] == 0) &
                                     (inSampleSlice['curNearLimit_L1'] == 0)].amountThisUpdate.quantile(.75)
        sizeFilterData.loc[(sizeFilterData.date >= oss) & (sizeFilterData.date <= ose), 'amountFilter'] = amountFilter

    sizeFilterData['skey'] = int(stockID)
    sizeFilterData = sizeFilterData[['skey', 'date', 'amountFilter']]
    if 20200928 in sizeFilterData['date'].values:
        # Carry the 20200928 value forward to 20200929 and 20200930.
        # DataFrame.append was removed in pandas 2; pd.concat is equivalent.
        carried = sizeFilterData.loc[sizeFilterData['date'] == 20200928, 'amountFilter'].values[0]
        skey_value = sizeFilterData['skey'].iloc[0]
        extra_rows = pd.DataFrame([[skey_value, 20200929, carried],
                                   [skey_value, 20200930, carried]],
                                  columns=['skey', 'date', 'amountFilter'])
        sizeFilterData = pd.concat([sizeFilterData, extra_rows])

    sizeFilterData['amountFilter'] = sizeFilterData['amountFilter'].fillna(0)
    sizeFilterData = sizeFilterData.rename(columns={'amountFilter': 'size_filter'})
    sizeFilterData = sizeFilterData.sort_values(by='date').reset_index(drop=True)
    return sizeFilterData

In [47]:
# Star-import WindPy and start its session; `w` is presumably the session
# object exported by WindPy (not defined anywhere in this notebook).
from WindPy import *
w.start()


import os
import glob
import time
import datetime
import pandas as pd
# Show up to 200 columns when displaying frames, and silence pandas'
# chained-assignment warning for the rest of the session.
pd.set_option('display.max_columns',200)
pd.options.mode.chained_assignment = None
import numpy as np

# Connection settings for the market-data database.
# NOTE(review): credentials are stored in plain text in the notebook.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

# Load the stock list, drop the first three characters of each StockID and
# append '.CSI'; `il` ends up as a numpy array of the resulting codes.
il = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
il['StockID'] = il['StockID'].str[3:] + '.CSI'
il = il['StockID'].values

import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import TSLPy3

def updateAShare(date):
    """Fetch the stock list returned by the remote TSL script for one day.

    The script is executed through ``TSLPy3.RemoteExecute`` with both the
    begin and end time derived from ``date``. Column names and ``StockID``
    values come back as bytes and are decoded from GBK.

    Args:
        date: Trading day as ``YYYYMMDD`` (anything ``str()``/``int()`` accept).

    Returns:
        DataFrame with ``StockID``, a numeric ``skey`` (1000000 + code for
        IDs starting with ``SH``, 2000000 + code otherwise) and ``date``.
    """
    day_text = str(date)
    script_template = """
           BegT:=%s;
           EndT:=%s;
           SetSysParam(pn_stock(),'SH000001');
           SetSysParam(PN_Cycle(),cy_day());
           dateArr:=MarketTradeDayQk(BegT,EndT);
           r:=array();
           for nI:=0 to length(dateArr)-1 do
           begin
             echo dateArr[nI];
             t:= getabkbydate('A股',dateArr[nI]);
             r:=r union2 t;
           end;
           r:= select [0] as 'StockID' from `r end;
           r := select * from r order by ['StockID'] end;
           return r;
            """
    begin_expr = day_text + 'T'
    end_expr = day_text + 'T + 0.99'
    script = script_template % (begin_expr, end_expr)

    # Element 1 of the reply holds the result rows.
    reply = TSLPy3.RemoteExecute(script, [], {})
    members = pd.DataFrame(reply[1])

    # Decode the byte-string column labels and stock IDs.
    members.columns = pd.Series(members.columns).str.decode('GBK').tolist()
    members['StockID'] = members['StockID'].str.decode('GBK')

    is_shanghai = members['StockID'].str[:2] == 'SH'
    numeric_code = members['StockID'].str[2:].astype(int)
    members['skey'] = np.where(is_shanghai, 1000000 + numeric_code, 2000000 + numeric_code)
    members['date'] = int(day_text)
    return members

# Fetch the stock list for each listed day and stack the results into a
# single frame.
dl = [20200928, 20200929, 20200930]
total_stock = []
for d in dl:
    data = updateAShare(d)
    total_stock.append(data)
total_stock = pd.concat(total_stock, sort=False)

In [48]:
# Recompute skey from StockID (1000000 + code for 'SH' IDs, 2000000 + code
# otherwise), then keep the distinct keys.
total_stock['skey'] = np.where(
    total_stock['StockID'].str[:2] == 'SH',
    total_stock['StockID'].str[2:].astype(int) + 1000000,
    total_stock['StockID'].str[2:].astype(int) + 2000000,
)
stockList = total_stock['skey'].unique()
stockList

array([1600000, 1600004, 1600006, ..., 1605099, 1688093, 2003011],
      dtype=int64)

In [57]:
# Compute and pickle the size filter for every key above 2000000; a failure
# for one stock is printed and the loop moves on to the next.
startDate = 20200501
endDate = 20200928
for i in stockList:
    if not i > 2000000:
        continue
    print(i)
    try:
        kk = sta_sizeFilter(
            i,
            startDate=startDate,
            endDate=endDate,
            regWindowSize=20,
            weekInterval=1,
        )
        kk.to_pickle('L:\\ShareWithServer\\filter1\\' + str(i) + '.pkl')
    except Exception as e:
        print(e)

2000001
 ...... Now Calculating SizeFilter for   2000001
2000002
 ...... Now Calculating SizeFilter for   2000002
2000004
 ...... Now Calculating SizeFilter for   2000004
2000005
 ...... Now Calculating SizeFilter for   2000005
2000006
 ...... Now Calculating SizeFilter for   2000006
2000007
 ...... Now Calculating SizeFilter for   2000007
2000008
 ...... Now Calculating SizeFilter for   2000008
2000009
 ...... Now Calculating SizeFilter for   2000009
2000010
 ...... Now Calculating SizeFilter for   2000010
2000011
 ...... Now Calculating SizeFilter for   2000011
2000012
 ...... Now Calculating SizeFilter for   2000012
2000014
 ...... Now Calculating SizeFilter for   2000014
2000016
 ...... Now Calculating SizeFilter for   2000016
2000017
 ...... Now Calculating SizeFilter for   2000017
2000019
 ...... Now Calculating SizeFilter for   2000019
2000020
 ...... Now Calculating SizeFilter for   2000020
2000021
 ...... Now Calculating SizeFilter for   2000021
2000023
 ...... Now Calculating

2000558
 ...... Now Calculating SizeFilter for   2000558
2000559
 ...... Now Calculating SizeFilter for   2000559
2000560
 ...... Now Calculating SizeFilter for   2000560
2000561
 ...... Now Calculating SizeFilter for   2000561
2000563
 ...... Now Calculating SizeFilter for   2000563
2000564
 ...... Now Calculating SizeFilter for   2000564
2000565
 ...... Now Calculating SizeFilter for   2000565
2000566
 ...... Now Calculating SizeFilter for   2000566
2000567
 ...... Now Calculating SizeFilter for   2000567
2000568
 ...... Now Calculating SizeFilter for   2000568
2000570
 ...... Now Calculating SizeFilter for   2000570
2000571
 ...... Now Calculating SizeFilter for   2000571
2000572
 ...... Now Calculating SizeFilter for   2000572
2000573
 ...... Now Calculating SizeFilter for   2000573
2000576
 ...... Now Calculating SizeFilter for   2000576
2000581
 ...... Now Calculating SizeFilter for   2000581
2000582
 ...... Now Calculating SizeFilter for   2000582
2000584
 ...... Now Calculating

2000762
 ...... Now Calculating SizeFilter for   2000762
2000766
 ...... Now Calculating SizeFilter for   2000766
2000767
 ...... Now Calculating SizeFilter for   2000767
2000768
 ...... Now Calculating SizeFilter for   2000768
2000776
 ...... Now Calculating SizeFilter for   2000776
2000777
 ...... Now Calculating SizeFilter for   2000777
2000778
 ...... Now Calculating SizeFilter for   2000778
2000779
 ...... Now Calculating SizeFilter for   2000779
2000780
 ...... Now Calculating SizeFilter for   2000780
2000782
 ...... Now Calculating SizeFilter for   2000782
2000783
 ...... Now Calculating SizeFilter for   2000783
2000785
 ...... Now Calculating SizeFilter for   2000785
2000786
 ...... Now Calculating SizeFilter for   2000786
2000788
 ...... Now Calculating SizeFilter for   2000788
2000789
 ...... Now Calculating SizeFilter for   2000789
2000790
 ...... Now Calculating SizeFilter for   2000790
2000791
 ...... Now Calculating SizeFilter for   2000791
2000793
 ...... Now Calculating

2000976
 ...... Now Calculating SizeFilter for   2000976
2000977
 ...... Now Calculating SizeFilter for   2000977
2000978
 ...... Now Calculating SizeFilter for   2000978
2000980
 ...... Now Calculating SizeFilter for   2000980
2000981
 ...... Now Calculating SizeFilter for   2000981
2000982
 ...... Now Calculating SizeFilter for   2000982
2000983
 ...... Now Calculating SizeFilter for   2000983
2000985
 ...... Now Calculating SizeFilter for   2000985
2000987
 ...... Now Calculating SizeFilter for   2000987
2000988
 ...... Now Calculating SizeFilter for   2000988
2000989
 ...... Now Calculating SizeFilter for   2000989
2000990
 ...... Now Calculating SizeFilter for   2000990
2000993
 ...... Now Calculating SizeFilter for   2000993
2000996
 ...... Now Calculating SizeFilter for   2000996
2000997
 ...... Now Calculating SizeFilter for   2000997
2000998
 ...... Now Calculating SizeFilter for   2000998
2000999
 ...... Now Calculating SizeFilter for   2000999
2001696
 ...... Now Calculating

2002124
 ...... Now Calculating SizeFilter for   2002124
2002125
 ...... Now Calculating SizeFilter for   2002125
2002126
 ...... Now Calculating SizeFilter for   2002126
2002127
 ...... Now Calculating SizeFilter for   2002127
2002128
 ...... Now Calculating SizeFilter for   2002128
2002129
 ...... Now Calculating SizeFilter for   2002129
2002130
 ...... Now Calculating SizeFilter for   2002130
2002131
 ...... Now Calculating SizeFilter for   2002131
2002132
 ...... Now Calculating SizeFilter for   2002132
2002133
 ...... Now Calculating SizeFilter for   2002133
2002134
 ...... Now Calculating SizeFilter for   2002134
2002135
 ...... Now Calculating SizeFilter for   2002135
2002136
 ...... Now Calculating SizeFilter for   2002136
2002137
 ...... Now Calculating SizeFilter for   2002137
2002138
 ...... Now Calculating SizeFilter for   2002138
2002139
 ...... Now Calculating SizeFilter for   2002139
2002140
 ...... Now Calculating SizeFilter for   2002140
2002141
 ...... Now Calculating

2002272
 ...... Now Calculating SizeFilter for   2002272
2002273
 ...... Now Calculating SizeFilter for   2002273
2002274
 ...... Now Calculating SizeFilter for   2002274
2002275
 ...... Now Calculating SizeFilter for   2002275
2002276
 ...... Now Calculating SizeFilter for   2002276
2002277
 ...... Now Calculating SizeFilter for   2002277
2002278
 ...... Now Calculating SizeFilter for   2002278
2002279
 ...... Now Calculating SizeFilter for   2002279
2002280
 ...... Now Calculating SizeFilter for   2002280
2002281
 ...... Now Calculating SizeFilter for   2002281
2002282
 ...... Now Calculating SizeFilter for   2002282
2002283
 ...... Now Calculating SizeFilter for   2002283
2002284
 ...... Now Calculating SizeFilter for   2002284
2002285
 ...... Now Calculating SizeFilter for   2002285
2002286
 ...... Now Calculating SizeFilter for   2002286
2002287
 ...... Now Calculating SizeFilter for   2002287
2002288
 ...... Now Calculating SizeFilter for   2002288
2002289
 ...... Now Calculating

2002417
 ...... Now Calculating SizeFilter for   2002417
2002418
 ...... Now Calculating SizeFilter for   2002418
2002419
 ...... Now Calculating SizeFilter for   2002419
2002420
 ...... Now Calculating SizeFilter for   2002420
2002421
 ...... Now Calculating SizeFilter for   2002421
2002422
 ...... Now Calculating SizeFilter for   2002422
2002423
 ...... Now Calculating SizeFilter for   2002423
2002424
 ...... Now Calculating SizeFilter for   2002424
2002425
 ...... Now Calculating SizeFilter for   2002425
2002426
 ...... Now Calculating SizeFilter for   2002426
2002427
 ...... Now Calculating SizeFilter for   2002427
2002428
 ...... Now Calculating SizeFilter for   2002428
2002429
 ...... Now Calculating SizeFilter for   2002429
2002430
 ...... Now Calculating SizeFilter for   2002430
2002431
 ...... Now Calculating SizeFilter for   2002431
2002432
 ...... Now Calculating SizeFilter for   2002432
2002433
 ...... Now Calculating SizeFilter for   2002433
2002434
 ...... Now Calculating

2002565
 ...... Now Calculating SizeFilter for   2002565
2002566
 ...... Now Calculating SizeFilter for   2002566
2002567
 ...... Now Calculating SizeFilter for   2002567
2002568
 ...... Now Calculating SizeFilter for   2002568
2002569
 ...... Now Calculating SizeFilter for   2002569
2002570
 ...... Now Calculating SizeFilter for   2002570
2002571
 ...... Now Calculating SizeFilter for   2002571
2002572
 ...... Now Calculating SizeFilter for   2002572
2002573
 ...... Now Calculating SizeFilter for   2002573
2002574
 ...... Now Calculating SizeFilter for   2002574
2002575
 ...... Now Calculating SizeFilter for   2002575
2002576
 ...... Now Calculating SizeFilter for   2002576
2002577
 ...... Now Calculating SizeFilter for   2002577
2002578
 ...... Now Calculating SizeFilter for   2002578
2002579
 ...... Now Calculating SizeFilter for   2002579
2002580
 ...... Now Calculating SizeFilter for   2002580
2002581
 ...... Now Calculating SizeFilter for   2002581
2002582
 ...... Now Calculating

2002714
 ...... Now Calculating SizeFilter for   2002714
2002715
 ...... Now Calculating SizeFilter for   2002715
2002716
 ...... Now Calculating SizeFilter for   2002716
2002717
 ...... Now Calculating SizeFilter for   2002717
2002718
 ...... Now Calculating SizeFilter for   2002718
2002719
 ...... Now Calculating SizeFilter for   2002719
2002721
 ...... Now Calculating SizeFilter for   2002721
2002722
 ...... Now Calculating SizeFilter for   2002722
2002723
 ...... Now Calculating SizeFilter for   2002723
2002724
 ...... Now Calculating SizeFilter for   2002724
2002725
 ...... Now Calculating SizeFilter for   2002725
2002726
 ...... Now Calculating SizeFilter for   2002726
2002727
 ...... Now Calculating SizeFilter for   2002727
2002728
 ...... Now Calculating SizeFilter for   2002728
2002729
 ...... Now Calculating SizeFilter for   2002729
2002730
 ...... Now Calculating SizeFilter for   2002730
2002731
 ...... Now Calculating SizeFilter for   2002731
2002732
 ...... Now Calculating

2002869
 ...... Now Calculating SizeFilter for   2002869
2002870
 ...... Now Calculating SizeFilter for   2002870
2002871
 ...... Now Calculating SizeFilter for   2002871
2002872
 ...... Now Calculating SizeFilter for   2002872
2002873
 ...... Now Calculating SizeFilter for   2002873
2002875
 ...... Now Calculating SizeFilter for   2002875
2002876
 ...... Now Calculating SizeFilter for   2002876
2002877
 ...... Now Calculating SizeFilter for   2002877
2002878
 ...... Now Calculating SizeFilter for   2002878
2002879
 ...... Now Calculating SizeFilter for   2002879
2002880
 ...... Now Calculating SizeFilter for   2002880
2002881
 ...... Now Calculating SizeFilter for   2002881
2002882
 ...... Now Calculating SizeFilter for   2002882
2002883
 ...... Now Calculating SizeFilter for   2002883
2002884
 ...... Now Calculating SizeFilter for   2002884
2002885
 ...... Now Calculating SizeFilter for   2002885
2002886
 ...... Now Calculating SizeFilter for   2002886
2002887
 ...... Now Calculating

2300015
 ...... Now Calculating SizeFilter for   2300015
2300016
 ...... Now Calculating SizeFilter for   2300016
2300017
 ...... Now Calculating SizeFilter for   2300017
2300018
 ...... Now Calculating SizeFilter for   2300018
2300019
 ...... Now Calculating SizeFilter for   2300019
2300020
 ...... Now Calculating SizeFilter for   2300020
2300021
 ...... Now Calculating SizeFilter for   2300021
2300022
 ...... Now Calculating SizeFilter for   2300022
2300023
 ...... Now Calculating SizeFilter for   2300023
2300024
 ...... Now Calculating SizeFilter for   2300024
2300025
 ...... Now Calculating SizeFilter for   2300025
2300026
 ...... Now Calculating SizeFilter for   2300026
2300027
 ...... Now Calculating SizeFilter for   2300027
2300029
 ...... Now Calculating SizeFilter for   2300029
2300030
 ...... Now Calculating SizeFilter for   2300030
2300031
 ...... Now Calculating SizeFilter for   2300031
2300032
 ...... Now Calculating SizeFilter for   2300032
2300033
 ...... Now Calculating

2300163
 ...... Now Calculating SizeFilter for   2300163
2300164
 ...... Now Calculating SizeFilter for   2300164
2300165
 ...... Now Calculating SizeFilter for   2300165
2300166
 ...... Now Calculating SizeFilter for   2300166
2300167
 ...... Now Calculating SizeFilter for   2300167
2300168
 ...... Now Calculating SizeFilter for   2300168
2300169
 ...... Now Calculating SizeFilter for   2300169
2300170
 ...... Now Calculating SizeFilter for   2300170
2300171
 ...... Now Calculating SizeFilter for   2300171
2300172
 ...... Now Calculating SizeFilter for   2300172
2300173
 ...... Now Calculating SizeFilter for   2300173
2300174
 ...... Now Calculating SizeFilter for   2300174
2300175
 ...... Now Calculating SizeFilter for   2300175
2300176
 ...... Now Calculating SizeFilter for   2300176
2300177
 ...... Now Calculating SizeFilter for   2300177
2300178
 ...... Now Calculating SizeFilter for   2300178
2300179
 ...... Now Calculating SizeFilter for   2300179
2300180
 ...... Now Calculating

2300309
 ...... Now Calculating SizeFilter for   2300309
2300310
 ...... Now Calculating SizeFilter for   2300310
2300311
 ...... Now Calculating SizeFilter for   2300311
2300312
 ...... Now Calculating SizeFilter for   2300312
2300313
 ...... Now Calculating SizeFilter for   2300313
2300314
 ...... Now Calculating SizeFilter for   2300314
2300315
 ...... Now Calculating SizeFilter for   2300315
2300316
 ...... Now Calculating SizeFilter for   2300316
2300317
 ...... Now Calculating SizeFilter for   2300317
2300318
 ...... Now Calculating SizeFilter for   2300318
2300319
 ...... Now Calculating SizeFilter for   2300319
2300320
 ...... Now Calculating SizeFilter for   2300320
2300321
 ...... Now Calculating SizeFilter for   2300321
2300322
 ...... Now Calculating SizeFilter for   2300322
2300323
 ...... Now Calculating SizeFilter for   2300323
2300324
 ...... Now Calculating SizeFilter for   2300324
2300325
 ...... Now Calculating SizeFilter for   2300325
2300326
 ...... Now Calculating

2300456
 ...... Now Calculating SizeFilter for   2300456
2300457
 ...... Now Calculating SizeFilter for   2300457
2300458
 ...... Now Calculating SizeFilter for   2300458
2300459
 ...... Now Calculating SizeFilter for   2300459
2300460
 ...... Now Calculating SizeFilter for   2300460
2300461
 ...... Now Calculating SizeFilter for   2300461
2300462
 ...... Now Calculating SizeFilter for   2300462
2300463
 ...... Now Calculating SizeFilter for   2300463
2300464
 ...... Now Calculating SizeFilter for   2300464
2300465
 ...... Now Calculating SizeFilter for   2300465
2300466
 ...... Now Calculating SizeFilter for   2300466
2300467
 ...... Now Calculating SizeFilter for   2300467
2300468
 ...... Now Calculating SizeFilter for   2300468
2300469
 ...... Now Calculating SizeFilter for   2300469
2300470
 ...... Now Calculating SizeFilter for   2300470
2300471
 ...... Now Calculating SizeFilter for   2300471
2300472
 ...... Now Calculating SizeFilter for   2300472
2300473
 ...... Now Calculating

2300603
 ...... Now Calculating SizeFilter for   2300603
2300604
 ...... Now Calculating SizeFilter for   2300604
2300605
 ...... Now Calculating SizeFilter for   2300605
2300606
 ...... Now Calculating SizeFilter for   2300606
2300607
 ...... Now Calculating SizeFilter for   2300607
2300608
 ...... Now Calculating SizeFilter for   2300608
2300609
 ...... Now Calculating SizeFilter for   2300609
2300610
 ...... Now Calculating SizeFilter for   2300610
2300611
 ...... Now Calculating SizeFilter for   2300611
2300612
 ...... Now Calculating SizeFilter for   2300612
2300613
 ...... Now Calculating SizeFilter for   2300613
2300615
 ...... Now Calculating SizeFilter for   2300615
2300616
 ...... Now Calculating SizeFilter for   2300616
2300617
 ...... Now Calculating SizeFilter for   2300617
2300618
 ...... Now Calculating SizeFilter for   2300618
2300619
 ...... Now Calculating SizeFilter for   2300619
2300620
 ...... Now Calculating SizeFilter for   2300620
2300621
 ...... Now Calculating

2300755
 ...... Now Calculating SizeFilter for   2300755
2300756
 ...... Now Calculating SizeFilter for   2300756
2300757
 ...... Now Calculating SizeFilter for   2300757
2300758
 ...... Now Calculating SizeFilter for   2300758
2300759
 ...... Now Calculating SizeFilter for   2300759
2300760
 ...... Now Calculating SizeFilter for   2300760
2300761
 ...... Now Calculating SizeFilter for   2300761
2300762
 ...... Now Calculating SizeFilter for   2300762
2300763
 ...... Now Calculating SizeFilter for   2300763
2300765
 ...... Now Calculating SizeFilter for   2300765
2300766
 ...... Now Calculating SizeFilter for   2300766
2300767
 ...... Now Calculating SizeFilter for   2300767
2300768
 ...... Now Calculating SizeFilter for   2300768
2300769
 ...... Now Calculating SizeFilter for   2300769
2300770
 ...... Now Calculating SizeFilter for   2300770
2300771
 ...... Now Calculating SizeFilter for   2300771
2300772
 ...... Now Calculating SizeFilter for   2300772
2300773
 ...... Now Calculating

In [50]:
kk

Unnamed: 0,size_filter,date,skey
0,0.0,20200506,1600000
1,0.0,20200507,1600000
2,0.0,20200508,1600000
3,0.0,20200511,1600000
4,0.0,20200512,1600000
...,...,...,...
99,92178.0,20200924,1600000
100,92178.0,20200925,1600000
101,98494.0,20200928,1600000
102,98494.0,20200929,1600000


In [18]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    """Connect to MongoDB at `host` and return the database named `db_name`.

    The ``admin`` and ``root`` users authenticate against the ``admin``
    database; every other user authenticates against `db_name`.
    """
    if user in ('admin', 'root'):
        auth_source = 'admin'
    else:
        auth_source = db_name
    mongo_uri = 'mongodb://%s:%s@%s/?authSource=%s' % (user, passwd, host, auth_source)
    # NOTE(review): maxPoolSize=None presumably removes the connection-pool
    # cap -- confirm against the installed PyMongo version.
    return pymongo.MongoClient(mongo_uri, maxPoolSize=None)[db_name]

def build_query(start_date=None, end_date=None, index_id=None):
    """Build a MongoDB filter on ``date`` and ``index_id``.

    Args:
        start_date: Inclusive lower bound for ``date``. Accepts an integer
            (Python or numpy), an 8-character ``YYYYMMDD`` string, or a
            ``datetime.date`` / ``datetime.datetime``.
        end_date: Inclusive upper bound for ``date``; same accepted types.
        index_id: A single id, or a list/tuple of ids, each convertible
            with ``int()``. Falsy values (``None``, ``0``, empty sequence)
            add no ``index_id`` condition.

    Returns:
        A dict usable as a query filter; empty when no argument
        contributes a condition.

    Raises:
        Exception: If a date string is not 8 characters long or a date has
            an unsupported type.
    """
    def parse_date(x):
        # bool is a subclass of int; keep rejecting it as the original
        # exact-type check did.
        if isinstance(x, bool):
            raise Exception("invalid `date` type: " + str(type(x)))
        if isinstance(x, (int, np.integer)):
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise Exception("`date` must be YYYYMMDD format")
            return int(x)
        # datetime.datetime is a subclass of datetime.date, so one check
        # covers both.
        if isinstance(x, datetime.date):
            # strftime returns a plain str, which has no .astype(); convert
            # with int() instead.
            return int(x.strftime("%Y%m%d"))
        raise Exception("invalid `date` type: " + str(type(x)))

    query = {}

    if start_date is not None or end_date is not None:
        date_range = {}
        if start_date is not None:
            date_range['$gte'] = parse_date(start_date)
        if end_date is not None:
            date_range['$lte'] = parse_date(end_date)
        query['date'] = date_range

    if index_id:
        if isinstance(index_id, (list, tuple)):
            query['index_id'] = {'$in': [int(x) for x in index_id]}
        else:
            query['index_id'] = int(index_id)

    return query

def build_filter_query(start_date=None, end_date=None, skey=None):
    """Build a MongoDB filter on ``date`` and ``skey``.

    Args:
        start_date: Inclusive lower bound for ``date``. Accepts an integer
            (Python or numpy), an 8-character ``YYYYMMDD`` string, or a
            ``datetime.date`` / ``datetime.datetime``.
        end_date: Inclusive upper bound for ``date``; same accepted types.
        skey: A single key, or a list/tuple of keys, each convertible with
            ``int()``. Falsy values (``None``, ``0``, empty sequence) add
            no ``skey`` condition.

    Returns:
        A dict usable as a query filter; empty when no argument
        contributes a condition.

    Raises:
        Exception: If a date string is not 8 characters long or a date has
            an unsupported type.
    """
    def parse_date(x):
        # bool is a subclass of int; keep rejecting it as the original
        # exact-type check did.
        if isinstance(x, bool):
            raise Exception("invalid `date` type: " + str(type(x)))
        if isinstance(x, (int, np.integer)):
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise Exception("`date` must be YYYYMMDD format")
            return int(x)
        # datetime.datetime is a subclass of datetime.date, so one check
        # covers both.
        if isinstance(x, datetime.date):
            # strftime returns a plain str, which has no .astype(); convert
            # with int() instead.
            return int(x.strftime("%Y%m%d"))
        raise Exception("invalid `date` type: " + str(type(x)))

    query = {}

    if start_date is not None or end_date is not None:
        date_range = {}
        if start_date is not None:
            date_range['$gte'] = parse_date(start_date)
        if end_date is not None:
            date_range['$lte'] = parse_date(end_date)
        query['date'] = date_range

    if skey:
        if isinstance(skey, (list, tuple)):
            query['skey'] = {'$in': [int(x) for x in skey]}
        else:
            query['skey'] = int(skey)

    return query

def write_data(db, name, df):
    """Insert the rows of `df` that are newer than what `name` already holds.

    For each distinct ``index_id`` in `df`:
      * if the collection already has documents with that ``index_id``,
        only rows whose ``date`` is strictly greater than the latest stored
        ``date`` are kept;
      * otherwise every row for that ``index_id`` is kept.
    All kept rows are written with a single ``insert_many``. Nothing is
    written (and no error is raised) when there is nothing to insert.

    Args:
        db: Mapping-like database handle; ``db[name]`` must yield the
            target collection.
        name: Collection name.
        df: DataFrame with at least ``index_id`` and ``date`` columns.
    """
    collection = db[name]
    # Fetch the stored ids once instead of once per symbol.
    stored_ids = set(collection.distinct('index_id'))

    pending = []
    for symbol in df['index_id'].unique():
        if symbol in stored_ids:
            symbol = int(symbol)
            # Latest stored date for this id: first document when sorted by
            # date descending.
            latest = collection.find_one({'index_id': symbol}, sort=[('date', -1)])['date']
            new_rows = df[(df['index_id'] == symbol) & (df['date'] > latest)]
        else:
            print(symbol)
            new_rows = df[df['index_id'] == symbol]
        print(new_rows)
        pending.append(new_rows)

    # pd.concat rejects an empty list and insert_many rejects an empty
    # document list, so bail out early in both cases.
    if not pending:
        return
    records = pd.concat(pending).reset_index(drop=True).to_dict('records')
    if not records:
        return
    collection.insert_many(records)
    
def write_filter_data(db, name, df):
    """Insert the rows of `df` that are newer than what `name` already holds.

    For each distinct ``skey`` in `df`:
      * if the collection already has documents with that ``skey``, only
        rows whose ``date`` is strictly greater than the latest stored
        ``date`` are kept;
      * otherwise every row for that ``skey`` is kept.
    All kept rows are written with a single ``insert_many``. Nothing is
    written (and no error is raised) when there is nothing to insert.

    Args:
        db: Mapping-like database handle; ``db[name]`` must yield the
            target collection.
        name: Collection name.
        df: DataFrame with at least ``skey`` and ``date`` columns.
    """
    collection = db[name]
    # Fetch the stored keys once instead of once per symbol.
    stored_keys = set(collection.distinct('skey'))

    pending = []
    for symbol in df['skey'].unique():
        if symbol in stored_keys:
            symbol = int(symbol)
            # Latest stored date for this key: first document when sorted
            # by date descending.
            latest = collection.find_one({'skey': symbol}, sort=[('date', -1)])['date']
            new_rows = df[(df['skey'] == symbol) & (df['date'] > latest)]
        else:
            print(symbol)
            new_rows = df[df['skey'] == symbol]
        print(new_rows)
        pending.append(new_rows)

    # pd.concat rejects an empty list and insert_many rejects an empty
    # document list, so bail out early in both cases.
    if not pending:
        return
    records = pd.concat(pending).reset_index(drop=True).to_dict('records')
    if not records:
        return
    collection.insert_many(records)
    
def write_weight_data(db, name, df, index_id):
    """Set the ``weight`` field on stored documents from the rows of `df`.

    Only rows of `df` whose ``index_id`` equals `index_id` are used. For
    each ``(date, skey)`` pair among them, the first ``weight`` value of the
    pair is written to one document matching ``skey``, ``date`` and
    `index_id`. No document is created when none matches.

    Args:
        db: Mapping-like database handle; ``db[name]`` must yield the
            target collection.
        name: Collection name.
        df: DataFrame with ``index_id``, ``date``, ``skey`` and ``weight``
            columns.
        index_id: Index identifier used both to filter `df` and in the
            update filter.
    """
    collection = db[name]
    selected = df[df['index_id'] == index_id]
    for (date, skey), sub_df in selected.groupby(['date', 'skey']):
        weight = sub_df['weight'].values[0]
        print(weight)
        # Collection.update() was removed in PyMongo 4; update_one() is the
        # single-document equivalent of the old default behaviour.
        collection.update_one(
            {'skey': int(skey), 'date': int(date), 'index_id': index_id},
            {'$set': {'weight': float(weight)}},
        )

def delete_data(db, name, start_date=None, end_date=None, index_id=None):
    """Delete documents from collection `name` matching a date/index filter.

    The filter comes from ``build_query(start_date, end_date, index_id)``.
    If that filter is empty, nothing is deleted and a message is printed,
    so the whole collection cannot be wiped through this function.
    """
    target = db[name]
    criteria = build_query(start_date, end_date, index_id)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
        return None
    
def delete_filter_data(db, name, start_date=None, end_date=None, skey=None):
    """Delete documents from collection `name` matching a date/skey filter.

    The filter comes from ``build_filter_query(start_date, end_date, skey)``.
    If that filter is empty, nothing is deleted and a message is printed,
    so the whole collection cannot be wiped through this function.
    """
    target = db[name]
    criteria = build_filter_query(start_date, end_date, skey)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
        return None
 
def read_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    """Load documents from collection `name` into a DataFrame.

    Args:
        db: Mapping-like database handle; ``db[name]`` must yield the
            collection to read.
        name: Collection name.
        start_date: Inclusive lower bound for ``date`` (passed through
            unchanged).
        end_date: Inclusive upper bound for ``date`` (passed through
            unchanged).
        skey: Values for a ``$in`` condition on ``skey``.
        index_id: Values for a ``$in`` condition on ``index_id``.
        interval: Values for a ``$in`` condition on ``interval``.
        col: Field names to return; ``None`` returns every field except
            ``_id``.
        return_sdi: When `col` is given, also return ``skey``, ``date`` and
            ``interval``.

    Returns:
        A DataFrame sorted by whichever of ``date``, ``index_id`` and
        ``skey`` are present in the result; an empty DataFrame when nothing
        matches.
    """
    collection = db[name]

    # Build projection
    prj = {'_id': 0}
    if col is not None:
        fields = list(col)
        if return_sdi:
            fields = ['skey', 'date', 'interval'] + fields
        for col_name in fields:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if index_id is not None:
        query['index_id'] = {'$in': index_id}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()

    # A projection can leave out sort columns (index_id is never added by
    # return_sdi), so sort only by the ones actually returned.
    sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df

def read_filter_daily(db, name, start_date=None, end_date=None, skey=None, interval=None, col=None, return_sdi=True):
    """Load documents from collection `name` into a DataFrame.

    Args:
        db: Mapping-like database handle; ``db[name]`` must yield the
            collection to read.
        name: Collection name.
        start_date: Inclusive lower bound for ``date`` (passed through
            unchanged).
        end_date: Inclusive upper bound for ``date`` (passed through
            unchanged).
        skey: Values for a ``$in`` condition on ``skey``.
        interval: Values for a ``$in`` condition on ``interval``.
        col: Field names to return; ``None`` returns every field except
            ``_id``.
        return_sdi: When `col` is given, also return ``skey``, ``date`` and
            ``interval``.

    Returns:
        A DataFrame sorted by whichever of ``date`` and ``skey`` are
        present in the result; an empty DataFrame when nothing matches.
    """
    collection = db[name]

    # Build projection
    prj = {'_id': 0}
    if col is not None:
        fields = list(col)
        if return_sdi:
            fields = ['skey', 'date', 'interval'] + fields
        for col_name in fields:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    df = pd.DataFrame.from_records(collection.find(query, prj))
    if df.empty:
        return pd.DataFrame()

    # With return_sdi=False the projection may omit the sort columns, so
    # sort only by the ones actually returned.
    sort_keys = [key for key in ('date', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df


# Connection settings for the market-data MongoDB instance.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store and rotating this password.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

# Use the fully-qualified option name: the abbreviated 'max_columns' pattern
# is ambiguous on pandas versions that also define
# 'styler.render.max_columns', and pandas then raises OptionError.
pd.set_option('display.max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

In [None]:
import os
import glob
import datetime
import numpy as np
import pandas as pd

# Load every pickled filter frame under the input directory and append its
# new rows to the 'md_stock_sizefilter' collection.
readPath = '/mnt/e/filter/***'
# glob.glob() returns paths in arbitrary order, while write_filter_data only
# keeps rows newer than the latest stored date per skey; sort so the
# processing order is deterministic from run to run.
dataPathLs = np.array(sorted(glob.glob(readPath)))
for data_path in dataPathLs:
    # NOTE(review): read_pickle can execute arbitrary code while unpickling;
    # only point readPath at trusted files.
    filter_data = pd.read_pickle(data_path)
    write_filter_data(db1, 'md_stock_sizefilter', filter_data)