In [2]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz
import numpy as np
import TSLPy3

def DB(host, db_name, user, passwd):
    """Create a :class:`DBObj` for ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``), placed verbatim in the URI.
        db_name: Database to open; also used as the auth database unless
            ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'`` is used.
        user: Raw (not percent-escaped) user name.
        passwd: Raw (not percent-escaped) password.

    Returns:
        A ``DBObj`` bound to the generated connection URI.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters (':', '/', '@', '%', ...) in credentials must be
    # percent-escaped, otherwise the URI is ambiguous or rejected by PyMongo.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Access layer for one MongoDB database.

    Two storage layouts are handled:

    * "segment" collections (``write`` / ``read`` / ``delete`` / ``list_dates``):
      each document holds a compressed, pickled DataFrame slice together with
      its ``date``, ``symbol``, ``start`` row offset and serialisation ``ver``.
    * plain record collections (``read_daily``): one document per row.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Connect to ``uri`` and select ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column that identifies the instrument.
            db_name: Name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows stored per segment document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its string parts."""
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored segments for every (date, symbol) present in ``df``.

        Existing segments for each (date, symbol) pair are deleted before the
        new ones are inserted. An empty ``df`` is a no-op.

        Args:
            table_name: Target collection.
            df: DataFrame containing ``self.date_column`` and
                ``self.symbol_column``; symbols must be convertible to ``int``.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        multi_date = len(df[self.date_column].unique()) > 1
        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                self._replace_segments(collection, str(date), int(symbol), sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on pandas >= 2.0.
            for symbol, sub_df in df.groupby(self.symbol_column):
                # int() mirrors the multi-date branch and build_query(), which
                # always queries symbols as plain ints.
                self._replace_segments(collection, date, int(symbol), sub_df)

    def _replace_segments(self, collection, date, symbol, df):
        """Delete the stored segments of (date, symbol) and write ``df`` in their place."""
        collection.delete_many({'date': date, 'symbol': symbol})
        self.write_single(collection, date, symbol, df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` as consecutive segments of at most ``self.chunk_size`` rows."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # .iloc makes the row slice positional regardless of the index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter used by the segment collections.

        Args:
            start_date, end_date: Inclusive bounds; each may be an 8-character
                ``YYYYMMDD`` string, an integer, or a ``datetime.date`` /
                ``datetime.datetime``. Dates are queried as strings.
            symbol: A single symbol or a list/tuple of symbols; each is
                converted with ``int``. Falsy values add no symbol filter.

        Returns:
            A filter dict; empty when no argument was supplied.

        Raises:
            Exception: for a malformed date string or unsupported date type.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, (int, np.integer)):  # numpy ints come straight from DataFrames
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the segments matching the filter; refuses to run without any filter."""
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load and concatenate the segments matching the filter.

        Returns:
            A DataFrame ordered by (symbol, date, start offset), or ``None``
            when nothing matches or when no filter was given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        if not segs:
            return None
        segs.sort(key=lambda seg: (seg['symbol'], seg['date'], seg['start']))
        return pd.concat([seg['data'] for seg in segs], ignore_index=True)

    def list_tables(self):
        """Return the names of the collections in this database."""
        # Look the method up on the class: attribute access on a PyMongo
        # Database instance falls back to returning a Collection, so hasattr()
        # on the instance cannot tell whether the method really exists.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        # Older PyMongo releases only provide collection_names().
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of the matching segments."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for an unknown ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def read_daily(self, table_name, start_date=None, end_date=None, index_id=None, skey=None, interval=None, col=None,
                   return_sdi=True):
        """Read row-per-document data into a DataFrame.

        Args:
            table_name: Collection to query.
            start_date, end_date: Inclusive bounds on ``date``, passed to MongoDB as given.
            index_id, skey, interval: Optional lists used as ``$in`` filters.
            col: Optional list of fields to return; ``None`` returns every field except ``_id``.
            return_sdi: When ``col`` is given, also return ``skey``, ``date`` and ``interval``.

        Returns:
            A DataFrame sorted by whichever of ``date``, ``index_id`` and
            ``skey`` are present; an empty DataFrame when nothing matches.
        """
        collection = self.db[table_name]
        # Build projection
        prj = {'_id': 0}
        if col is not None:
            fields = list(col)
            if return_sdi:
                fields = ['skey', 'date', 'interval'] + fields
            for col_name in fields:
                prj[col_name] = 1

        # Build query
        query = {}
        if skey is not None:
            query['skey'] = {'$in': skey}
        if index_id is not None:
            query['index_id'] = {'$in': index_id}
        if interval is not None:
            query['interval'] = {'$in': interval}
        date_range = {}
        if start_date is not None:
            date_range['$gte'] = start_date
        if end_date is not None:
            date_range['$lte'] = end_date
        if date_range:
            query['date'] = date_range

        # Load data
        cur = collection.find(query, prj)
        df = pd.DataFrame.from_records(cur)
        if df.empty:
            return pd.DataFrame()
        # Sort only by keys that were returned: a projection, or a table
        # without index_id, would otherwise raise KeyError.
        sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
        if sort_keys:
            df = df.sort_values(by=sort_keys)
        return df

    def deser(self, s, version):
        """Decompress and unpickle a payload produced by :meth:`ser`.

        Raises:
            Exception: for an unknown ``version``.
        """
        # SECURITY: pickle.loads executes arbitrary code embedded in the
        # payload; only read collections whose writers are trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def _pandas_major_minor(version):
    """Return the leading ``(major, minor)`` integers of ``version``, or ``None`` if unparsable."""
    import re

    match = re.match(r'\s*(\d+)\.(\d+)', str(version))
    if match is None:
        return None
    return int(match.group(1)), int(match.group(2))


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on pandas < 0.24.

    The stub exposes ``BlockManager`` under that module path so that pickles
    referencing ``pandas.core.internals.managers.BlockManager`` can be loaded.
    Nothing happens on pandas >= 0.24, when the version cannot be parsed, or
    when the module is already registered.
    """
    # Compare numerically: as plain strings '0.9' sorts after '0.24', which
    # would wrongly classify an old release as new.
    version = _pandas_major_minor(pd.__version__)
    if version is None or version >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager

    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        stub = ModuleType(pkg_name)
        stub.BlockManager = BlockManager
        sys.modules[pkg_name] = stub


patch_pandas_pickle()

def DB1(host, db_name, user, passwd):
    """Open ``db_name`` on ``host`` and return the PyMongo database handle.

    Args:
        host: MongoDB host (optionally ``host:port``), placed verbatim in the URI.
        db_name: Database to open; also used as the auth database unless
            ``user`` is ``'admin'`` or ``'root'``, in which case ``'admin'`` is used.
        user: Raw (not percent-escaped) user name.
        passwd: Raw (not percent-escaped) password.

    Returns:
        ``client[db_name]`` for a client created with ``maxPoolSize=None``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters in credentials must be percent-escaped inside a URI.
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    return client[db_name]


def write_filter_data(db, name, df):
    """Insert every row of ``df`` into collection ``name``, grouped by ``skey``.

    Rows are inserted in order of first appearance of each ``skey``. Each
    per-symbol slice is printed, and symbols not yet present in the collection
    are printed first. Existing documents are not checked for duplicates.
    An empty ``df`` is a no-op.

    Args:
        db: Mapping-like database handle (``db[name]`` returns the collection).
        name: Collection name.
        df: DataFrame with an ``skey`` column.
    """
    collection = db[name]
    if len(df) == 0:
        # pd.concat([]) and insert_many([]) both raise on empty input.
        return

    # One round trip instead of one distinct() call per symbol.
    known_skeys = collection.distinct('skey')

    slices = []
    for symbol in df['skey'].unique():
        if symbol not in known_skeys:
            print(symbol)
        symbol_rows = df[df['skey'] == symbol]
        print(symbol_rows)
        slices.append(symbol_rows)

    records = pd.concat(slices).reset_index(drop=True).to_dict('records')
    collection.insert_many(records)

def build_filter_query(start_date=None, end_date=None, skey=None):
    """Build the MongoDB filter for the row-per-document filter collections.

    Args:
        start_date, end_date: Inclusive bounds on ``date``; each may be an
            integer, an 8-character ``YYYYMMDD`` string, or a
            ``datetime.date`` / ``datetime.datetime``. Dates are queried as ints.
        skey: A single key or a list/tuple of keys; each is converted with
            ``int``. Falsy values add no ``skey`` filter.

    Returns:
        A filter dict; empty when no argument was supplied.

    Raises:
        Exception: for a malformed date string or unsupported date type.
    """
    query = {}

    def parse_date(x):
        if isinstance(x, (int, np.integer)):  # numpy ints come straight from DataFrames
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise Exception("date must be YYYYMMDD format")
            return int(x)
        if isinstance(x, datetime.date):  # also covers datetime.datetime
            # strftime returns a str, which has no .astype(); convert with int().
            return int(x.strftime("%Y%m%d"))
        raise Exception("invalid date type: " + str(type(x)))

    if start_date is not None or end_date is not None:
        query['date'] = {}
        if start_date is not None:
            query['date']['$gte'] = parse_date(start_date)
        if end_date is not None:
            query['date']['$lte'] = parse_date(end_date)

    if skey:
        if isinstance(skey, (list, tuple)):
            query['skey'] = {'$in': [int(x) for x in skey]}
        else:
            query['skey'] = int(skey)
    return query

def delete_filter_data(db, name, start_date=None, end_date=None, skey=None):
    """Delete the documents of collection ``name`` that match the given filter.

    The filter is produced by ``build_filter_query``. When it is empty, nothing
    is deleted and a message is printed, so the whole collection can never be
    wiped through this function. Always returns ``None``.
    """
    target = db[name]
    criteria = build_filter_query(start_date, end_date, skey)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
    return None

def read_filter_daily(db, name, start_date=None, end_date=None, skey=None, interval=None, col=None, return_sdi=True):
    """Read row-per-document filter data into a DataFrame.

    Args:
        db: Mapping-like database handle (``db[name]`` returns the collection).
        name: Collection name.
        start_date, end_date: Inclusive bounds on ``date``, passed to MongoDB as given.
        skey, interval: Optional lists used as ``$in`` filters.
        col: Optional list of fields to return; ``None`` returns every field except ``_id``.
        return_sdi: When ``col`` is given, also return ``skey``, ``date`` and ``interval``.

    Returns:
        A DataFrame sorted by whichever of ``date`` and ``skey`` are present;
        an empty DataFrame when nothing matches.
    """
    collection = db[name]
    # Build projection
    prj = {'_id': 0}
    if col is not None:
        fields = list(col)
        if return_sdi:
            fields = ['skey', 'date', 'interval'] + fields
        for col_name in fields:
            prj[col_name] = 1

    # Build query
    query = {}
    if skey is not None:
        query['skey'] = {'$in': skey}
    if interval is not None:
        query['interval'] = {'$in': interval}
    date_range = {}
    if start_date is not None:
        date_range['$gte'] = start_date
    if end_date is not None:
        date_range['$lte'] = end_date
    if date_range:
        query['date'] = date_range

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        return pd.DataFrame()
    # With return_sdi=False the projection may omit the sort keys; sorting by
    # a missing column would raise KeyError.
    sort_keys = [key for key in ('date', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df


import os

database_name = 'com_md_eq_cn'
# FIXME(security): credentials should not live in the notebook. They can now
# be supplied through the MONGO_USER / MONGO_PASSWORD environment variables;
# the literals below are only a legacy fallback and should be rotated and
# removed.
user = os.environ.get('MONGO_USER', "zhenyuy")
password = os.environ.get('MONGO_PASSWORD', "bnONBrzSMGoE")

# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 200)
db1 = DB1("192.168.10.178", database_name, user, password)

def getTradeDate(begT, endT):
    """Fetch dates between ``begT`` and ``endT`` from the TSL service.

    ``begT`` and ``endT`` are strings; a trailing ``'T'`` is appended to each
    before they are substituted into the TSL script. The script is executed
    with ``TSLPy3.RemoteExecute`` and element ``[1]`` of its result is turned
    into a DataFrame. Column names and the ``date`` values arrive as bytes and
    are decoded from GBK.

    Returns:
        A DataFrame with a ``date`` column of decoded strings.
    """
    script_template = """    begt := %s;
                   endt := %s;
                   begt_:=datetoint(begt);
                   endt_:=datetoint(endt);
                   dayArr:=sselect inttodate(['截止日']) from infotable 753 of 'SH000001'
                               where ['截止日']>=begt_
                                     and ['截止日']<=endt_
                                     and ['是否交易日']=1
                                     order by ['截止日'] end;
                   if not istable(dayArr) then endt1:=endt;
                   else endt1:=dayArr[0];
                   hisArr:=MarketTradeDayQk(begt,endt1);
                   dateDf := hisArr union2 dayArr;
                   dateDf := select [0] as 'date' from `dateDf end;
                   dateDf[:]['date'] := datetostr(dateDf[:]['date']);
                   return dateDf; """
    script = script_template % (begT + 'T', endT + 'T')

    payload = TSLPy3.RemoteExecute(script, {})[1]
    trade_days = pd.DataFrame(payload)

    # Both the column labels and the date strings come back GBK-encoded.
    decoded_names = pd.Series(trade_days.columns).str.decode('GBK')
    trade_days.columns = list(decoded_names)
    trade_days['date'] = trade_days['date'].str.decode('GBK')

    return trade_days

# ## Version one
# def sta_sizeFilter(stockID, startDate, endDate, regWindowSize=20, weekInterval=1):
#     database_name = 'com_md_eq_cn'
#     user = "zhenyuy"
#     password = "bnONBrzSMGoE"

#     pd.set_option('max_columns', 200)
#     db = DB("192.168.10.178", database_name, user, password)

#     print(' ...... Now Calculating SizeFilter for  ', stockID)
#     #    startTm = datetime.datetime.now()
#     stockData = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate, symbol=[stockID])
#     stockData = stockData.loc[((stockData.bid1p != 0) | (stockData.ask1p != 0)), \
#                               ['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ordering',
#                                'cum_amount', 'bid1p', 'bid1q', 'bid5q', 'ask5q']].reset_index(drop=True)
#     stockData = stockData[((stockData.time >= 93000000000) & (stockData.time <= 113000000000)) | \
#                           ((stockData.time >= 130000000000) & (stockData.time <= 150000000000))].reset_index(drop=True)
#     indexDaily = db.read_daily('index_memb', start_date=startDate, end_date=endDate, index_id=[1000905])
#     indexDaily['tradeConsDay'] = indexDaily.groupby(['date']).grouper.group_info[0]
#     indexDaily = indexDaily.groupby('date')['tradeConsDay'].first().reset_index()
#     df_train = stockData.merge(indexDaily[['date', 'tradeConsDay']], how='left', on=['date'], validate='many_to_one')

#     df_train = df_train[(df_train['time'] >= 93000000000) & (df_train['time'] < 145655000000)].reset_index(drop=True)
#     groupAllData = df_train.groupby(['skey', 'date'])
#     df_train['amountThisUpdate'] = df_train.cum_amount - groupAllData['cum_amount'].shift(1)
#     df_train['amountThisUpdate'] = np.where(pd.isnull(df_train.amountThisUpdate), df_train.cum_amount,
#                                             df_train.amountThisUpdate)

#     ### add useful day indicator
#     df_train['curNearLimit'] = np.where((df_train.ask5q == 0) | (df_train.bid5q == 0), 1.0, 0.0)
#     df_train['curNearLimit_L1'] = groupAllData['curNearLimit'].shift(1)
#     df_train['dailyCount'] = groupAllData['time'].transform('count')
#     df_train['nearLimitCount'] = groupAllData['curNearLimit'].transform('sum')
#     dateInfo = groupAllData['dailyCount', 'nearLimitCount', 'tradeConsDay'].mean().reset_index()
#     del groupAllData
#     dateInfo['useFlag'] = np.where(dateInfo['nearLimitCount'] * 2 < dateInfo['dailyCount'], 1, 0)
#     dateInfo['useConsDay'] = dateInfo['useFlag'].cumsum()
#     df_train = pd.merge(df_train, dateInfo[['date', 'tradeConsDay', 'useFlag', 'useConsDay']],
#                         how='left', on=['date', 'tradeConsDay'], validate='many_to_one')

#     df_train['weekday'] = df_train['datetime'].dt.weekday
#     sizeFilterData = df_train.groupby(['date'])['tradeConsDay'].first().reset_index()
#     sizeFilterData['amountFilter'] = np.nan
#     ## we only update on Thrusday
#     regDays = sorted(list(df_train.loc[df_train.weekday == 3, 'tradeConsDay'].unique()))

#     weekInterval = 1
#     for d in range(int(regWindowSize / 5), len(regDays), weekInterval):
#         amountFilter = np.nan
#         ## get current Thrusday
#         endTradeConsDay = regDays[d]
#         endUseConsDay = dateInfo[dateInfo['tradeConsDay'] == endTradeConsDay]['useConsDay'].values[0]
#         startUseConsDay = max(endUseConsDay - regWindowSize + 1, 1)

#         ## check 60 consecutive days
#         if dateInfo['useConsDay'].max() < 1:
#             amountFilter = np.nan
#             continue
#         startTradeConsDay = dateInfo[dateInfo['useConsDay'] == startUseConsDay]['tradeConsDay'].values[0]
#         endTradeConsDay = dateInfo[dateInfo['tradeConsDay'] == endTradeConsDay]['tradeConsDay'].values[0]
#         if (endTradeConsDay - startTradeConsDay > 59) or (endUseConsDay - startUseConsDay < 9):
#             amountFilter = np.nan
#             continue
#             ## get the Monday right after current Thursday update
#         oss_intdate = df_train.loc[df_train.tradeConsDay == endTradeConsDay, 'date'].unique()[0]
#         oss_intdate = (datetime.datetime.strptime(str(oss_intdate), '%Y%m%d') - datetime.datetime(1899, 12,
#                                                                                                   30)).days + 4
#         oss = int((datetime.datetime(1899, 12, 30) + datetime.timedelta(int(oss_intdate))).strftime('%Y%m%d'))
#         ## get the Friday right after next Thursday update
#         if d >= len(regDays) - weekInterval:
#             # we should update the data from this Friday to next Thursday
#             try:
#                 assert(df_train.loc[df_train['tradeConsDay'] == regDays[d], 'date'].values[0] == df_train.date.max())
#                 next_t = (datetime.datetime.strptime(str(df_train.date.max()), '%Y%m%d') - datetime.datetime(1899, 12,
#                                                                                                            30)).days + 7
#                 next_t = int((datetime.datetime(1899, 12, 30) + datetime.timedelta(int(next_t))).strftime('%Y%m%d'))
#                 m = 2
#                 n_next_t = getTradeDate(str(df_train.date.max()), str(next_t))['date'].astype(str).apply(lambda x: int(x.replace('-', ''))).max()
#                 while n_next_t != next_t:
#                     next_t = (datetime.datetime.strptime(str(df_train.date.max()), '%Y%m%d') - datetime.datetime(1899,
#                                                                                                                  12,
#                                                                                                                  30)).days + 7 * m
#                     next_t = int((datetime.datetime(1899, 12, 30) + datetime.timedelta(int(next_t))).strftime('%Y%m%d'))
#                     n_next_t = getTradeDate(str(df_train.date.max()), str(next_t))['date'].astype(str).apply(
#                         lambda x: int(x.replace('-', ''))).max()
#                     print(next_t)
#                     print(n_next_t)
#                     m = m + 1
#                 assert(datetime.datetime.strptime(str(next_t), '%Y%m%d').weekday() == 3)
#                 ose = (datetime.datetime.strptime(str(next_t), '%Y%m%d') - datetime.datetime(1899, 12,
#                                                                                                            30)).days + 1
#                 ose = int((datetime.datetime(1899, 12, 30) + datetime.timedelta(int(ose))).strftime('%Y%m%d'))
#                 add = getTradeDate(str(df_train.date.max()), str(ose))['date'].astype(str).apply(
#                         lambda x: int(x.replace('-', '')))
#                 for i in add:
#                     if i in sizeFilterData['date'].values:
#                         continue
#                     else:
#                         if (datetime.datetime.strptime(str(i), '%Y%m%d') - datetime.datetime.strptime(str(df_train.date.max()), '%Y%m%d')).days == 1:
#                             sizeFilterData = sizeFilterData.append(
#                                 pd.DataFrame([[i, sizeFilterData.tradeConsDay.max() + 1, sizeFilterData.loc[sizeFilterData['date'] == df_train.date.max(), 'amountFilter'].values[0]]],
#                                              columns=['date', 'tradeConsDay', 'amountFilter']))
#                         else:
#                             sizeFilterData = sizeFilterData.append(
#                             pd.DataFrame([[i, sizeFilterData.tradeConsDay.max() + 1, np.nan]],
#                                          columns=['date', 'tradeConsDay', 'amountFilter']))
#             except:
#                 ose = df_train.date.max()
#         else:
#             ose_intdate = df_train.loc[df_train.tradeConsDay == regDays[d + weekInterval], 'date'].unique()[0]
#             ose_intdate = (datetime.datetime.strptime(str(ose_intdate), '%Y%m%d') - datetime.datetime(1899, 12,
#                                                                                                       30)).days + 1
#             ose = int((datetime.datetime(1899, 12, 30) + datetime.timedelta(int(ose_intdate))).strftime('%Y%m%d'))
#         inSampleSlice = df_train[(df_train.useConsDay >= startUseConsDay) & \
#                                  (df_train.useConsDay <= endUseConsDay) & \
#                                  (df_train.useFlag == 1)].reset_index(drop=True)
#         amountFilter = inSampleSlice[(inSampleSlice['curNearLimit'] == 0) & \
#                                      (inSampleSlice['curNearLimit_L1'] == 0)].amountThisUpdate.quantile(.75)
#         if ose < oss:
#             print('out of sample end day < start day, skip')
#             continue
#         sizeFilterData.loc[(sizeFilterData.date >= oss) & (sizeFilterData.date <= ose), 'amountFilter'] = amountFilter
#     sizeFilterData['skey'] = int(stockID)
#     sizeFilterData = sizeFilterData[['skey', 'date', 'amountFilter']]
#     sizeFilterData['amountFilter'] = sizeFilterData['amountFilter'].fillna(0)
#     sizeFilterData = sizeFilterData.rename(columns={'amountFilter': 'size_filter'})
#     sizeFilterData = sizeFilterData.sort_values(by='date').reset_index(drop=True)
# #     if sizeFilterData.empty == False:
# #         write_filter_data(db1, 'md_stock_sizefilter', sizeFilterData)
#     return sizeFilterData

# Version two
def sta_sizeFilter(stockID, startDate, endDate, regWindowSize=20, weekInterval=1):
    """Compute next week's size filter for one stock and write it to MongoDB.

    The filter is the 75% quantile of the per-snapshot traded amount over the
    last ``regWindowSize`` usable days ending at the last Thursday found in the
    data. A day is usable when fewer than half of its snapshots have an empty
    fifth bid or ask level. The value is written to ``md_stock_sizefilter``
    (through ``write_filter_data`` and the module-level ``db1``) for every
    trading date from the Monday after ``endDate`` to the Friday after the next
    Thursday that is a trading day. When the window is not usable the value
    written is 0.

    Args:
        stockID: Stock key; used as the ``symbol`` filter and stored as ``skey``.
        startDate: First date (YYYYMMDD) of the snapshot data to load.
        endDate: Last date (YYYYMMDD) to load; must be a Thursday.
        regWindowSize: Number of usable days in the estimation window.
        weekInterval: Unused; kept for interface compatibility.

    Returns:
        ``0`` when there is no snapshot data or no future trading date to
        fill, otherwise ``None`` after the rows were written.

    Raises:
        AssertionError: if ``endDate`` is not a Thursday.
        RuntimeError: if no Thursday trading day is found within a year.
    """
    # Validate before any database access so a bad date fails fast.
    assert datetime.datetime.strptime(str(endDate), "%Y%m%d").weekday() == 3

    # Fully qualified name: the bare 'max_columns' pattern is ambiguous on
    # recent pandas releases.
    pd.set_option('display.max_columns', 200)
    # Reuse the module-level connection settings rather than a second
    # hard-coded copy of the credentials.
    db = DB("192.168.10.178", database_name, user, password)

    print(' ...... Now Calculating SizeFilter for  ', stockID)
    stockData = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate, symbol=[stockID])
    if stockData is None:
        # DBObj.read returns None when nothing matches.
        return 0

    snapshot_cols = ['skey', 'date', 'time', 'clockAtArrival', 'datetime', 'ordering',
                     'cum_amount', 'bid1p', 'bid1q', 'bid5q', 'ask5q']
    has_quote = (stockData.bid1p != 0) | (stockData.ask1p != 0)
    stockData = stockData.loc[has_quote, snapshot_cols].reset_index(drop=True)
    in_session = (((stockData.time >= 93000000000) & (stockData.time <= 113000000000)) |
                  ((stockData.time >= 130000000000) & (stockData.time <= 150000000000)))
    stockData = stockData[in_session].reset_index(drop=True)

    # Number the dates of the index membership table consecutively.
    indexDaily = db.read_daily('index_memb', start_date=startDate, end_date=endDate, index_id=[1000905])
    # ngroup() is the public equivalent of the private grouper.group_info[0].
    indexDaily['tradeConsDay'] = indexDaily.groupby(['date']).ngroup().values
    indexDaily = indexDaily.groupby('date')['tradeConsDay'].first().reset_index()
    df_train = stockData.merge(indexDaily[['date', 'tradeConsDay']], how='left', on=['date'], validate='many_to_one')

    df_train = df_train[(df_train['time'] >= 93000000000) & (df_train['time'] < 145655000000)].reset_index(drop=True)
    day_keys = ['skey', 'date']
    df_train['amountThisUpdate'] = df_train.cum_amount - df_train.groupby(day_keys)['cum_amount'].shift(1)
    # The first snapshot of a day has no predecessor: its amount is the cumulative amount.
    df_train['amountThisUpdate'] = np.where(pd.isnull(df_train.amountThisUpdate), df_train.cum_amount,
                                            df_train.amountThisUpdate)

    ### add useful day indicator
    df_train['curNearLimit'] = np.where((df_train.ask5q == 0) | (df_train.bid5q == 0), 1.0, 0.0)
    # Group again after each batch of new columns so the groupby always sees them.
    per_day = df_train.groupby(day_keys)
    df_train['curNearLimit_L1'] = per_day['curNearLimit'].shift(1)
    df_train['dailyCount'] = per_day['time'].transform('count')
    df_train['nearLimitCount'] = per_day['curNearLimit'].transform('sum')
    # List (not tuple) selection: tuple selection was removed in pandas 2.0.
    dateInfo = df_train.groupby(day_keys)[['dailyCount', 'nearLimitCount', 'tradeConsDay']].mean().reset_index()
    dateInfo['useFlag'] = np.where(dateInfo['nearLimitCount'] * 2 < dateInfo['dailyCount'], 1, 0)
    dateInfo['useConsDay'] = dateInfo['useFlag'].cumsum()
    df_train = pd.merge(df_train, dateInfo[['date', 'tradeConsDay', 'useFlag', 'useConsDay']],
                        how='left', on=['date', 'tradeConsDay'], validate='many_to_one')

    df_train['weekday'] = df_train['datetime'].dt.weekday
    sizeFilterData = df_train.groupby(['date'])['tradeConsDay'].first().reset_index()
    if sizeFilterData.empty:
        return 0

    # Consecutive-day numbers of the Thursdays present in the data.
    regDays = sorted(df_train.loc[df_train.weekday == 3, 'tradeConsDay'].unique())

    amountFilter = np.nan
    try:
        endTradeConsDay = regDays[-1]
        endUseConsDay = dateInfo.loc[dateInfo['tradeConsDay'] == endTradeConsDay, 'useConsDay'].values[0]
        startUseConsDay = max(endUseConsDay - regWindowSize + 1, 1)
        startTradeConsDay = dateInfo.loc[dateInfo['useConsDay'] == startUseConsDay, 'tradeConsDay'].values[0]

        # The window must span at most 60 consecutive days and contain at
        # least 10 usable ones; otherwise no filter is produced.
        window_rejected = ((dateInfo['useConsDay'].max() < 1)
                           or (endTradeConsDay - startTradeConsDay > 59)
                           or (endUseConsDay - startUseConsDay < 9))
        if not window_rejected:
            inSampleSlice = df_train[(df_train.useConsDay >= startUseConsDay) &
                                     (df_train.useConsDay <= endUseConsDay) &
                                     (df_train.useFlag == 1)].reset_index(drop=True)
            away_from_limit = (inSampleSlice['curNearLimit'] == 0) & (inSampleSlice['curNearLimit_L1'] == 0)
            amountFilter = inSampleSlice[away_from_limit].amountThisUpdate.quantile(.75)
    except Exception:
        # Best effort, as before: any lookup failure (e.g. no Thursday in the
        # data) means "no filter". KeyboardInterrupt is no longer swallowed.
        amountFilter = np.nan

    # Out-of-sample range: Monday after endDate .. Friday after the next
    # Thursday that is a trading day.
    oss = _shift_yyyymmdd(endDate, 4)
    max_weeks_ahead = 52  # guards against looping forever on an incomplete calendar
    weeks_ahead = 1
    next_t = _shift_yyyymmdd(endDate, 7)
    n_next_t = _trade_dates_as_int(df_train.date.max(), next_t).max()
    while n_next_t != next_t:
        weeks_ahead += 1
        if weeks_ahead > max_weeks_ahead:
            raise RuntimeError('no Thursday trading day found within %d weeks after %s'
                               % (max_weeks_ahead, endDate))
        next_t = _shift_yyyymmdd(endDate, 7 * weeks_ahead)
        n_next_t = _trade_dates_as_int(endDate, next_t).max()
    assert datetime.datetime.strptime(str(next_t), '%Y%m%d').weekday() == 3
    ose = _shift_yyyymmdd(next_t, 1)

    future_dates = list(_trade_dates_as_int(oss, ose))
    if not future_dates:
        return 0

    # Build the frame in one go instead of concatenating row by row.
    result = pd.DataFrame({'date': future_dates})
    in_range = (result.date >= oss) & (result.date <= ose)
    result['size_filter'] = np.where(in_range, amountFilter, np.nan)
    result['size_filter'] = result['size_filter'].fillna(0)
    result['skey'] = int(stockID)
    result = result[['skey', 'date', 'size_filter']].sort_values(by='date').reset_index(drop=True)

    write_filter_data(db1, 'md_stock_sizefilter', result)


def _shift_yyyymmdd(date, days):
    """Return the YYYYMMDD integer lying ``days`` calendar days after ``date``."""
    shifted = datetime.datetime.strptime(str(date), '%Y%m%d') + datetime.timedelta(days=days)
    return int(shifted.strftime('%Y%m%d'))


def _trade_dates_as_int(start, end):
    """Return the ``date`` column of ``getTradeDate(start, end)`` as YYYYMMDD integers."""
    dates = getTradeDate(str(start), str(end))['date']
    return dates.astype(str).apply(lambda text: int(text.replace('-', '')))

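# # Version three 0501-1016?? (compute on 0501-1015; for 1016, fill up with the value previously predicted by Version 2 on 0501-0924.
# # Open questions: can a trading suspension last particularly long, and how large is the gap to the latest date currently in the database???)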
# def sta_sizeFilter(stockID, startDate, endDate, regWindowSize = 20, weekInterval = 1): 
#     database_name = 'com_md_eq_cn'
#     user = "zhenyuy"
#     password = "bnONBrzSMGoE"

#     pd.set_option('max_columns', 200)
#     db = DB("192.168.10.178", database_name, user, password)

#     print(' ...... Now Calculating SizeFilter for  ', stockID)
#     #    startTm = datetime.datetime.now()
#     stockData = db.read('md_snapshot_l2', start_date=startDate, end_date=endDate, symbol=[stockID])
#     stockData = stockData.loc[((stockData.bid1p != 0) | (stockData.ask1p != 0)), \
#                               ['skey','date','time','clockAtArrival','datetime','ordering',
#                                'cum_amount','bid1p','bid1q','bid5q','ask5q']].reset_index(drop=True)
#     stockData = stockData[((stockData.time >= 93000000000) & (stockData.time <= 113000000000)) | \
#                           ((stockData.time >= 130000000000) & (stockData.time <= 150000000000))].reset_index(drop=True)
#     indexDaily = db.read_daily('index_memb', start_date=startDate, end_date=endDate, index_id=[1000905])
#     indexDaily['tradeConsDay'] = indexDaily.groupby(['date']).grouper.group_info[0]
#     indexDaily = indexDaily.groupby('date')['tradeConsDay'].first().reset_index()
#     df_train = stockData.merge(indexDaily[['date','tradeConsDay']], how='left', on=['date'], validate='many_to_one')
    
#     df_train = df_train[(df_train['time'] >= 93000000000) & (df_train['time'] < 145655000000)].reset_index(drop=True)
#     groupAllData = df_train.groupby(['skey','date'])
#     df_train['amountThisUpdate'] = df_train.cum_amount - groupAllData['cum_amount'].shift(1)
#     df_train['amountThisUpdate'] = np.where(pd.isnull(df_train.amountThisUpdate), df_train.cum_amount, df_train.amountThisUpdate)

#     ### add useful day indicator
#     df_train['curNearLimit'] = np.where((df_train.ask5q == 0) | (df_train.bid5q == 0), 1.0, 0.0)
#     df_train['curNearLimit_L1'] = groupAllData['curNearLimit'].shift(1)
#     df_train['dailyCount'] = groupAllData['time'].transform('count')
#     df_train['nearLimitCount'] = groupAllData['curNearLimit'].transform('sum')
#     dateInfo = groupAllData['dailyCount', 'nearLimitCount', 'tradeConsDay'].mean().reset_index()
#     del groupAllData
#     dateInfo['useFlag'] = np.where(dateInfo['nearLimitCount']*2 < dateInfo['dailyCount'], 1, 0)
#     dateInfo['useConsDay'] = dateInfo['useFlag'].cumsum()
#     df_train = pd.merge(df_train, dateInfo[['date', 'tradeConsDay', 'useFlag', 'useConsDay']],
#                         how='left', on=['date', 'tradeConsDay'], validate='many_to_one') 
    
#     df_train['weekday'] = df_train['datetime'].dt.weekday
#     sizeFilterData = df_train.groupby(['date'])['tradeConsDay'].first().reset_index()
#     sizeFilterData['amountFilter'] = np.nan
#     ## we only update on Thrusday
#     regDays = sorted(list(df_train.loc[df_train.weekday == 3, 'tradeConsDay'].unique()))      
#     weekInterval = 1    
#     for d in range(int(regWindowSize/5), len(regDays), weekInterval):
#         amountFilter = np.nan
#         ## get current Thrusday
#         endTradeConsDay = regDays[d]
#         endUseConsDay = dateInfo[dateInfo['tradeConsDay'] == endTradeConsDay]['useConsDay'].values[0]
#         startUseConsDay = max(endUseConsDay - regWindowSize + 1, 1)
        
#         ## check 60 consecutive days
#         if dateInfo['useConsDay'].max() < 1: 
#             amountFilter = np.nan
#             continue
#         startTradeConsDay = dateInfo[dateInfo['useConsDay'] == startUseConsDay]['tradeConsDay'].values[0]
#         endTradeConsDay = dateInfo[dateInfo['tradeConsDay'] == endTradeConsDay]['tradeConsDay'].values[0]
#         if (endTradeConsDay - startTradeConsDay > 59) or (endUseConsDay - startUseConsDay < 9):
#             amountFilter = np.nan
#             continue      
#         ## get the Monday right after current Thursday update
#         oss_intdate = df_train.loc[df_train.tradeConsDay == endTradeConsDay, 'date'].unique()[0]
#         oss_intdate = (datetime.datetime.strptime(str(oss_intdate), '%Y%m%d') - datetime.datetime(1899,12,30)).days + 4
#         oss = int((datetime.datetime(1899,12,30) + datetime.timedelta(int(oss_intdate))).strftime('%Y%m%d'))
#         ## get the Friday right after next Thursday update
#         if d >= len(regDays) - weekInterval:
#             ose = df_train.date.max()
#         else:
#             ose_intdate = df_train.loc[df_train.tradeConsDay == regDays[d+weekInterval], 'date'].unique()[0]
#             ose_intdate = (datetime.datetime.strptime(str(ose_intdate), '%Y%m%d') - datetime.datetime(1899,12,30)).days + 1
#             ose =  int((datetime.datetime(1899,12,30) + datetime.timedelta(int(ose_intdate))).strftime('%Y%m%d'))         
#         inSampleSlice = df_train[(df_train.useConsDay >= startUseConsDay) &\
#                                (df_train.useConsDay <= endUseConsDay) &\
#                                (df_train.useFlag == 1)].reset_index(drop=True)
#         amountFilter = inSampleSlice[(inSampleSlice['curNearLimit'] == 0) &\
#                                      (inSampleSlice['curNearLimit_L1'] == 0)].amountThisUpdate.quantile(.75)
#         print(oss)
#         print(ose)
#         if ose < oss:
#             print('out of sample end day < start day, skip')
#             continue 
#         sizeFilterData.loc[(sizeFilterData.date >= oss)&(sizeFilterData.date <= ose), 'amountFilter'] = amountFilter
#     sizeFilterData['skey'] = int(stockID)
#     sizeFilterData = sizeFilterData[['skey', 'date', 'amountFilter']]
#     sizeFilterData['amountFilter'] = sizeFilterData['amountFilter'].fillna(0)
#     sizeFilterData = sizeFilterData.rename(columns={'amountFilter': 'size_filter'})
#     sizeFilterData = sizeFilterData.sort_values(by='date').reset_index(drop=True)
#     if sizeFilterData.empty == False:
#         write_filter_data(db1, 'md_stock_sizefilter', sizeFilterData)
#     return sizeFilterData

In [6]:
# endDate must be a Thursday: sta_sizeFilter asserts weekday() == 3 on it (20201022 is one).
# NOTE(review): the recorded output below is for skey 2002299, not the 2002297
# requested here, so it appears to come from an earlier run -- re-run to refresh.
sta_sizeFilter(2002297,startDate=20200601, endDate=20201022, 
                     regWindowSize = 20, weekInterval = 1)

 ...... Now Calculating SizeFilter for   2002299
      skey      date  size_filter
0  2002299  20201026     35329.25
1  2002299  20201027     35329.25
2  2002299  20201028     35329.25
3  2002299  20201029     35329.25
4  2002299  20201030     35329.25


In [1]:
import numpy as np
import pandas as pd
import pickle
import glob
import os
from datetime import datetime
# Fully qualified option name: the bare "max_columns" pattern matches more than
# one option on recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", 200)


def unzip_data(date='20201106'):
    """Extract one day's quant360 archives with 7-Zip.

    For the given day the SH ``snapshot`` and ``tick`` archives and the SZ
    ``snapshot``, ``order`` and ``tick`` archives are extracted into
    ``<backup share>\\<date>\\SH`` and ``...\\SZ``. Files that already exist are
    skipped (7-Zip ``-aos``), so the function can be re-run for the same day.

    Args:
        date: Day to extract as a ``YYYYMMDD`` string; the year and month
            directories of the source path are derived from it.
    """
    exe_path = 'G:\\7z1900-extra\\7za.exe'
    source_root = 'L:\\KR\\data\\quant360_data\\' + date[:4] + '\\' + date[:6] + '\\' + date
    target_root = '\\\\192.168.10.34\\random_backup\\Kevin_zhenyu\\KR_daily_data\\' + date
    # exist_ok: a second run for the same day must not fail on the directory.
    os.makedirs(target_root, exist_ok=True)

    # (exchange, archive) pairs, in the original extraction order.
    archives = (('SH', 'snapshot'), ('SH', 'tick'),
                ('SZ', 'snapshot'), ('SZ', 'order'), ('SZ', 'tick'))
    for exchange, archive in archives:
        rar_path = source_root + '\\' + exchange + '\\' + archive + '.7z'
        un_path = target_root + '\\' + exchange
        cmd = '{} x {} -o{} -aos -r'.format(exe_path, rar_path, un_path)
        status = os.system(cmd)
        if status != 0:
            # Keep going with the remaining archives, but do not fail silently.
            print('7za exited with status {} for {}'.format(status, rar_path))


if __name__ == '__main__':
    unzip_data()

