# In [None]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` for ``db_name`` on ``host`` using password auth.

    Args:
        host: MongoDB host (optionally ``host:port``), inserted verbatim.
        db_name: database to operate on.
        user: plain (unescaped) user name.
        passwd: plain (unescaped) password.

    Returns:
        A ``DBObj`` bound to ``db_name``.

    The ``admin`` and ``root`` users authenticate against the ``admin``
    database; every other user authenticates against ``db_name`` itself.
    """
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Credentials must be percent-escaped, otherwise reserved characters such
    # as '@', ':', '/' or '%' in them produce an invalid or mis-parsed URI.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and fetch DataFrame segments keyed by (date, symbol) in MongoDB.

    Every document written by this class contains:
      * ``date``   - the date as a string,
      * ``symbol`` - the symbol as an int,
      * ``start``  - row offset of the segment inside its (date, symbol) group,
      * ``ver``    - serialization version understood by ``ser``/``deser``,
      * ``data``   - the compressed, pickled DataFrame slice.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column holding the symbol id.
            db_name: name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        # Maximum number of DataFrame rows stored in a single document.
        self.chunk_size = 20000
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into its pieces.

        ':' and '@' are treated as separators, so the result for the example
        form above is ``[user, password, host]``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def _iter_groups(self, df):
        """Yield ``(date, symbol, sub_df)`` for every (date, symbol) group.

        ``date`` is always a ``str`` and ``symbol`` always a Python ``int`` so
        the keys are BSON-encodable and match what ``build_query`` produces.
        """
        dates = df[self.date_column]
        if len(dates.unique()) > 1:
            keys = [self.date_column, self.symbol_column]
            for (date, symbol), sub_df in df.groupby(keys):
                yield str(date), int(symbol), sub_df
        else:
            date = str(dates.iloc[0])
            # Group by the bare column name: grouping by a one-element list
            # yields 1-tuples as keys on recent pandas versions.
            for symbol, sub_df in df.groupby(self.symbol_column):
                yield date, int(symbol), sub_df

    def write(self, table_name, df):
        """Replace the stored data for every (date, symbol) present in ``df``.

        Existing documents for each (date, symbol) pair are deleted before the
        new segments are inserted. An empty ``df`` is a no-op.

        Raises:
            Exception: if ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        for date, symbol, sub_df in self._iter_groups(df):
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` in chunks of ``chunk_size`` rows."""
        version = 1
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc keeps the slice positional whatever the index type is.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol(s).

        Args:
            start_date: inclusive lower bound; ``str`` (YYYYMMDD), ``int``,
                ``datetime.date`` or ``datetime.datetime``.
            end_date: inclusive upper bound, same accepted types.
            symbol: a single symbol or a list/tuple of symbols; each one is
                converted with ``int``. Falsy values mean "no symbol filter".

        Returns:
            A dict usable as a ``find``/``delete_many`` filter; empty when no
            constraint was given.

        Raises:
            Exception: for a date of unsupported type or wrong length.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date.
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            if isinstance(x, int):
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the filter.

        Refuses (prints a message and returns ``None``) when no constraint is
        given, so the whole table cannot be wiped by accident.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load matching segments and concatenate them into one DataFrame.

        Segments are ordered by (symbol, date, start) before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when nothing matches or
            when no constraint was given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() no longer exists in PyMongo 4. Probe the class,
        # not the instance: attribute access on a Database instance resolves
        # unknown names to collections.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in the range."""
        collection = self.db[table_name]
        dates = set()
        # Open-ended bounds are replaced by sentinels that sort before/after
        # every 8-character date string.
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: for an unknown ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress and unpickle ``s``.

        Raises:
            Exception: for an unknown ``version``.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # this is only safe while every writer to the database is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a ``pandas.core.internals.managers`` shim on pandas < 0.24.

    On those versions the function installs a stand-in module exposing
    ``BlockManager`` under that name (unless such a module is already
    loaded), presumably so that pickles referencing
    ``pandas.core.internals.managers.BlockManager`` can be loaded.
    On pandas >= 0.24, or when the version cannot be parsed, nothing is done.
    """
    import re

    # Compare (major, minor) numerically; comparing the raw version strings is
    # lexicographic and misorders e.g. '0.9' against '0.24'.
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or (int(match.group(1)), int(match.group(2))) >= (0, 24):
        return

    import sys
    from types import ModuleType
    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        shim = ModuleType(pkg_name)
        shim.BlockManager = BlockManager
        sys.modules[pkg_name] = shim


patch_pandas_pickle()







import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
import TSLPy3


# Script: for every instrument listed in the AMAC CSV, fetch its ticks through
# TSLPy3.RemoteExecute, validate them, resample them to one row per second
# and store the result in the 'md_index' collection.
ul = pd.read_csv(r'D:\work\project 17 AMAC\tickStockList_AMAC.csv')
ul = ul['StockID'].values

startDate = '20201123T'
endDate = '20201123T'

# NOTE(review): credentials are committed in plain text; they should be moved
# to configuration / environment and rotated.
database_name = 'com_md_eq_cn'
user = "zhenyuy"
password = "bnONBrzSMGoE"

# One client for the whole run instead of a new one per instrument.
db1 = DB("192.168.10.178", database_name, user, password)

for num, stock in enumerate(ul):
    tickname = 'Tick_'+ stock
    if num%10 == 0: print('Processing ' + str(num) +' AMAC '+stock)
    tsstr="""
           BegT :=%s;
           EndT :=%s + 0.99;
           setSysParam(pn_stock(),'%s');
           returnData := select ['date'],['close'],['sectional_open'],['sectional_vol'],['sectional_amount']
                         from tradetable datekey BegT to EndT of DefaultStockID() end;
           return returnData;
           """%(startDate,endDate,stock)
    Tick_Stock = pd.DataFrame(TSLPy3.RemoteExecute(tsstr,{})[1])
    # Column names come back as GBK-encoded bytes.
    Tick_Stock.columns = list(pd.Series(Tick_Stock.columns).str.decode('GBK'))
    Tick_Stock['intdate'] = Tick_Stock.date.astype(int)
    # 'date' is a fractional day count; (x - 25569) * 86400 turns it into Unix
    # seconds (presumably an OLE/Excel-style serial date - TODO confirm).
    Tick_Stock['time'] = Tick_Stock.date.map(lambda x: datetime.datetime.utcfromtimestamp(round((x - 25569) * 86400.0)))
    Tick_Stock['adjTime'] = Tick_Stock.date.map(lambda x: datetime.datetime.utcfromtimestamp(round((x - 25569) * 86400.0) - 1))
    # Minute-of-day bucket of (time - 1s), 1-based.
    Tick_Stock['minute'] = Tick_Stock.adjTime.map(lambda x: (x.hour*60 + x.minute + 1))
    assert (Tick_Stock.minute.max() >= 900) & (Tick_Stock.minute.min() <= 570)
    Tick_Stock['morning'] = np.where(Tick_Stock.minute <= 690, 1, 0)
    Tick_Stock.rename(columns = {'sectional_open':'industry_open','sectional_vol':'cum_volume','sectional_amount':'cum_amount'}, inplace=True)
    Tick_Stock = Tick_Stock[['intdate','minute','morning','time','close','industry_open','cum_volume','cum_amount']].reset_index(drop = True)
    Tick_Stock['ID'] = stock
    ## ordering per day per stock
    for intD in Tick_Stock.intdate.unique():
        Tick_Stock.loc[Tick_Stock.intdate == intD, 'ordering'] = range(0, len(Tick_Stock.loc[Tick_Stock.intdate == intD, 'ID']))
    Tick_Stock['month'] = Tick_Stock.time.dt.month + Tick_Stock.time.dt.year * 100
    test = Tick_Stock

    # date -> YYYYMMDD int, time -> HHMMSS int, datetime -> YYYYMMDDHHMMSS int.
    test['date'] = test['time'].astype(str).apply(lambda x: int(x.split(' ')[0].replace("-", "")))
    test['time'] = test['time'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(":", "")))
    test['datetime'] = test['date'] * 1000000 + test['time']
    # Microseconds since the epoch; the naive timestamp is interpreted in the
    # local timezone of the machine running the script.
    test["clockAtArrival"] = test["datetime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    test['datetime'] = test["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))
    test['time'] = test['time'].astype('int64') * 1000000
    test['skey'] = test['ID'].str[4:].astype(int) + 3000000
    test = test.rename(columns={'industry_open':"open"})
    test['open1'] = test.groupby(['skey', 'date'])['open'].transform('max')
    test = test.sort_values(by=['skey', 'date', 'ordering'])
    test['close1'] = test.groupby(['skey'])['close'].shift(1)
    # Each day has exactly one tick with cum_volume == 0; at that tick
    # open == today's open and close == yesterday's close.
    # 20180215, 20180220, 20180221 do not satisfy this: no trading on those
    # days, close = 0; for 20180215 and 20180222 close and close1 do not match
    # either.
    assert(sum(test[test['cum_volume'] == 0].groupby(['skey', 'date'])['ordering'].size() != 1) == 0)
    assert(sum(test[test['cum_volume'] == 0].groupby(['skey', 'date'])['ordering'].unique() != 0) == 0)
    zero_vol = test[test['cum_volume'] == 0]
    zero_vol_prev = test[(test['cum_volume'] == 0) & (~test['close1'].isnull())]
    try:
        assert((zero_vol['open'].min() > 0) & (test[test['open'] != test['open1']].shape[0] == 0))
        assert(sum(zero_vol_prev['close'] != zero_vol_prev['close1']) == 0)
    except AssertionError:
        # Known data exceptions are reported, not fatal; any other error
        # now propagates; the original bare except swallowed everything.
        print(test[(test['cum_volume'] == 0) & (test['open'] == 0)]['datetime'].unique())
        print(zero_vol_prev[zero_vol_prev['close'] != zero_vol_prev['close1']]['datetime'].unique())
    test = test[test['cum_volume'] != 0]
    test = test.sort_values(by=['skey', 'date', 'ordering'])
    test = test[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount",
                 "open", "close"]]
    # change to second level tick data
    k1 = test.groupby(['date', 'skey'])['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = test.groupby(['date', 'skey'])['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on=['date', 'skey'])
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)

    # Build a one-row-per-second grid from first to last tick of every
    # (date, skey); frames are collected and concatenated once.
    frames = []
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        df1['date'] = k.loc[i, 'date']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        frames.append(df1)
    df = pd.concat(frames) if frames else pd.DataFrame()
    test = pd.merge(test, df, left_on=['skey', 'datetime', 'date'], right_on=['skey', 'datetime1', 'date'], how='outer').sort_values(by=['skey', 'date', 'datetime1']).reset_index(drop=True)
    assert(test[test['datetime1'].isnull()].shape[0] == 0)
    # Seconds without a tick inherit the last known values of the same day.
    for cols in ['cum_volume', 'cum_amount', 'open', 'close']:
        test[cols] = test.groupby(['skey', 'date'])[cols].ffill()
    test.drop(["datetime"],axis=1,inplace=True)
    test = test.rename(columns={'datetime1':'datetime'})
    test['skey'] = test['skey'].astype('int32')
    test["time"] = test['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    test['SendingTime'] = test['date'] * 1000000 + test['time']
    test["clockAtArrival"] = test["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    test.drop(["SendingTime"],axis=1,inplace=True)
    test['time'] = test['time'] * 1000000

    assert(sum(test[test["open"] != 0].groupby(["skey", 'date'])["open"].nunique() != 1) == 0)
    test["open"] = np.where(test["cum_volume"] > 0, test.groupby(["skey", 'date'])["open"].transform("max"), test["open"])
    assert(sum(test[test["open"] != 0].groupby(["skey", 'date'])["open"].nunique() != 1) == 0)
    assert(test[test["cum_volume"] > 0]["open"].min() > 0)

    test['date'] = test['date'].astype('int32')
    test['cum_volume'] = test['cum_volume'].astype('int64')

    # 113500000000 / 125500000000 are 11:35:00 / 12:55:00 in the
    # HHMMSS * 1e6 encoding of 'time'.
    m_in = test[test['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = test[test['time'] >= 125500000000].groupby('skey').first()['time'].max()
    lunch = test[(test['time'] >= m_in) & (test['time'] <= m_ax)]
    # Each condition is parenthesised: '&' binds tighter than '==', so the
    # unparenthesised form only ever evaluated the first comparison.
    # Grouping is per (skey, date) so a multi-day range does not trip the
    # check (assumes cumulative values are per day - TODO confirm).
    assert((lunch.drop_duplicates(['cum_volume', 'open', 'close', 'cum_amount', 'skey', 'date'],
                                  keep=False).shape[0] == 0)
           & (sum(lunch.groupby(['skey', 'date'])['cum_volume'].nunique() != 1) == 0)
           & (sum(lunch.groupby(['skey', 'date'])['close'].nunique() != 1) == 0))
    # Drop the rows strictly between 11:35:00 and 12:55:00.
    test = pd.concat([test[test['time'] <= 113500000000], test[test['time'] >= 125500000000]])

    test = test.sort_values(by=["skey", 'date', 'time'])
    test["ordering"] = test.groupby(["skey", 'date']).cumcount() + 1
    test['ordering'] = test['ordering'].astype('int32')

    for cols in ['open', 'cum_amount', 'close']:
        test[cols] = test[cols].apply(lambda x: round(x, 4)).astype('float64')
    assert(test['time'].max() < 150500000000)

    test = test[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
             "open", "close"]]

    print("index finished")

    db1.write('md_index', test)

    del test
