In [2]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB URI from raw credentials and return a ``DBObj`` for it.

    Args:
        host: Host part of the URI (``host`` or ``host:port``), inserted as-is.
        db_name: Database the returned ``DBObj`` operates on. It is also used
            as ``authSource`` unless ``user`` is ``'admin'`` or ``'root'``,
            in which case ``'admin'`` is used.
        user: Raw (not percent-escaped) user name.
        passwd: Raw (not percent-escaped) password.

    Returns:
        A ``DBObj`` created with the assembled URI and ``db_name``.
    """
    # Function-scope import so this definition does not depend on the import cell.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters in credentials (':', '/', '@', '%', ...) would
    # otherwise be read as URI delimiters, so they are percent-escaped here.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    Rows are grouped by (date, symbol); each group is cut into chunks of at
    most ``chunk_size`` rows and every chunk is saved as one document with the
    fields ``ver``, ``data`` (compressed pickle), ``date``, ``symbol`` and
    ``start`` (row offset of the chunk inside its group).
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Create the client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI passed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column holding the symbol key.
            db_name: Name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its parts.

        The split is purely textual (on ``:`` and ``@``), so a URI with a port
        or with ``:``/``@`` inside the credentials yields additional items.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) group found in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted. An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection name.
            df: DataFrame containing ``self.date_column`` and
                ``self.symbol_column``.

        Raises:
            Exception: If ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date_values = df[self.date_column]
        multi_date = len(date_values.unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(date_values.iloc[0])
            # Group by the column name itself, not a one-element list: newer
            # pandas yields 1-tuples as keys for list groupers. The key is
            # cast to a plain int exactly like in the multi-date branch so the
            # stored ``symbol`` always has the same type.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``self.chunk_size`` rows.

        Each document records ``start``, the positional offset of its first
        row inside ``df``; ``read`` uses it to restore the row order.
        """
        version = 1
        total = len(df)
        for start in range(0, total, self.chunk_size):
            end = min(start + self.chunk_size, total)
            df_seg = df.iloc[start:end]  # positional slice
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; 8-character string, integer
                (including numpy integers) or ``datetime.date``/``datetime``
                (including subclasses such as ``pd.Timestamp``).
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; every value is
                converted with ``int``. Falsy values (``None``, ``0``, ``''``,
                empty list/tuple) add no symbol filter.

        Returns:
            A dict usable as a ``find``/``delete_many`` filter; empty when no
            argument was given.

        Raises:
            Exception: If a date string is not 8 characters long or a date has
                an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # numbers.Integral also covers numpy integer scalars, which is
            # what a date taken from a DataFrame column usually is.
            if isinstance(x, numbers.Integral):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete all documents matching the given date range / symbols.

        Refuses to run (prints a message, returns ``None``) when no filter is
        given, so a whole collection cannot be wiped through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation and
        the result gets a fresh integer index.

        Returns:
            The concatenated DataFrame, or ``None`` when no filter was given
            (a message is printed) or nothing matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # ``collection_names`` was removed from newer PyMongo releases in
        # favour of ``list_collection_names``. The check is done on the class
        # because attribute access on a PyMongo ``Database`` instance falls
        # back to returning a collection object for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in ``table_name``.

        Missing bounds default to ``'00000000'`` / ``'99999999'`` so the query
        always carries a date filter.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        ``version`` 1 uses gzip (level 2), ``version`` 2 uses lzma (preset 1).

        Raises:
            Exception: For any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress according to ``version`` and unpickle.

        Raises:
            Exception: For an unknown ``version``.
        """
        # SECURITY NOTE(review): unpickling executes code embedded in the
        # payload, so this must only be used against a trusted database.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on pandas < 0.24.

    When the running pandas is older than 0.24 and that module is not already
    present in ``sys.modules``, a stub module exposing ``BlockManager`` (taken
    from ``pandas.core.internals``) is registered under that name. On pandas
    0.24 or newer, or when the version string cannot be parsed, nothing is
    changed.
    """
    import re
    import sys
    from types import ModuleType

    # Compare (major, minor) numerically; comparing the raw version strings
    # orders them character by character (e.g. '0.9.1' sorts after '0.24').
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the fully qualified option name: the abbreviated "max_columns" matches
# more than one option on newer pandas releases and then raises OptionError.
pd.set_option("display.max_columns", 200)


# Scratch cell: load one day of SH index snapshots (indices 16, 300, 852, 905)
# and check the pre-open / post-close filtering rules before they are applied.
year = "2020"
startDate = '20200102'
endDate = '20200630'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
less = []

# ``time`` is stored as HHMMSS followed by six sub-second digits (see how it
# is derived from QuotTime below), so this value is 15:05:00.
CLOSE_CUTOFF = 150500000000

# Only the first matching day is processed. The names readPath / dataPathLs /
# dateLs are re-bound inside the loop; the iteration itself is unaffected
# because it runs over the slice taken here.
for data in dataPathLs[:1]:
    date = os.path.basename(data)
    readPath = '/mnt/e/unzip_data/2020/SH/' + date + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []  # ids of the files that could not be read
    for i in dataPathLs:
        stock_id = int(os.path.basename(i).split('.')[0])
        try:
            # Column positions in the vendor file; after the renames below
            # they must map to the 7 value columns plus QuotTime.
            df = pd.read_csv(i, usecols=[17, 19, 20, 21, 22, 41, 42, 49])
        except Exception as exc:
            # Best effort: skip the file, but show the actual error so an
            # empty file can be told apart from a genuine read problem.
            print("empty data")
            print(i)
            print(repr(exc))
            ll.append(stock_id)
            continue
        df["StockID"] = stock_id
        SH += [df]
    if not SH:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("no readable index snapshot file matched " + readPath)
    SH = pd.concat(SH).reset_index(drop=True)

    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"], axis=1, inplace=True)
    # QuotTime is parsed below with '%Y%m%d%H%M%S%f'; integer division by 1e9
    # keeps the leading YYYYMMDD digits.
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Naive datetimes: timestamp() interprets them in the machine's local timezone.
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"], axis=1, inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: relies on the column order produced by read_csv above.
    SH.columns = ['cum_volume', 'open', 'high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)

    # Check 1: for every index, no zero-volume snapshot is stamped later than
    # its first traded snapshot. The earlier form, assert(sum(a < b)), passed
    # as soon as a single index satisfied the strict inequality and failed on
    # data where both timestamps coincide (the sum is 0 in that case, as the
    # check cells of this notebook show).
    last_zero_volume = SH[SH['cum_volume'] == 0].groupby('skey')['time'].max()
    first_traded = SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()
    assert not last_zero_volume.gt(first_traded).any()

    # Check 2: every snapshot at/after the cutoff has at least one exact
    # duplicate on the value columns (keep=False drops all duplicated rows).
    after_close = SH[SH['time'] >= CLOSE_CUTOFF]
    assert(after_close.drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                        'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)

    # Keep traded snapshots up to the latest "first snapshot after the cutoff"
    # across the indices.
    # NOTE(review): if a day has no snapshot at/after the cutoff, m_ax is NaN
    # and the filter below removes every row - confirm whether that can occur.
    m_ax = after_close.groupby('skey').first()['time'].max()
    SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= m_ax)]

# The remaining pipeline steps (ordering, price normalisation, upload to
# MongoDB) were disabled in this scratch cell and have been dropped here,
# together with the database credentials they contained; the runnable version
# is the upload cell further down in this notebook.

In [14]:
# Snapshots stamped at/after 150500000000 that have no exact duplicate on the
# listed value columns (keep=False removes every duplicated row).
(
    SH.loc[SH['time'].ge(150500000000)]
    .drop_duplicates(
        subset=['cum_volume', 'open', 'high', 'low', 'prev_close',
                'close', 'cum_amount', 'skey', 'date'],
        keep=False,
    )
)

Unnamed: 0,cum_volume,open,high,prev_close,low,close,cum_amount,skey,date,time,clockAtArrival,datetime


In [16]:
# Latest, across skeys, of each skey's first snapshot time at/after 150500000000.
(
    SH.loc[SH['time'].ge(150500000000)]
    .groupby('skey')['time']
    .first()
    .max()
)

150500710000

In [10]:
# Number of skeys whose last zero-volume snapshot is strictly earlier than
# their first snapshot with positive volume.
sum(
    SH.loc[SH['cum_volume'].eq(0)].groupby('skey')['time'].max()
    < SH.loc[SH['cum_volume'].gt(0)].groupby('skey')['time'].min()
)

0

In [6]:
# Earliest snapshot time with positive cumulative volume, per skey.
SH.loc[SH['cum_volume'].gt(0)].groupby('skey')['time'].min()

skey
1000016    92503000000
1000300    92503000000
1000852    92503000000
1000905    92503000000
Name: time, dtype: int64

In [5]:
# Latest snapshot time with zero cumulative volume, per skey.
SH.loc[SH['cum_volume'].eq(0)].groupby('skey')['time'].max()

skey
1000016    92503000000
1000300    92503000000
1000852    92503000000
1000905    92503000000
Name: time, dtype: int64

In [8]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB URI from raw credentials and return a ``DBObj`` for it.

    Args:
        host: Host part of the URI (``host`` or ``host:port``), inserted as-is.
        db_name: Database the returned ``DBObj`` operates on. It is also used
            as ``authSource`` unless ``user`` is ``'admin'`` or ``'root'``,
            in which case ``'admin'`` is used.
        user: Raw (not percent-escaped) user name.
        passwd: Raw (not percent-escaped) password.

    Returns:
        A ``DBObj`` created with the assembled URI and ``db_name``.
    """
    # Function-scope import so this definition does not depend on the import cell.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters in credentials (':', '/', '@', '%', ...) would
    # otherwise be read as URI delimiters, so they are percent-escaped here.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    Rows are grouped by (date, symbol); each group is cut into chunks of at
    most ``chunk_size`` rows and every chunk is saved as one document with the
    fields ``ver``, ``data`` (compressed pickle), ``date``, ``symbol`` and
    ``start`` (row offset of the chunk inside its group).
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Create the client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI passed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column holding the symbol key.
            db_name: Name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # maximum number of rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split ``mongodb://user:password@example.com`` into its parts.

        The split is purely textual (on ``:`` and ``@``), so a URI with a port
        or with ``:``/``@`` inside the credentials yields additional items.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) group found in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted. An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection name.
            df: DataFrame containing ``self.date_column`` and
                ``self.symbol_column``.

        Raises:
            Exception: If ``df`` has no date column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        date_values = df[self.date_column]
        multi_date = len(date_values.unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(date_values.iloc[0])
            # Group by the column name itself, not a one-element list: newer
            # pandas yields 1-tuples as keys for list groupers. The key is
            # cast to a plain int exactly like in the multi-date branch so the
            # stored ``symbol`` always has the same type.
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``self.chunk_size`` rows.

        Each document records ``start``, the positional offset of its first
        row inside ``df``; ``read`` uses it to restore the row order.
        """
        version = 1
        total = len(df)
        for start in range(0, total, self.chunk_size):
            end = min(start + self.chunk_size, total)
            df_seg = df.iloc[start:end]  # positional slice
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; 8-character string, integer
                (including numpy integers) or ``datetime.date``/``datetime``
                (including subclasses such as ``pd.Timestamp``).
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; every value is
                converted with ``int``. Falsy values (``None``, ``0``, ``''``,
                empty list/tuple) add no symbol filter.

        Returns:
            A dict usable as a ``find``/``delete_many`` filter; empty when no
            argument was given.

        Raises:
            Exception: If a date string is not 8 characters long or a date has
                an unsupported type.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            # numbers.Integral also covers numpy integer scalars, which is
            # what a date taken from a DataFrame column usually is.
            if isinstance(x, numbers.Integral):
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete all documents matching the given date range / symbols.

        Refuses to run (prints a message, returns ``None``) when no filter is
        given, so a whole collection cannot be wiped through this method.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and return them as one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation and
        the result gets a fresh integer index.

        Returns:
            The concatenated DataFrame, or ``None`` when no filter was given
            (a message is printed) or nothing matched.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for x in collection.find(query):
            x['data'] = self.deser(x['data'], x['ver'])
            segs.append(x)
        segs.sort(key=lambda x: (x['symbol'], x['date'], x['start']))
        return pd.concat([x['data'] for x in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # ``collection_names`` was removed from newer PyMongo releases in
        # favour of ``list_collection_names``. The check is done on the class
        # because attribute access on a PyMongo ``Database`` instance falls
        # back to returning a collection object for unknown names.
        if hasattr(type(self.db), 'list_collection_names'):
            return self.db.list_collection_names()
        return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored in ``table_name``.

        Missing bounds default to ``'00000000'`` / ``'99999999'`` so the query
        always carries a date filter.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` (protocol 4) and compress it.

        ``version`` 1 uses gzip (level 2), ``version`` 2 uses lzma (preset 1).

        Raises:
            Exception: For any other ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress according to ``version`` and unpickle.

        Raises:
            Exception: For an unknown ``version``.
        """
        # SECURITY NOTE(review): unpickling executes code embedded in the
        # payload, so this must only be used against a trusted database.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stand-in ``pandas.core.internals.managers`` on pandas < 0.24.

    When the running pandas is older than 0.24 and that module is not already
    present in ``sys.modules``, a stub module exposing ``BlockManager`` (taken
    from ``pandas.core.internals``) is registered under that name. On pandas
    0.24 or newer, or when the version string cannot be parsed, nothing is
    changed.
    """
    import re
    import sys
    from types import ModuleType

    # Compare (major, minor) numerically; comparing the raw version strings
    # orders them character by character (e.g. '0.9.1' sorts after '0.24').
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or tuple(int(part) for part in match.groups()) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Use the fully qualified option name: the abbreviated "max_columns" matches
# more than one option on newer pandas releases and then raises OptionError.
pd.set_option("display.max_columns", 200)


# Upload cell: for every day folder between startDate and endDate, load the SH
# index snapshots (indices 16, 300, 852, 905), normalise them and write them
# to the 'md_index' collection.
year = "2020"
startDate = '20200701'
endDate = '20200731'
readPath = '/mnt/Kevin_zhenyu/KR_daily_data' + '/***'
dataPathLs = np.array(glob.glob(readPath))
dateLs = np.array([os.path.basename(i) for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
less = []

# Credentials are taken from the environment (KR_DB_USER / KR_DB_PASSWORD) or
# prompted for, so that no secret is stored in the notebook. The connection
# object is created once instead of once per processed day.
import getpass

database_name = 'com_md_eq_cn'
user = os.environ.get("KR_DB_USER", "zhenyuy")
password = os.environ.get("KR_DB_PASSWORD") or getpass.getpass("MongoDB password for %s: " % user)
db1 = DB(os.environ.get("KR_DB_HOST", "192.168.10.178"), database_name, user, password)


def _assert_single_nonzero_value(frame, column):
    """Assert that every skey has exactly one distinct non-zero value in ``column``."""
    distinct = frame[frame[column] != 0].groupby("skey")[column].nunique()
    assert sum(distinct != 1) == 0, "more than one non-zero %s for some skey" % column


# The names readPath / dataPathLs / dateLs are re-bound inside the loop; the
# iteration is unaffected because np.sort returns a separate array.
for data in np.sort(dataPathLs):
    readPath = data + '/SH/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []  # ids of the files that could not be read for this day
    for i in dataPathLs:
        stock_id = int(os.path.basename(i).split('.')[0])
        try:
            # Column positions in the vendor file; after the renames below
            # they must map to the 7 value columns plus QuotTime.
            df = pd.read_csv(i, usecols=[17, 19, 20, 21, 22, 41, 42, 49])
        except Exception as exc:
            # Best effort: skip the file, but show the actual error so an
            # empty file can be told apart from a genuine read problem.
            print("empty data")
            print(i)
            print(repr(exc))
            ll.append(stock_id)
            continue
        df["StockID"] = stock_id
        SH += [df]
    if not SH:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError("no readable index snapshot file matched " + readPath)
    SH = pd.concat(SH).reset_index(drop=True)

    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"], axis=1, inplace=True)
    # QuotTime is parsed below with '%Y%m%d%H%M%S%f'; integer division by 1e9
    # keeps the leading YYYYMMDD digits.
    SH["date"] = int(SH["QuotTime"].iloc[0]//1000000000)
    SH["time"] = (SH['QuotTime'] - int(SH['QuotTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # Naive datetimes: timestamp() interprets them in the machine's local timezone.
    SH["clockAtArrival"] = SH["QuotTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["QuotTime"], axis=1, inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: relies on the column order produced by read_csv above.
    SH.columns = ['cum_volume', 'open', 'high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    # 1-based running number of each snapshot within its skey.
    SH["ordering"] = SH.groupby("skey").cumcount() + 1

    _assert_single_nonzero_value(SH, "open")
    _assert_single_nonzero_value(SH, "prev_close")
    # Propagate the per-skey value: prev_close to every row from 09:15:00 on,
    # open to every row that already has traded volume.
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"])
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    _assert_single_nonzero_value(SH, "open")
    _assert_single_nonzero_value(SH, "prev_close")
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)

    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
             "prev_close", "open", "high", "low", "close"]]

    display(SH["date"].iloc[0])
    print("index finished")

    db1.write('md_index', SH)

    del SH

print(less)

20200701

index finished


20200702

index finished


20200703

index finished


20200706

index finished


20200707

index finished


20200708

index finished


20200709

index finished


20200710

index finished


20200713

index finished


20200714

index finished


20200715

index finished


20200716

index finished


20200717

index finished


20200720

index finished


20200721

index finished


20200722

index finished


20200723

index finished


20200724

index finished


20200727

index finished


20200728

index finished


20200729

index finished


20200730

index finished


20200731

index finished
[]


In [10]:
# Read back the rows stored in 'md_index' for 2020-07-31 as a spot check.
db1.read(table_name='md_index', start_date=20200731, end_date=20200731)

Unnamed: 0,skey,date,time,clockAtArrival,datetime,ordering,cum_volume,cum_amount,prev_close,open,high,low,close
0,1000016,20200731,84442000000,1596156282000000,2020-07-31 08:44:42.000,1,0,0.000000e+00,3231.5193,0.0000,0.0000,0.000,3231.5193
1,1000016,20200731,84542000000,1596156342000000,2020-07-31 08:45:42.000,2,0,0.000000e+00,3231.5193,0.0000,0.0000,0.000,3231.5193
2,1000016,20200731,84542000000,1596156342000000,2020-07-31 08:45:42.000,3,0,0.000000e+00,3231.5193,0.0000,0.0000,0.000,3231.5193
3,1000016,20200731,84642000000,1596156402000000,2020-07-31 08:46:42.000,4,0,0.000000e+00,3231.5193,0.0000,0.0000,0.000,3231.5193
4,1000016,20200731,84642000000,1596156402000000,2020-07-31 08:46:42.000,5,0,0.000000e+00,3231.5193,0.0000,0.0000,0.000,3231.5193
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14247,1000905,20200731,150215270000,1596178935270000,2020-07-31 15:02:15.270,3566,179912941,2.078625e+11,6511.4360,6499.3939,6619.2616,6463.415,6579.6696
14248,1000905,20200731,150215270000,1596178935270000,2020-07-31 15:02:15.270,3567,179912941,2.078625e+11,6511.4360,6499.3939,6619.2616,6463.415,6579.6696
14249,1000905,20200731,150215270000,1596178935270000,2020-07-31 15:02:15.270,3568,179912941,2.078625e+11,6511.4360,6499.3939,6619.2616,6463.415,6579.6696
14250,1000905,20200731,150215270000,1596178935270000,2020-07-31 15:02:15.270,3569,179912941,2.078625e+11,6511.4360,6499.3939,6619.2616,6463.415,6579.6696
