In [45]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB connection URI and return a ``DBObj`` bound to ``db_name``.

    Args:
        host: Host (optionally ``host:port``) placed verbatim in the URI.
        db_name: Database to operate on.  It is also used as the
            authentication database unless ``user`` is ``'admin'`` or ``'root'``,
            in which case ``'admin'`` is used.
        user: User name; percent-escaped before being put in the URI.
        passwd: Password; percent-escaped before being put in the URI.

    Returns:
        A ``DBObj`` connected with the generated URI.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as '@', ':', '/' or '%' inside the credentials
    # would otherwise be parsed as URI delimiters, so escape them first.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    A DataFrame is split per (date, symbol) group and then into chunks of at
    most ``chunk_size`` rows.  Every chunk is pickled, compressed and inserted
    as one document with the keys ``ver``, ``data``, ``date``, ``symbol`` and
    ``start``.  ``date`` is stored as a string and ``symbol`` as an int.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI handed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column used as the ``symbol`` key.
            db_name: Name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000          # max rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into a list of parts.

        The scheme prefix and surrounding slashes are removed and the rest is
        split on ``:``, ``@`` and spaces, so extra delimiters in the URI
        produce extra items.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) group found in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted.  An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection name.
            df: DataFrame containing the ``date`` column and the symbol column.

        Raises:
            Exception: If ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the column name (not a one-element list): newer pandas
            # yields 1-tuples as keys for list groupers.  Convert the key to a
            # plain int so it matches the multi-date branch and build_query().
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each document records the row offset of its chunk in ``start`` so that
        ``read`` can restore the original order.
        """
        version = 1  # serialisation format understood by ser()/deser()
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc makes the positional intent explicit regardless of the index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; 8-character string, integer
                (Python or numpy) or ``datetime.date``/``datetime.datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; each is
                converted with ``int()``.  A falsy value (``None``, ``0``,
                empty list) adds no symbol filter.

        Returns:
            dict: The filter; empty when no argument restricts the query.

        Raises:
            Exception: If a date has an unsupported type or a string date is
                not 8 characters long.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # Python int and numpy integers
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol.

        Prints a message and returns ``None`` without deleting anything when
        no filter is given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and concatenate them into one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when nothing matches or
            when no filter is given (a message is printed in that case).
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() was removed in PyMongo 4.  On old PyMongo an
        # unknown attribute resolves to a Collection object, and calling it
        # raises TypeError, which selects the legacy method.
        try:
            return self.db.list_collection_names()
        except TypeError:
            return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the selection.

        Missing bounds default to ``'00000000'`` / ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: If ``version`` is neither 1 nor 2.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: If ``version`` is neither 1 nor 2.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # this is only safe while every writer to the database is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on pandas < 0.24.

    When the installed pandas is older than 0.24 and that module is not in
    ``sys.modules``, a module object exposing ``BlockManager`` is registered
    under that name.  Presumably this lets old pandas unpickle frames that
    reference the newer module path -- confirm against the writers.

    The version is compared numerically by (major, minor); a plain string
    comparison would misorder versions such as ``'0.9'`` and ``'0.24'``.
    If the version string cannot be parsed nothing is patched.
    """
    import re
    import sys
    from types import ModuleType

    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None:
        return
    if (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Script: for every date directory in the configured range, extract the SH
# snapshot archive, load four per-instrument CSV files, clean them into a
# one-row-per-second table per skey and write the result to the MongoDB
# collection 'md_index'.
# NOTE(review): newer pandas only accepts the full option name
# "display.max_columns" -- confirm against the installed version.
pd.set_option("max_columns", 200)


year = "2017"
startDate = '20170701'
endDate = '20171231'
readPath = '/mnt/usb/data/' + year + '/***/***'
dataPathLs = np.array(glob.glob(readPath))
# The part of the directory name before the first '_' is compared as a date string.
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
wr_ong = []
mi_ss = []
# Date directories that are in the trading-day list but have nothing under SH/.
less = []

for data in dataPathLs:
    # No SH data: skip silently when the date is not a listed trading day,
    # otherwise report it and remember the path in `less`.
    if len(np.array(glob.glob(data + '/SH/***'))) == 0:
        if int(os.path.basename(data)) not in date_list["Date"].values:
            continue
        else:
            print(os.path.basename(data) + " less data!!!!!!!!!!!!!!!!!")
            less.append(data)
            continue
    startTm = datetime.datetime.now()
    date = os.path.basename(data)
    rar_path = data + '/SH/snapshot.7z'
    path = '/mnt/e/unzip_data/2017/SH'
    path1 = path + '/' + date
    un_path = path1
    # Extract the archive with the external 7za tool; the exit status is not checked.
    cmd = '7za x {} -o{}'.format(rar_path, un_path)
    os.system(cmd)
    print(datetime.datetime.now() - startTm)
    print(date + ' unzip finished')
    
    # NOTE: readPath/dataPathLs/dateLs are rebound here; the outer `for` keeps
    # iterating over the array it was started with.
    readPath = path1 + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    # Keep only the files whose numeric stem is 16, 300, 852 or 905
    # (presumably index codes -- confirm).
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []
    for i in dataPathLs:
        # NOTE(review): the bare except treats every read failure as "empty data".
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except:
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    # NOTE(review): raises NameError if no file could be read (df never bound).
    del df
    SH = pd.concat(SH).reset_index(drop=True)
    
    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # The date of the first row is applied to every row; `time` is the remainder
    # of SendingTime after removing that date part, multiplied by 1000.
    # Presumably SendingTime is YYYYMMDDHHMMSSmmm -- confirm against the feed.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # clockAtArrival: SendingTime parsed as a naive local datetime, as epoch microseconds.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: the seven remaining CSV columns followed by the derived ones.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey', 
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every timestamp must fall on a whole second.
    assert(sum(SH['time']%1000000) == 0)
    # NOTE(review): sum(...) is truthy as soon as ONE skey has its last
    # zero-volume row before its first traded row; if all skeys were meant,
    # this check is too weak -- confirm intent.
    assert(sum(SH[SH['cum_volume'] == 0].groupby('skey')['time'].max() 
               < SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()))
    # Closing cut-off: try 15:05:00, then 15:07:00, then 15:08:00.  A cut-off is
    # accepted when, from m_ax onwards, every row has a duplicate on the compared
    # columns and `close` is constant per skey; rows with cum_volume == 0 or
    # later than the cut-off are then dropped.
    # NOTE(review): the bare excepts also swallow errors other than AssertionError.
    m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
    try:
        assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
               (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
        SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]
    except:
        try:
            m_ax = SH[SH['time'] <= 150700000000].groupby('skey').last()['time'].min()
            assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                                   'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
                   (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
            SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150700000000)]
        except:
            m_ax = SH[SH['time'] <= 150800000000].groupby('skey').last()['time'].min()
            assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                                   'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
                   (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
            SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150800000000)]

    
    # Build, per skey, one timestamp per second between its first and last row.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])
    
    # Outer-join the data onto the per-second grid and forward-fill the value
    # columns within each skey so every second carries the latest snapshot.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute time as the integer HHMMSS of the grid timestamp, rebuild
    # clockAtArrival from date + time, then scale time by 1e6.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    SH['SendingTime'] = SH['date'] * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000
    
    # Non-zero open / prev_close must be unique per skey, before and after
    # the per-skey maximum is broadcast below.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    # From 09:15:00 on, prev_close is set to the skey's maximum; open is set to
    # the skey's maximum on every row with cum_volume > 0.
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"]) 
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)
    
    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount", 
             "prev_close", "open", "high", "low", "close"]]    
    # Midday gap: rows strictly between 11:35:00 and 12:55:00 are removed when
    # the data in [m_in, m_ax] is static (only duplicated rows, constant
    # cum_volume and close per skey).  Otherwise the boundary is moved to `tt`,
    # the latest time at which a new distinct row appears, and the check is repeated.
    m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
    try:
        assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) & 
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
        SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])
    except:
        print(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].unique())
        tt = SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].last().unique().max()
        # tt before 12:15:00 moves the morning boundary, otherwise the afternoon one.
        if tt < 121500000000:
            m_in = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) & 
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= m_in], SH[SH['time'] >= 125500000000]])
        else:
            m_ax = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) & 
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= m_ax]])
    
    # `ordering` is the 1-based row number within each skey after sorting.
    SH = SH.sort_values(by=['skey', 'time', 'cum_volume'])
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')
    
    # Only these columns are stored; prev_close/high/low are dropped here.
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount", 
             "open", "close"]]
            
    # `display` is provided by IPython/Jupyter, not by plain Python.
    display(SH["date"].iloc[0])
    print("index finished")
    
    # NOTE(review): credentials are hard-coded in the source; consider loading
    # them from the environment or a config file kept out of version control.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    # A new DB object (and MongoClient) is created for every processed date.
    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)
    
    del SH

print(less)

0:00:00.795793
20170703 unzip finished


20170703

index finished
0:00:00.440587
20170704 unzip finished


20170704

index finished
0:00:00.577588
20170705 unzip finished


20170705

index finished
0:00:00.671740
20170706 unzip finished


20170706

index finished
0:00:00.499465
20170707 unzip finished


20170707

index finished
0:00:00.469788
20170710 unzip finished


20170710

index finished
0:00:00.473170
20170711 unzip finished


20170711

index finished
0:00:00.447462
20170712 unzip finished


20170712

index finished
0:00:00.527044
20170713 unzip finished


20170713

index finished
0:00:00.445162
20170714 unzip finished


20170714

index finished
0:00:00.520915
20170717 unzip finished


20170717

index finished
0:00:00.448121
20170718 unzip finished


20170718

index finished
0:00:00.505611
20170719 unzip finished


20170719

index finished
0:00:00.446058
20170720 unzip finished


20170720

index finished
0:00:00.446710
20170721 unzip finished


20170721

index finished
0:00:00.467083
20170724 unzip finished


20170724

index finished
0:00:00.447394
20170725 unzip finished


20170725

index finished
0:00:00.405245
20170726 unzip finished


20170726

index finished
0:00:00.453862
20170727 unzip finished


20170727

index finished
0:00:00.471909
20170728 unzip finished


20170728

index finished
0:00:00.416198
20170731 unzip finished


20170731

index finished
0:00:00.420439
20170801 unzip finished


20170801

index finished
0:00:00.515847
20170802 unzip finished


20170802

index finished
0:00:00.468680
20170803 unzip finished


20170803

index finished
0:00:00.463189
20170804 unzip finished


20170804

index finished
0:00:00.425533
20170807 unzip finished


20170807

index finished
0:00:00.455176
20170808 unzip finished


20170808

index finished
0:00:00.536976
20170809 unzip finished


20170809

index finished
0:00:00.641556
20170810 unzip finished


20170810

index finished
0:00:00.497253
20170811 unzip finished


20170811

index finished
0:00:00.484043
20170814 unzip finished


20170814

index finished
0:00:00.375625
20170815 unzip finished


20170815

index finished
0:00:00.459418
20170816 unzip finished


20170816

index finished
0:00:00.441862
20170817 unzip finished


20170817

index finished
0:00:00.379514
20170818 unzip finished


20170818

index finished
0:00:00.537985
20170821 unzip finished


20170821

index finished
0:00:00.450398
20170822 unzip finished


20170822

index finished
0:00:00.451661
20170823 unzip finished


20170823

index finished
0:00:00.475559
20170824 unzip finished


20170824

index finished
0:00:00.504323
20170825 unzip finished


20170825

index finished
0:00:00.459112
20170828 unzip finished


20170828

index finished
0:00:00.405091
20170829 unzip finished


20170829

index finished
0:00:00.564319
20170830 unzip finished


20170830

index finished
0:00:31.888292
20170831 unzip finished


20170831

index finished
0:00:37.163149
20170901 unzip finished


20170901

index finished
0:00:40.851953
20170904 unzip finished


20170904

index finished
0:00:58.779334
20170905 unzip finished


20170905

index finished
0:00:37.054955
20170906 unzip finished


20170906

index finished
0:00:47.607916
20170907 unzip finished


20170907

index finished
0:00:33.723518
20170908 unzip finished


20170908

index finished
0:00:38.936183
20170911 unzip finished


20170911

index finished
0:00:36.031543
20170912 unzip finished


20170912

index finished
0:00:41.154837
20170913 unzip finished


20170913

index finished
0:00:46.492666
20170914 unzip finished


20170914

index finished
0:00:46.248767
20170915 unzip finished


20170915

index finished
0:00:34.554084
20170918 unzip finished


20170918

index finished
0:01:01.560818
20170919 unzip finished


20170919

index finished
0:00:30.457590
20170920 unzip finished


20170920

index finished
0:00:37.055489
20170921 unzip finished


20170921

index finished
0:00:33.877431
20170922 unzip finished


20170922

index finished
0:00:52.362001
20170925 unzip finished


20170925

index finished
0:00:28.818662
20170926 unzip finished


20170926

index finished
0:01:12.434244
20170927 unzip finished


20170927

index finished
0:00:28.324583
20170928 unzip finished


20170928

index finished
0:00:28.164341
20170929 unzip finished


20170929

index finished
0:00:33.085710
20171009 unzip finished


20171009

index finished
0:00:30.876367
20171010 unzip finished


20171010

index finished
0:00:31.881835
20171011 unzip finished


20171011

index finished
0:00:37.098711
20171012 unzip finished


20171012

index finished
0:00:29.505501
20171013 unzip finished


20171013

index finished
0:00:57.452948
20171016 unzip finished


20171016

index finished
0:00:27.795811
20171017 unzip finished


20171017

index finished
0:00:28.330689
20171018 unzip finished


20171018

index finished
0:00:29.239611
20171019 unzip finished


20171019

index finished
0:00:27.654685
20171020 unzip finished


20171020

index finished
0:00:28.976331
20171023 unzip finished


20171023

index finished
0:00:27.807250
20171024 unzip finished


20171024

index finished
0:00:34.755683
20171025 unzip finished


20171025

index finished
0:00:41.603727
20171026 unzip finished


20171026

index finished
0:00:29.179518
20171027 unzip finished


20171027

index finished
0:00:31.977728
20171030 unzip finished


20171030

index finished
0:01:19.414590
20171031 unzip finished


20171031

index finished
0:00:30.159189
20171101 unzip finished


20171101

index finished
0:00:34.801995
20171102 unzip finished


20171102

index finished
0:00:31.262375
20171103 unzip finished


20171103

index finished
0:01:27.137535
20171106 unzip finished


20171106

index finished
0:00:31.548535
20171107 unzip finished


20171107

index finished
0:00:31.817929
20171108 unzip finished


20171108

index finished
0:00:29.460580
20171109 unzip finished


20171109

index finished
0:00:30.902363
20171110 unzip finished


20171110

index finished
0:00:32.033861
20171113 unzip finished


20171113

index finished
0:00:31.990146
20171114 unzip finished


20171114

index finished
0:00:29.776391
20171115 unzip finished


20171115

index finished
0:00:29.708390
20171116 unzip finished


20171116

index finished
0:00:48.058141
20171117 unzip finished


20171117

index finished
0:00:29.621061
20171120 unzip finished


20171120

index finished
0:01:05.777722
20171121 unzip finished


20171121

index finished
0:00:29.333412
20171122 unzip finished


20171122

index finished
0:00:32.404395
20171123 unzip finished


20171123

index finished
0:00:27.214970
20171124 unzip finished


20171124

index finished
0:00:26.713250
20171127 unzip finished


20171127

index finished
0:00:30.285878
20171128 unzip finished


20171128

index finished
0:01:04.211013
20171129 unzip finished


20171129

index finished
0:00:27.634515
20171130 unzip finished


20171130

index finished
0:00:42.541724
20171201 unzip finished


20171201

index finished
0:00:39.778337
20171204 unzip finished


20171204

index finished
0:00:29.967548
20171205 unzip finished


20171205

index finished
0:00:29.114175
20171206 unzip finished


20171206

index finished
0:01:06.841176
20171207 unzip finished


20171207

index finished
0:00:32.279224
20171208 unzip finished


20171208

index finished
0:00:40.416344
20171211 unzip finished


20171211

index finished
0:00:27.892642
20171212 unzip finished


20171212

index finished
0:00:26.388485
20171213 unzip finished


20171213

index finished
0:00:38.613131
20171214 unzip finished


20171214

index finished
0:00:29.939458
20171215 unzip finished


20171215

index finished
0:00:28.201285
20171218 unzip finished


20171218

index finished
0:00:27.803724
20171219 unzip finished


20171219

index finished
0:00:29.525416
20171220 unzip finished


20171220

index finished
0:01:43.341931
20171221 unzip finished


20171221

index finished
0:00:25.933804
20171222 unzip finished


20171222

index finished
0:00:29.540527
20171225 unzip finished


20171225

index finished
0:00:29.639859
20171226 unzip finished


20171226

index finished
0:00:29.218963
20171227 unzip finished


20171227

index finished
0:00:30.456967
20171228 unzip finished


20171228

index finished
0:00:27.841558
20171229 unzip finished


20171229

index finished
[]


In [46]:
import pymongo
import pandas as pd
import pickle
import datetime
import time
import gzip
import lzma
import pytz


def DB(host, db_name, user, passwd):
    """Build a MongoDB connection URI and return a ``DBObj`` bound to ``db_name``.

    Args:
        host: Host (optionally ``host:port``) placed verbatim in the URI.
        db_name: Database to operate on.  It is also used as the
            authentication database unless ``user`` is ``'admin'`` or ``'root'``,
            in which case ``'admin'`` is used.
        user: User name; percent-escaped before being put in the URI.
        passwd: Password; percent-escaped before being put in the URI.

    Returns:
        A ``DBObj`` connected with the generated URI.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote_plus

    auth_db = 'admin' if user in ('admin', 'root') else db_name
    # Reserved characters such as '@', ':', '/' or '%' inside the credentials
    # would otherwise be parsed as URI delimiters, so escape them first.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)


class DBObj(object):
    """Store and load pandas DataFrames in one MongoDB database.

    A DataFrame is split per (date, symbol) group and then into chunks of at
    most ``chunk_size`` rows.  Every chunk is pickled, compressed and inserted
    as one document with the keys ``ver``, ``data``, ``date``, ``symbol`` and
    ``start``.  ``date`` is stored as a string and ``symbol`` as an int.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db'):
        """Open a client for ``uri`` and select database ``db_name``.

        Args:
            uri: MongoDB connection URI handed to ``pymongo.MongoClient``.
            symbol_column: DataFrame column used as the ``symbol`` key.
            db_name: Name of the database to use.
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000          # max rows per stored document
        self.symbol_column = symbol_column
        self.date_column = 'date'

    def parse_uri(self, uri):
        """Split a ``mongodb://user:password@host`` URI into a list of parts.

        The scheme prefix and surrounding slashes are removed and the rest is
        split on ``:``, ``@`` and spaces, so extra delimiters in the URI
        produce extra items.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    def drop_table(self, table_name):
        """Drop the collection ``table_name``."""
        self.db.drop_collection(table_name)

    def rename_table(self, old_table, new_table):
        """Rename collection ``old_table`` to ``new_table``."""
        self.db[old_table].rename(new_table)

    def write(self, table_name, df):
        """Replace the stored data of every (date, symbol) group found in ``df``.

        For each group the existing documents with the same ``date`` and
        ``symbol`` are deleted before the new chunks are inserted.  An empty
        ``df`` is a no-op.

        Args:
            table_name: Target collection name.
            df: DataFrame containing the ``date`` column and the symbol column.

        Raises:
            Exception: If ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return

        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        multi_date = len(df[self.date_column].unique()) > 1

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        if multi_date:
            for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
                date = str(date)
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)
        else:
            date = str(df[self.date_column].iloc[0])
            # Group by the column name (not a one-element list): newer pandas
            # yields 1-tuples as keys for list groupers.  Convert the key to a
            # plain int so it matches the multi-date branch and build_query().
            for symbol, sub_df in df.groupby(self.symbol_column):
                symbol = int(symbol)
                collection.delete_many({'date': date, 'symbol': symbol})
                self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` into ``collection`` as chunks of ``chunk_size`` rows.

        Each document records the row offset of its chunk in ``start`` so that
        ``read`` can restore the original order.
        """
        version = 1  # serialisation format understood by ser()/deser()
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            # iloc makes the positional intent explicit regardless of the index type.
            df_seg = df.iloc[start:end]
            seg = {'ver': version, 'data': self.ser(df_seg, version), 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the MongoDB filter for a date range and/or symbol selection.

        Args:
            start_date: Inclusive lower bound; 8-character string, integer
                (Python or numpy) or ``datetime.date``/``datetime.datetime``.
            end_date: Inclusive upper bound, same accepted types.
            symbol: A single symbol or a list/tuple of symbols; each is
                converted with ``int()``.  A falsy value (``None``, ``0``,
                empty list) adds no symbol filter.

        Returns:
            dict: The filter; empty when no argument restricts the query.

        Raises:
            Exception: If a date has an unsupported type or a string date is
                not 8 characters long.
        """
        import numbers

        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            if isinstance(x, datetime.date):  # also covers datetime.datetime
                return x.strftime("%Y%m%d")
            if isinstance(x, numbers.Integral):  # Python int and numpy integers
                return parse_date(str(int(x)))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the documents matching the given date range / symbol.

        Prints a message and returns ``None`` without deleting anything when
        no filter is given.
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None

        collection.delete_many(query)

    def read(self, table_name, start_date=None, end_date=None, symbol=None):
        """Load the matching chunks and concatenate them into one DataFrame.

        Chunks are ordered by (symbol, date, start) before concatenation.

        Returns:
            The concatenated DataFrame, or ``None`` when nothing matches or
            when no filter is given (a message is printed in that case).
        """
        collection = self.db[table_name]

        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None

        segs = []
        for doc in collection.find(query):
            doc['data'] = self.deser(doc['data'], doc['ver'])
            segs.append(doc)
        segs.sort(key=lambda doc: (doc['symbol'], doc['date'], doc['start']))
        return pd.concat([doc['data'] for doc in segs], ignore_index=True) if segs else None

    def list_tables(self):
        """Return the names of the collections in the database."""
        # collection_names() was removed in PyMongo 4.  On old PyMongo an
        # unknown attribute resolves to a Collection object, and calling it
        # raises TypeError, which selects the legacy method.
        try:
            return self.db.list_collection_names()
        except TypeError:
            return self.db.collection_names()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values stored for the selection.

        Missing bounds default to ``'00000000'`` / ``'99999999'``.
        """
        collection = self.db[table_name]
        dates = set()
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        for x in collection.find(self.build_query(start_date, end_date, symbol), {"date": 1, '_id': 0}):
            dates.add(x['date'])
        return sorted(dates)

    def ser(self, s, version):
        """Pickle ``s`` and compress it (version 1: gzip, version 2: lzma).

        Raises:
            Exception: If ``version`` is neither 1 nor 2.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: decompress ``s`` and unpickle the result.

        Raises:
            Exception: If ``version`` is neither 1 nor 2.
        """
        # NOTE(review): pickle.loads executes arbitrary code from the payload;
        # this is only safe while every writer to the database is trusted.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        else:
            raise Exception('unknown version')


def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on pandas < 0.24.

    When the installed pandas is older than 0.24 and that module is not in
    ``sys.modules``, a module object exposing ``BlockManager`` is registered
    under that name.  Presumably this lets old pandas unpickle frames that
    reference the newer module path -- confirm against the writers.

    The version is compared numerically by (major, minor); a plain string
    comparison would misorder versions such as ``'0.9'`` and ``'0.24'``.
    If the version string cannot be parsed nothing is patched.
    """
    import re
    import sys
    from types import ModuleType

    parsed = re.match(r'(\d+)\.(\d+)', pd.__version__)
    if parsed is None:
        return
    if (int(parsed.group(1)), int(parsed.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()











import pandas as pd
import random
import numpy as np
import glob
import pickle
import os
import datetime
import time
# Show up to 200 columns when a DataFrame is rendered.  The fully qualified
# option name is used because the bare "max_columns" pattern is ambiguous on
# pandas >= 1.4 (it also matches "styler.render.max_columns") and raises
# OptionError there; "display.max_columns" works on old and new versions.
pd.set_option("display.max_columns", 200)


# Run configuration: the inclusive date window to process and the root
# directory whose entries are globbed for per-date data folders.
year = "2017"
startDate = '20170630'
endDate = '20170630'
readPath = '/mnt/ShareWithServer/data/***'
dataPathLs = np.array(glob.glob(readPath))
# The text before the first '_' of each entry name is compared, as a string,
# against startDate/endDate.
# NOTE(review): assumes entries are named "<YYYYMMDD>_..." -- confirm.
dateLs = np.array([os.path.basename(i).split('_')[0] for i in dataPathLs])
dataPathLs = dataPathLs[(dateLs >= startDate) & (dateLs <= endDate)]
date_list = pd.read_csv("/home/work516/KR_upload_code/trading_days.csv")
# Scratch lists; only `less` is referenced again in this cell (it is printed
# after the main loop).
wr_ong = []
mi_ss = []
less = []

# Per-date pipeline: load the snapshot CSVs of four index ids, normalise them
# into a one-row-per-second series per skey, trim the flat tails after the
# close and across the midday gap, and write the result to 'md_index'.
#
# Times are integers of the form HHMMSS * 1e6 (e.g. 150500000000 is 15:05:00).
# The many asserts are data sanity checks; several are wrapped in try/except
# and used as control flow to fall back to a different cut-off.
for data in dataPathLs:
    # readPath/dataPathLs/dateLs are rebound below; the outer loop is not
    # affected because it already iterates over the original array object.
    readPath = data + '/snapshot/***2/***'
    dataPathLs = np.array(glob.glob(readPath))
    # Here "dateLs" holds the numeric file stems (ids), not dates.
    dateLs = np.array([int(os.path.basename(i).split('.')[0]) for i in dataPathLs])
    dataPathLs = dataPathLs[(dateLs == 16) | (dateLs == 300) | (dateLs == 852) | (dateLs == 905)]
    SH = []
    ll = []
    for i in dataPathLs:
        try:
            df = pd.read_csv(i, usecols = [17,19,20,21,22,34,41,42])
        except Exception:
            # Best effort: report the unreadable file and carry on with the
            # others.  (Exception, not a bare except, so Ctrl-C still works.)
            print("empty data")
            print(i)
            ll.append(int(os.path.basename(i).split('.')[0]))
            continue
        df["StockID"] = int(os.path.basename(i).split('.')[0])
        SH += [df]
    if not SH:
        # Without this guard the code below fails with an opaque NameError
        # (`del df`) or "No objects to concatenate"; fail with a clear message.
        raise ValueError('no readable index snapshot files under %s' % data)
    del df
    SH = pd.concat(SH).reset_index(drop=True)

    SH["skey"] = SH["StockID"] + 1000000
    SH.drop(["StockID"],axis=1,inplace=True)
    # SendingTime is parsed below with '%Y%m%d%H%M%S%f'; integer-dividing by
    # 1e9 keeps the leading date part, and the remainder * 1000 is the time.
    SH["date"] = int(SH["SendingTime"].iloc[0]//1000000000)
    SH["time"] = (SH['SendingTime'] - int(SH['SendingTime'].iloc[0]//1000000000*1000000000)).astype(np.int64) * 1000
    # .timestamp() on a naive datetime uses the machine's local timezone.
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S%f').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['datetime'] = SH["clockAtArrival"].apply(lambda x: datetime.datetime.fromtimestamp(x/1e6))

    # Positional rename: the seven remaining CSV columns followed by the five
    # derived ones, in the order they were created above.
    SH.columns = ['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime']
    SH = SH.fillna(0)
    SH = SH.drop_duplicates(['cum_volume', 'open','high', 'prev_close', 'low', 'close', 'cum_amount', 'skey',
                  'date', 'time', 'clockAtArrival', 'datetime'])
    # Every timestamp must fall on a whole second.
    assert(sum(SH['time']%1000000) == 0)
    # NOTE(review): sum(...) is truthy as soon as ONE skey has its last
    # zero-volume row before its first traded row; if the intent is "for
    # every skey" this should be an all()-style check -- confirm.
    assert(sum(SH[SH['cum_volume'] == 0].groupby('skey')['time'].max()
               < SH[SH['cum_volume'] > 0].groupby('skey')['time'].min()))

    # End-of-day trim.  m_ax is the earliest, across skeys, of each skey's
    # last time at or before the cut-off.  The assert requires that from m_ax
    # onwards no row is unique (keep=False leaves nothing) and each skey has a
    # single close value; if so, rows after the cut-off and rows with zero
    # cum_volume are dropped.  On failure the cut-off moves 15:05 -> 15:07 ->
    # 15:08; a failure at 15:08 is not caught and aborts the run.
    m_ax = SH[SH['time'] <= 150500000000].groupby('skey').last()['time'].min()
    try:
        assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
               (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
        SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150500000000)]
    except Exception:
        try:
            m_ax = SH[SH['time'] <= 150700000000].groupby('skey').last()['time'].min()
            assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                                   'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
                   (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
            SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150700000000)]
        except Exception:
            m_ax = SH[SH['time'] <= 150800000000].groupby('skey').last()['time'].min()
            assert((SH[SH['time'] >= m_ax].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                                   'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0) & \
                   (sum(SH[SH['time'] >= m_ax].groupby('skey')['close'].nunique() != 1) == 0))
            SH = SH[(SH['cum_volume'] > 0) & (SH['time'] <= 150800000000)]

    # Build a dense one-second grid per skey between its first and last
    # remaining timestamp.
    k1 = SH.groupby('skey')['datetime'].min().reset_index()
    k1 = k1.rename(columns={'datetime':'min'})
    k2 = SH.groupby('skey')['datetime'].max().reset_index()
    k2 = k2.rename(columns={'datetime':'max'})
    k = pd.merge(k1, k2, on='skey')
    # timedelta.seconds is the sub-day part of the span.
    k['diff'] = (k['max']-k['min']).apply(lambda x: x.seconds)
    df = pd.DataFrame()
    for i in np.arange(k.shape[0]):
        df1 = pd.DataFrame()
        df1['datetime1'] = [k.loc[i, 'min'] + datetime.timedelta(seconds=int(x)) for x in np.arange(0, k.loc[i, 'diff'] + 1)]
        df1['skey'] = k.loc[i, 'skey']
        assert(df1['datetime1'].min() == k.loc[i, 'min'])
        assert(df1['datetime1'].max() == k.loc[i, 'max'])
        df = pd.concat([df, df1])

    # Outer-join the observations onto the grid, check that every observation
    # landed on a grid point, then forward-fill the gaps within each skey.
    SH = pd.merge(SH, df, left_on=['skey', 'datetime'], right_on=['skey', 'datetime1'], how='outer').sort_values(by=['skey', 'datetime1']).reset_index(drop=True)
    assert(SH[SH['datetime1'].isnull()].shape[0] == 0)
    for cols in ['date', 'cum_volume', 'cum_amount', 'prev_close', 'open', 'high', 'low', 'close']:
        SH[cols] = SH.groupby('skey')[cols].ffill()
    SH.drop(["datetime"],axis=1,inplace=True)
    SH = SH.rename(columns={'datetime1':'datetime'})
    SH['date'] = SH['date'].iloc[0]
    SH['date'] = SH['date'].astype('int32')
    SH['skey'] = SH['skey'].astype('int32')
    # Recompute time and clockAtArrival from the grid timestamp so the filled
    # rows get values too: HHMMSS first, scaled by 1e6 a few lines below.
    SH["time"] = SH['datetime'].astype(str).apply(lambda x: int(x.split(' ')[1].replace(':', ""))).astype(np.int64)
    SH['SendingTime'] = SH['date'] * 1000000 + SH['time']
    SH["clockAtArrival"] = SH["SendingTime"].astype(str).apply(lambda x: np.int64(datetime.datetime.strptime(x, '%Y%m%d%H%M%S').timestamp()*1e6))
    SH.drop(["SendingTime"],axis=1,inplace=True)
    SH['time'] = SH['time'] * 1000000

    # Each skey must carry exactly one non-zero open and one non-zero
    # prev_close.  Broadcast that value (the per-skey max) onto rows at or
    # after 09:15:00 for prev_close and onto rows with traded volume for open,
    # then re-check.
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    SH["prev_close"] = np.where(SH["time"] >= 91500000000, SH.groupby("skey")["prev_close"].transform("max"), SH["prev_close"])
    SH["open"] = np.where(SH["cum_volume"] > 0, SH.groupby("skey")["open"].transform("max"), SH["open"])
    assert(sum(SH[SH["open"] != 0].groupby("skey")["open"].nunique() != 1) == 0)
    assert(sum(SH[SH["prev_close"] != 0].groupby("skey")["prev_close"].nunique() != 1) == 0)
    assert(SH[SH["cum_volume"] > 0]["open"].min() > 0)

    for cols in ['open', 'high', 'prev_close', 'low', 'close']:
        SH[cols] = SH[cols].apply(lambda x: round(x, 4)).astype('float64')

    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "cum_volume", "cum_amount",
             "prev_close", "open", "high", "low", "close"]]

    # Midday trim.  m_in is the earliest, across skeys, of each skey's last
    # time <= 11:35:00; m_ax the latest of each skey's first time >= 12:55:00.
    # If nothing changes inside [m_in, m_ax], the rows strictly between
    # 11:35:00 and 12:55:00 are dropped.
    m_in = SH[SH['time'] <= 113500000000].groupby('skey').last()['time'].min()
    m_ax = SH[SH['time'] >= 125500000000].groupby('skey').first()['time'].max()
    try:
        assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
        SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= 125500000000]])
    except Exception:
        # The window is not flat.  Print the times at which each skey's values
        # change, then take tt = the latest, across skeys, of the last such
        # change.  If tt is before 12:15:00 it becomes the new window start;
        # otherwise it becomes the new (exclusive) window end.  The flatness
        # check is repeated without a safety net, so a second failure aborts.
        print(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].unique())
        tt = SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey')['time'].last().unique().max()
        if tt < 121500000000:
            m_in = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= m_in], SH[SH['time'] >= 125500000000]])
        else:
            m_ax = tt
            assert((SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close',
                                               'close', 'cum_amount', 'skey', 'date'], keep=False).shape[0] == 0)
          & (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['cum_volume'].nunique() != 1) == 0) &
           (sum(SH[(SH['time'] >= m_in) & (SH['time'] < m_ax)].groupby('skey')['close'].nunique() != 1) == 0))
            SH = pd.concat([SH[SH['time'] <= 113500000000], SH[SH['time'] >= m_ax]])

    # Number the rows 1..n within each skey, in (time, cum_volume) order.
    SH = SH.sort_values(by=['skey', 'time', 'cum_volume'])
    SH["ordering"] = SH.groupby("skey").cumcount()
    SH["ordering"] = SH["ordering"] + 1
    SH['ordering'] = SH['ordering'].astype('int32')
    SH['cum_volume'] = SH['cum_volume'].astype('int64')

    # Only these columns are stored; prev_close/high/low are dropped here.
    SH = SH[["skey", "date", "time", "clockAtArrival", "datetime", "ordering", "cum_volume", "cum_amount",
             "open", "close"]]

    # `display` is provided by IPython/Jupyter; this cell is notebook-only.
    display(SH["date"].iloc[0])
    print("index finished")

    # NOTE(review): credentials are hard-coded in the notebook; they should be
    # moved to configuration/environment and rotated.  A new client is also
    # created for every date processed.
    database_name = 'com_md_eq_cn'
    user = "zhenyuy"
    password = "bnONBrzSMGoE"

    db1 = DB("192.168.10.178", database_name, user, password)
    db1.write('md_index', SH)

    del SH

print(less)

In [48]:
# Inspection cell: restrict SH to rows with m_in <= time <= m_ax, keep the
# first row of every distinct combination of the listed value columns, and
# show the last remaining values for each skey.
SH[(SH['time'] >= m_in) & (SH['time'] <= m_ax)].drop_duplicates(['cum_volume', 'open', 'high', 'low', 'prev_close', 
                                               'close', 'cum_amount', 'skey', 'date'], keep='first').groupby('skey').last()

Unnamed: 0_level_0,date,time,clockAtArrival,datetime,cum_volume,cum_amount,prev_close,open,high,low,close
skey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000016,20170630,113624000000,1498793784000000,2017-06-30 11:36:24,10180068.0,13409490000.0,2552.9754,2543.7865,2548.1345,2533.8595,2542.5826
1000300,20170630,113624000000,1498793784000000,2017-06-30 11:36:24,45001697.0,57566430000.0,3668.8279,3654.7348,3660.0172,3646.2272,3658.4181
1000852,20170630,113624000000,1498793784000000,2017-06-30 11:36:24,34000656.0,38770800000.0,7456.6719,7435.7002,7443.7011,7416.4484,7440.0605
1000905,20170630,113624000000,1498793784000000,2017-06-30 11:36:24,28273982.0,35156480000.0,6116.9932,6101.3648,6110.7696,6090.2523,6110.0707
