In [9]:
import pymongo
import pandas as pd
import numpy as np
import pickle
import datetime
import time
import gzip
import lzma
import pytz

def DB(host, db_name, user, passwd):
    """Connect to MongoDB and return a handle to one database.

    Args:
        host: Host (optionally ``host:port``), placed verbatim in the URI.
        db_name: Name of the database to return. It is also used as the
            ``authSource`` unless ``user`` is 'admin' or 'root', in which
            case 'admin' is used.
        user: Raw (not percent-escaped) user name.
        passwd: Raw (not percent-escaped) password.

    Returns:
        ``client[db_name]`` for a newly created ``pymongo.MongoClient``.
    """
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Reserved characters such as ':', '/', '@' or '%' inside the credentials
    # would otherwise corrupt the URI, so both parts are percent-escaped.
    url = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(user), quote_plus(passwd), host, auth_db)
    client = pymongo.MongoClient(url, maxPoolSize=None)
    return client[db_name]

def build_query(start_date=None, end_date=None, index_id=None):
    """Build a MongoDB filter document on ``date`` and ``index_id``.

    Args:
        start_date: Inclusive lower bound for ``date``. Accepts an integer
            (returned as a plain ``int``), an 8-character ``YYYYMMDD`` string,
            or a ``datetime.date`` / ``datetime.datetime``. ``None`` means no
            lower bound.
        end_date: Inclusive upper bound for ``date``; same accepted types.
        index_id: A single id, or a list/tuple of ids; every id is converted
            with ``int()``. A falsy value (``None``, ``0``, empty list) adds
            no ``index_id`` constraint.

    Returns:
        dict: The filter; empty when no criterion was supplied.

    Raises:
        ValueError: A string date is not exactly 8 characters long.
        TypeError: A date has an unsupported type.
    """
    import numbers

    def parse_date(x):
        # bool is technically an Integral but was never an accepted date type.
        if isinstance(x, numbers.Integral) and not isinstance(x, bool):
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise ValueError("`date` must be YYYYMMDD format")
            return int(x)
        # datetime.datetime is a subclass of datetime.date.
        if isinstance(x, datetime.date):
            return int(x.strftime("%Y%m%d"))
        raise TypeError("invalid `date` type: " + str(type(x)))

    query = {}

    date_bounds = {}
    if start_date is not None:
        date_bounds['$gte'] = parse_date(start_date)
    if end_date is not None:
        date_bounds['$lte'] = parse_date(end_date)
    if date_bounds:
        query['date'] = date_bounds

    if index_id:
        if isinstance(index_id, (list, tuple)):
            query['index_id'] = {'$in': [int(x) for x in index_id]}
        else:
            query['index_id'] = int(index_id)

    return query

def build_filter_query(start_date=None, end_date=None, skey=None):
    """Build a MongoDB filter document on ``date`` and ``skey``.

    Args:
        start_date: Inclusive lower bound for ``date``. Accepts an integer
            (returned as a plain ``int``), an 8-character ``YYYYMMDD`` string,
            or a ``datetime.date`` / ``datetime.datetime``. ``None`` means no
            lower bound.
        end_date: Inclusive upper bound for ``date``; same accepted types.
        skey: A single key, or a list/tuple of keys; every key is converted
            with ``int()``. A falsy value (``None``, ``0``, empty list) adds
            no ``skey`` constraint.

    Returns:
        dict: The filter; empty when no criterion was supplied.

    Raises:
        ValueError: A string date is not exactly 8 characters long.
        TypeError: A date has an unsupported type.
    """
    import numbers

    def parse_date(x):
        # bool is technically an Integral but was never an accepted date type.
        if isinstance(x, numbers.Integral) and not isinstance(x, bool):
            return int(x)
        if isinstance(x, str):
            if len(x) != 8:
                raise ValueError("`date` must be YYYYMMDD format")
            return int(x)
        # datetime.datetime is a subclass of datetime.date.
        if isinstance(x, datetime.date):
            return int(x.strftime("%Y%m%d"))
        raise TypeError("invalid `date` type: " + str(type(x)))

    query = {}

    date_bounds = {}
    if start_date is not None:
        date_bounds['$gte'] = parse_date(start_date)
    if end_date is not None:
        date_bounds['$lte'] = parse_date(end_date)
    if date_bounds:
        query['date'] = date_bounds

    if skey:
        if isinstance(skey, (list, tuple)):
            query['skey'] = {'$in': [int(x) for x in skey]}
        else:
            query['skey'] = int(skey)

    return query

def write_data(db, name, df):
    """Insert the rows of ``df`` that are newer than what is already stored.

    For every distinct ``index_id`` in ``df``:
      * if the collection already has that ``index_id``, only rows whose
        ``date`` is greater than the stored maximum ``date`` are kept;
      * otherwise all rows for that ``index_id`` are kept.
    The kept rows are inserted with a single ``insert_many``. Nothing is
    inserted (and no error is raised) when no rows qualify.

    Args:
        db: Mapping-like database handle; ``db[name]`` must give the collection.
        name: Collection name.
        df: DataFrame with at least ``index_id`` and ``date`` columns.
    """
    collection = db[name]
    # Fetch the stored ids once rather than once per symbol.
    known_ids = set(collection.distinct('index_id'))
    fresh_parts = []
    for symbol in df['index_id'].unique():
        if symbol in known_ids:
            symbol = int(symbol)
            # Latest stored document for this id gives the cut-off date.
            latest = collection.find_one({'index_id': symbol}, sort=[('date', -1)])
            m_ax = latest['date']
            df2 = df[(df['index_id'] == symbol) & (df['date'] > m_ax)]
            print(df2)
        else:
            print(symbol)
            df2 = df[(df['index_id'] == symbol)]
            print(df2)
        fresh_parts.append(df2)

    # pd.concat rejects an empty list (df had no rows at all).
    if not fresh_parts:
        return
    records = pd.concat(fresh_parts).reset_index(drop=True).to_dict('records')
    # insert_many rejects an empty document list (nothing new to store).
    if records:
        collection.insert_many(records)
    
def write_filter_data(db, name, df):
    """Insert the rows of ``df`` that are newer than what is already stored.

    For every distinct ``skey`` in ``df``:
      * if the collection already has that ``skey``, only rows whose ``date``
        is greater than the stored maximum ``date`` are kept;
      * otherwise all rows for that ``skey`` are kept.
    The kept rows are inserted with a single ``insert_many``. Nothing is
    inserted (and no error is raised) when no rows qualify.

    Args:
        db: Mapping-like database handle; ``db[name]`` must give the collection.
        name: Collection name.
        df: DataFrame with at least ``skey`` and ``date`` columns.
    """
    collection = db[name]
    # Fetch the stored keys once rather than once per symbol.
    known_skeys = set(collection.distinct('skey'))
    fresh_parts = []
    for symbol in df['skey'].unique():
        if symbol in known_skeys:
            symbol = int(symbol)
            # Latest stored document for this key gives the cut-off date.
            latest = collection.find_one({'skey': symbol}, sort=[('date', -1)])
            m_ax = latest['date']
            df2 = df[(df['skey'] == symbol) & (df['date'] > m_ax)]
            print(df2)
        else:
            print(symbol)
            df2 = df[(df['skey'] == symbol)]
        fresh_parts.append(df2)

    # pd.concat rejects an empty list (df had no rows at all).
    if not fresh_parts:
        return
    records = pd.concat(fresh_parts).reset_index(drop=True).to_dict('records')
    # insert_many rejects an empty document list (nothing new to store).
    if records:
        collection.insert_many(records)

def delete_filter_data(db, name, start_date=None, end_date=None, skey=None):
    """Delete the documents of ``db[name]`` selected by date range and/or skey.

    The filter comes from ``build_filter_query``. When it is empty (no
    criterion given) nothing is deleted and a message is printed instead, so
    the whole collection can never be wiped through this function.

    Returns:
        None in every case.
    """
    target = db[name]
    criteria = build_filter_query(start_date, end_date, skey)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
    
def write_weight_data(db, name, df, index_id):
    """Set the ``weight`` field on existing documents of one index.

    Rows of ``df`` whose ``index_id`` equals the given one are grouped by
    ``(date, skey)``; for each group the first ``weight`` value is written to
    the document matching ``skey``, ``date`` and ``index_id`` via ``$set``.
    No upsert is requested, so groups without a matching document insert
    nothing.

    Args:
        db: Mapping-like database handle; ``db[name]`` must give the collection.
        name: Collection name.
        df: DataFrame with ``index_id``, ``date``, ``skey`` and ``weight`` columns.
        index_id: Index whose rows are written; used as-is in the filter.
    """
    collection = db[name]
    index_rows = df[df['index_id'] == index_id]
    for (date, skey), sub_df in index_rows.groupby(['date', 'skey']):
        weight = sub_df['weight'].values[0]
        print(weight)
        # Collection.update() is deprecated and absent from PyMongo 4;
        # update_one() is the single-document equivalent of its default mode.
        collection.update_one(
            {'skey': int(skey), 'date': int(date), 'index_id': index_id},
            {'$set': {'weight': float(weight)}})

def delete_data(db, name, start_date=None, end_date=None, index_id=None):
    """Delete the documents of ``db[name]`` selected by date range and/or index_id.

    The filter comes from ``build_query``. When it is empty (no criterion
    given) nothing is deleted and a message is printed instead, so the whole
    collection can never be wiped through this function.

    Returns:
        None in every case.
    """
    target = db[name]
    criteria = build_query(start_date, end_date, index_id)
    if criteria:
        target.delete_many(criteria)
    else:
        print('cannot delete the whole table')
 
def read_daily(db, name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, col=None, return_sdi=True):
    """Load documents from ``db[name]`` into a sorted DataFrame.

    Args:
        db: Mapping-like database handle; ``db[name]`` must give the collection.
        name: Collection name.
        start_date: Inclusive lower bound on ``date`` (used as given).
        end_date: Inclusive upper bound on ``date`` (used as given).
        skey: Value or iterable of values matched with ``$in`` on ``skey``.
        index_id: Value or iterable of values matched with ``$in`` on ``index_id``.
        interval: Value or iterable of values matched with ``$in`` on ``interval``.
        col: Field name or iterable of field names to project. ``None`` returns
            every field except ``_id``.
        return_sdi: When ``col`` is given, also project ``skey``, ``date`` and
            ``interval``.

    Returns:
        pandas.DataFrame sorted by whichever of ``date``, ``index_id``, ``skey``
        are present in the result; an empty DataFrame when nothing matches.
    """
    def as_list(value):
        # `$in` needs an array, so a lone scalar (or string) is wrapped.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            return [value]
        return list(value)

    collection = db[name]

    # Build projection
    prj = {'_id': 0}
    if col is not None:
        wanted = as_list(col)
        if return_sdi:
            wanted = ['skey', 'date', 'interval'] + wanted
        for col_name in wanted:
            prj[col_name] = 1

    # Build query
    query = {}
    for field, values in (('skey', skey), ('index_id', index_id), ('interval', interval)):
        if values is not None:
            query[field] = {'$in': as_list(values)}
    date_bounds = {}
    if start_date is not None:
        date_bounds['$gte'] = start_date
    if end_date is not None:
        date_bounds['$lte'] = end_date
    if date_bounds:
        query['date'] = date_bounds

    # Load data
    cur = collection.find(query, prj)
    df = pd.DataFrame.from_records(cur)
    if df.empty:
        return pd.DataFrame()
    # A projection (or the collection itself) may lack some of the sort keys;
    # sorting on an absent column would raise KeyError.
    sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
    if sort_keys:
        df = df.sort_values(by=sort_keys)
    return df




import os
from getpass import getpass

database_name = 'com_md_eq_cn'
# Credentials are taken from the environment (MONGO_USER / MONGO_PASSWORD),
# falling back to an interactive prompt for the password, so that no secret is
# stored in the notebook or its saved output.
user = os.environ.get('MONGO_USER', 'zhenyuy')
password = os.environ.get('MONGO_PASSWORD') or getpass('MongoDB password for %s: ' % user)

# Use the full option key: the bare 'max_columns' is a pattern lookup that can
# match more than one option in newer pandas releases.
pd.set_option('display.max_columns', 200)
db1 = DB(os.environ.get('MONGO_HOST', "192.168.10.178"), database_name, user, password)

In [27]:
import os
import glob
import datetime
import numpy as np
import pandas as pd

def prepare_size_filter(filter_data):
    """Turn one raw per-stock filter frame into rows ready for insertion.

    Steps, in order:
      1. If a row dated 20200924 exists, add a 20200925 row for the frame's
         first ``skey`` that carries the 20200924 ``amountFilter`` value.
      2. Replace missing ``amountFilter`` values with 0.
      3. Rename ``amountFilter`` to ``size_filter``.
      4. Keep rows dated 20180101 or later, sorted by ``date`` with a fresh index.

    Args:
        filter_data: DataFrame with ``skey``, ``date`` and ``amountFilter`` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    if 20200924 in filter_data['date'].values:
        carried = pd.DataFrame(
            [[filter_data['skey'].iloc[0], 20200925,
              filter_data.loc[filter_data['date'] == 20200924, 'amountFilter'].values[0]]],
            columns=['skey', 'date', 'amountFilter'])
        # DataFrame.append no longer exists in pandas 2.x; concat replaces it.
        filter_data = pd.concat([filter_data, carried])
    filter_data = filter_data.assign(amountFilter=filter_data['amountFilter'].fillna(0))
    filter_data = filter_data.rename(columns={'amountFilter': "size_filter"})
    return filter_data[(filter_data['date'] >= 20180101)].sort_values(by='date').reset_index(drop=True)


readPath = '/mnt/e/filter/***'
dataPathLs = np.array(glob.glob(readPath))
# NOTE(review): the first two sorted paths are skipped — presumably they were
# already loaded in an earlier run; confirm before re-running.
for i in np.sort(dataPathLs)[2:]:
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    filter_data = prepare_size_filter(pd.read_pickle(i))
    if filter_data.empty:
        continue
    write_filter_data(db1, 'md_stock_sizefilter', filter_data)

1600006
1600007
1600008
1600009
1600010
1600011
1600012
1600015
1600016
1600017
1600018
1600019
1600020
1600021
1600022
1600023
1600025
1600026
1600027
1600028
1600029
1600030
1600031
1600033
1600035
1600036
1600037
1600038
1600039
1600048
1600050
1600051
1600052
1600053
1600054
1600055
1600056
1600057
1600058
1600059
1600060
1600061
1600062
1600063
1600064
1600066
1600067
1600068
1600069
1600070
1600071
1600072
1600073
1600074
1600075
1600076
1600077
1600078
1600079
1600080
1600081
1600082
1600083
1600084
1600085
1600086
1600088
1600089
1600090
1600091
1600093
1600094
1600095
1600096
1600097
1600098
1600099
1600100
1600101
1600103
1600104
1600105
1600106
1600107
1600108
1600109
1600110
1600111
1600112
1600113
1600114
1600115
1600116
1600117
1600118
1600119
1600120
1600121
1600122
1600123
1600125
1600126
1600127
1600128
1600129
1600130
1600131
1600132
1600133
1600135
1600136
1600137
1600138
1600139
1600141
1600143
1600145
1600146
1600148
1600149
1600150
1600151
1600152
1600153
1600155


1603007
1603008
1603009
1603010
1603011
1603012
1603013
1603015
1603016
1603017
1603018
1603019
1603020
1603021
1603022
1603023
1603025
1603026
1603027
1603028
1603029
1603030
1603031
1603032
1603033
1603035
1603036
1603037
1603038
1603039
1603040
1603041
1603042
1603043
1603045
1603050
1603053
1603055
1603056
1603058
1603059
1603060
1603063
1603066
1603067
1603068
1603069
1603076
1603077
1603078
1603079
1603080
1603081
1603083
1603085
1603086
1603087
1603088
1603089
1603090
1603093
1603095
1603096
1603098
1603099
1603100
1603101
1603103
1603105
1603106
1603108
1603109
1603110
1603111
1603112
1603113
1603115
1603116
1603117
1603118
1603121
1603123
1603126
1603127
1603128
1603129
1603131
1603133
1603136
1603138
1603139
1603155
1603156
1603157
1603158
1603159
1603160
1603161
1603165
1603166
1603167
1603168
1603169
1603177
1603178
1603179
1603180
1603181
1603183
1603185
1603186
1603187
1603188
1603189
1603192
1603195
1603196
1603197
1603198
1603199
1603200
1603203
1603208
1603212
1603214


2000790
2000791
2000792
2000793
2000795
2000796
2000797
2000798
2000799
2000800
2000801
2000802
2000803
2000806
2000807
2000809
2000810
2000811
2000812
2000813
2000815
2000816
2000818
2000819
2000820
2000821
2000822
2000823
2000825
2000826
2000828
2000829
2000830
2000831
2000833
2000835
2000836
2000837
2000838
2000839
2000848
2000850
2000851
2000852
2000856
2000858
2000859
2000860
2000861
2000862
2000863
2000868
2000869
2000875
2000876
2000877
2000878
2000880
2000881
2000882
2000883
2000885
2000886
2000887
2000888
2000889
2000890
2000892
2000893
2000895
2000897
2000898
2000899
2000900
2000901
2000902
2000903
2000905
2000906
2000908
2000909
2000910
2000911
2000912
2000913
2000915
2000917
2000918
2000919
2000920
2000921
2000922
2000923
2000925
2000926
2000927
2000928
2000929
2000930
2000931
2000932
2000933
2000935
2000936
2000937
2000938
2000939
2000948
2000949
2000950
2000951
2000952
2000953
2000955
2000957
2000958
2000959
2000960
2000961
2000962
2000963
2000965
2000966
2000967
2000968


2002886
2002887
2002888
2002889
2002890
2002891
2002892
2002893
2002895
2002896
2002897
2002898
2002899
2002900
2002901
2002902
2002903
2002905
2002906
2002907
2002908
2002909
2002910
2002911
2002912
2002913
2002915
2002916
2002917
2002918
2002919
2002920
2002921
2002922
2002923
2002925
2002926
2002927
2002928
2002929
2002930
2002931
2002932
2002933
2002935
2002936
2002937
2002938
2002939
2002940
2002941
2002942
2002943
2002945
2002946
2002947
2002948
2002949
2002950
2002951
2002952
2002953
2002955
2002956
2002957
2002958
2002959
2002960
2002961
2002962
2002963
2002965
2002966
2002967
2002968
2002969
2002970
2002971
2002972
2002973
2002975
2002976
2002977
2002978
2002979
2002980
2002981
2002982
2002983
2002984
2002985
2002986
2002987
2002988
2002989
2002990
2002991
2002992
2002993
2002995
2002996
2002997
2002999
2003000
2003002
2003003
2003005
2003006
2003007
2003008
2003816
2300001
2300002
2300003
2300004
2300005
2300006
2300007
2300008
2300009
2300010
2300011
2300012
2300013
2300014


In [33]:
# Delete every 'md_stock_sizefilter' document whose skey is 2002885 (the filter
# is built by build_filter_query and passed to delete_many). Destructive.
delete_filter_data(db1, 'md_stock_sizefilter', skey=2002885)

In [34]:
import os
import glob
import datetime
import numpy as np
import pandas as pd


# Reload the size-filter rows of a single stock (skey 2002885) from its pickle.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filter_data = pd.read_pickle('/mnt/e/filter/2002885.pkl')
if 20200924 in filter_data['date'].values:
    # Add a 20200925 row that carries the 20200924 amountFilter value.
    carried = pd.DataFrame(
        [[filter_data['skey'].iloc[0], 20200925,
          filter_data.loc[filter_data['date'] == 20200924, 'amountFilter'].values[0]]],
        columns=['skey', 'date', 'amountFilter'])
    # DataFrame.append no longer exists in pandas 2.x; concat replaces it.
    filter_data = pd.concat([filter_data, carried])
filter_data['amountFilter'] = filter_data['amountFilter'].fillna(0)
filter_data = filter_data.rename(columns={'amountFilter': "size_filter"})
filter_data = filter_data[(filter_data['date'] >= 20180101)].sort_values(by='date').reset_index(drop=True)
if filter_data.empty:
    print('wrong')
else:
    write_filter_data(db1, 'md_stock_sizefilter', filter_data)

2002885


In [None]:
# Delete every 'md_stock_sizefilter' document whose skey is 2000789.
# Destructive; this cell has no execution count in the saved notebook.
delete_filter_data(db1, 'md_stock_sizefilter', skey=2000789)

In [None]:
# Delete every 'md_stock_sizefilter' document whose skey is 2002885.
# Destructive; this cell has no execution count in the saved notebook.
delete_filter_data(db1, 'md_stock_sizefilter', skey=2002885)