In [72]:
import pymongo 
import io 
import pandas as pd 
import pickle 
import datetime 
import time 
import gzip 
import lzma 
import pytz 
import pyarrow as pa 
import pyarrow.parquet as pq 
import numpy as np 
import re

def DB(host, db_name, user, passwd):
    """Build a ``DBObj`` connected to ``db_name`` on ``host``.

    Args:
        host: MongoDB host (optionally ``host:port``), inserted into the URI as-is.
        db_name: Database to open; also used as the authentication database
            unless ``user`` is ``'admin'`` or ``'root'``, in which case
            ``'admin'`` is used.
        user: User name.
        passwd: Password.

    Returns:
        A ``DBObj`` created from the assembled ``mongodb://`` URI.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    auth_db = db_name if user not in ('admin', 'root') else 'admin'
    # Reserved characters (':', '/', '@', '%', ...) in the credentials must be
    # percent-escaped, otherwise the URI is split at the wrong place.
    uri = 'mongodb://%s:%s@%s/?authSource=%s' % (
        quote_plus(str(user)), quote_plus(str(passwd)), host, auth_db)
    return DBObj(uri, db_name=db_name)

class DBObj(object):
    """Thin wrapper around one MongoDB database that stores pandas data.

    Two document layouts are handled:

    * Chunked tables (``write`` / ``read_tick`` / ``delete`` / ``list_dates``):
      each document holds a serialized DataFrame chunk in ``data`` together
      with ``ver`` (serialization version), ``date`` (string), ``symbol``
      (int) and ``start`` (row offset of the chunk inside its group).
    * Row tables (``read_daily``): the documents returned by the query are
      turned directly into DataFrame rows.
    """

    def __init__(self, uri, symbol_column='skey', db_name='white_db', version=3):
        """Connect to ``uri`` and select ``db_name``.

        Args:
            uri: MongoDB connection URI.
            symbol_column: DataFrame column that ``write`` groups by as the symbol.
            db_name: Name of the database to use.
            version: Serialization version used by ``write`` (see ``ser``).
        """
        self.db_name = db_name
        self.uri = uri
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.chunk_size = 20000  # max rows per stored chunk document
        self.symbol_column = symbol_column
        self.date_column = 'date'
        self.version = version

    def parse_uri(self, uri):
        """Split a URI such as ``mongodb://user:password@example.com`` into tokens.

        The scheme and surrounding slashes are removed and the remainder is
        split on ``:`` and ``@``.
        """
        # mongodb://user:password@example.com
        return uri.strip().replace('mongodb://', '').strip('/').replace(':', ' ').replace('@', ' ').split(' ')

    @staticmethod
    def _in_values(values):
        """Return ``values`` as a plain list suitable for a ``$in`` filter.

        Lists are passed through unchanged; a scalar is wrapped in a
        one-element list; objects exposing ``tolist`` (numpy / pandas) are
        converted to built-in Python values first.
        """
        if hasattr(values, 'tolist'):
            values = values.tolist()
        if isinstance(values, (list, tuple, set, frozenset)):
            return list(values)
        return [values]

    def build_query(self, start_date=None, end_date=None, symbol=None):
        """Build the filter used on chunked tables.

        Args:
            start_date, end_date: Inclusive bounds. Accepted as an 8-character
                ``YYYYMMDD`` string, an integer (including numpy integers) or
                a ``datetime.date`` / ``datetime.datetime`` (including
                subclasses such as ``pandas.Timestamp``).
            symbol: A single symbol, or a list/tuple/set of symbols; every
                value is converted with ``int``. Falsy values add no filter.

        Returns:
            A dict with an optional ``date`` range and an optional ``symbol``
            condition; empty when no argument was given.

        Raises:
            Exception: If a date has an unsupported type or is not 8 characters long.
        """
        query = {}

        def parse_date(x):
            if isinstance(x, str):
                if len(x) != 8:
                    raise Exception("`date` must be YYYYMMDD format")
                return x
            # datetime.datetime is a subclass of datetime.date, so one check
            # covers both (and pandas.Timestamp).
            if isinstance(x, datetime.date):
                return x.strftime("%Y%m%d")
            if isinstance(x, (int, np.integer)):
                return parse_date(str(x))
            raise Exception("invalid `date` type: " + str(type(x)))

        if start_date is not None or end_date is not None:
            query['date'] = {}
            if start_date is not None:
                query['date']['$gte'] = parse_date(start_date)
            if end_date is not None:
                query['date']['$lte'] = parse_date(end_date)

        if symbol:
            if isinstance(symbol, (list, tuple, set, frozenset)):
                query['symbol'] = {'$in': [int(x) for x in symbol]}
            else:
                query['symbol'] = int(symbol)

        return query

    def read_tick(self, table_name, start_date=None, end_date=None, symbol=None):
        """Read and reassemble chunked data from ``table_name``.

        Returns:
            The concatenation of all matching chunks ordered by
            (symbol, date, start), or ``None`` when nothing matches or when no
            filter was given (reading a whole table is refused).
        """
        collection = self.db[table_name]
        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot read the whole table')
            return None
        segs = []
        for seg in collection.find(query):
            seg['data'] = self.deser(seg['data'], seg['ver'])
            segs.append(seg)
        if not segs:
            return None
        segs.sort(key=lambda seg: (seg['symbol'], seg['date'], seg['start']))
        return pd.concat([seg['data'] for seg in segs], ignore_index=True)

    def read_daily(self, table_name, start_date=None, end_date=None, skey=None, index_id=None, interval=None, index_name=None, col=None, return_sdi=True):
        """Read a row table into a DataFrame.

        Args:
            table_name: Collection to query.
            start_date, end_date: Inclusive bounds on ``date``; passed to the
                query unchanged (no string conversion is done here).
            skey, index_id, interval: Values for ``$in`` filters on the field
                of the same name; a scalar is treated as a one-element list.
            index_name: Iterable of names combined into one ``$regex``
                alternation. For a name containing CJK characters, the first
                CJK run is taken and its characters are joined with ``|``.
            col: Columns to return; ``None`` returns every field except ``_id``.
            return_sdi: When ``col`` is given, also return ``skey``, ``date``
                and ``index_id``.

        Returns:
            A DataFrame sorted by whichever of ``date``, ``index_id``,
            ``skey`` are present (in that order), or an empty DataFrame.
        """
        collection = self.db[table_name]

        # Build projection
        prj = {'_id': 0}
        if col is not None:
            wanted = list(col)
            if return_sdi:
                wanted = ['skey', 'date', 'index_id'] + wanted
            for col_name in wanted:
                prj[col_name] = 1

        # Build query
        query = {}
        if skey is not None:
            query['skey'] = {'$in': self._in_values(skey)}
        if interval is not None:
            query['interval'] = {'$in': self._in_values(interval)}
        if index_id is not None:
            query['index_id'] = {'$in': self._in_values(index_id)}
        if index_name is not None:
            cjk_run = re.compile('[\u4e00-\u9fff]+')
            n = ''
            for name in index_name:
                try:
                    token = "|".join(cjk_run.findall(name)[0])
                except (IndexError, TypeError):
                    # No CJK run (or not a string): use the name unchanged.
                    token = name
                n = token if len(n) == 0 else n + '|' + token
            query['index_name'] = {'$regex': n}
        if start_date is not None or end_date is not None:
            date_range = {}
            if start_date is not None:
                date_range['$gte'] = start_date
            if end_date is not None:
                date_range['$lte'] = end_date
            query['date'] = date_range

        # Load data
        cur = collection.find(query, prj)
        df = pd.DataFrame.from_records(cur)
        if df.empty:
            return pd.DataFrame()
        # Sort only by keys that survived the projection, so a projection
        # without date/skey no longer raises KeyError.
        sort_keys = [key for key in ('date', 'index_id', 'skey') if key in df.columns]
        if sort_keys:
            df = df.sort_values(by=sort_keys)
        return df.reset_index(drop=True)

    def write(self, table_name, df):
        """Replace the stored chunks for every (date, symbol) group in ``df``.

        Existing documents for a group are deleted before its chunks are
        inserted. Nothing happens for an empty ``df``.

        Raises:
            Exception: If ``df`` has no ``date`` column.
        """
        if len(df) == 0:
            return
        if self.date_column not in df.columns:
            raise Exception('DataFrame should contain date column')

        collection = self.db[table_name]
        collection.create_index([('date', pymongo.ASCENDING), ('symbol', pymongo.ASCENDING)], background=True)
        collection.create_index([('symbol', pymongo.ASCENDING), ('date', pymongo.ASCENDING)], background=True)

        # One code path for single- and multi-date frames: group keys are
        # always (date, symbol) and are converted to plain str / int before
        # they reach MongoDB.
        for (date, symbol), sub_df in df.groupby([self.date_column, self.symbol_column]):
            date = str(date)
            symbol = int(symbol)
            collection.delete_many({'date': date, 'symbol': symbol})
            self.write_single(collection, date, symbol, sub_df)

    def write_single(self, collection, date, symbol, df):
        """Insert ``df`` as chunk documents of at most ``chunk_size`` rows."""
        for start in range(0, len(df), self.chunk_size):
            end = min(start + self.chunk_size, len(df))
            df_seg = df.iloc[start:end]
            version = self.version
            ser_data = self.ser(df_seg, version)
            seg = {'ver': version, 'data': ser_data, 'date': date, 'symbol': symbol, 'start': start}
            collection.insert_one(seg)

    def delete(self, table_name, start_date=None, end_date=None, symbol=None):
        """Delete the chunk documents matching the filter.

        Deleting with no filter at all is refused (a message is printed).
        """
        collection = self.db[table_name]
        query = self.build_query(start_date, end_date, symbol)
        if not query:
            print('cannot delete the whole table')
            return None
        collection.delete_many(query)

    def list_tables(self):
        """Return the names of the collections in the database."""
        lister = getattr(self.db, 'list_collection_names', None)
        if lister is None:
            # Older PyMongo releases only provide collection_names().
            lister = self.db.collection_names
        return lister()

    def list_dates(self, table_name, start_date=None, end_date=None, symbol=None):
        """Return the sorted distinct ``date`` values of matching documents."""
        collection = self.db[table_name]
        if start_date is None:
            start_date = '00000000'
        if end_date is None:
            end_date = '99999999'
        query = self.build_query(start_date, end_date, symbol)
        dates = {doc['date'] for doc in collection.find(query, {"date": 1, '_id': 0})}
        return sorted(dates)

    def ser(self, s, version):
        """Serialize DataFrame ``s`` to bytes.

        Versions: 1 = gzip-compressed pickle, 2 = lzma-compressed pickle,
        3 = ZSTD-compressed parquet. The input frame is not modified.

        Raises:
            Exception: For an unknown ``version``.
        """
        pickle_protocol = 4
        if version == 1:
            return gzip.compress(pickle.dumps(s, protocol=pickle_protocol), compresslevel=2)
        elif version == 2:
            return lzma.compress(pickle.dumps(s, protocol=pickle_protocol), preset=1)
        elif version == 3:
            # 32-bit number needs more space than 64-bit for parquet
            widen = {}
            for col_name in s.columns:
                dtype = s[col_name].dtype
                if dtype == np.int32:
                    widen[col_name] = np.int64
                elif dtype == np.uint32:
                    widen[col_name] = np.uint64
            if widen:
                # astype returns a new frame, so the caller's data is untouched.
                s = s.astype(widen)
            tbl = pa.Table.from_pandas(s)
            f = io.BytesIO()
            pq.write_table(tbl, f, use_dictionary=False, compression='ZSTD', compression_level=0)
            return f.getvalue()
        else:
            raise Exception('unknown version')

    def deser(self, s, version):
        """Inverse of ``ser``: turn stored bytes back into a DataFrame.

        Raises:
            Exception: For an unknown ``version``.
        """
        # NOTE(security): versions 1 and 2 unpickle the stored bytes;
        # unpickling can execute arbitrary code, so only read databases you trust.
        if version == 1:
            return pickle.loads(gzip.decompress(s))
        elif version == 2:
            return pickle.loads(lzma.decompress(s))
        elif version == 3:
            return pq.read_table(io.BytesIO(s), use_threads=False).to_pandas()
        else:
            raise Exception('unknown version')

def patch_pandas_pickle():
    """Register a stub ``pandas.core.internals.managers`` module on pandas < 0.24.

    When the installed pandas is older than 0.24 and the module is not already
    loaded, a synthetic module exposing ``BlockManager`` is placed in
    ``sys.modules`` under that name. Presumably this lets pickles written by a
    newer pandas be loaded by ``DBObj.deser`` — TODO confirm.

    On pandas 0.24 or newer (or if the version string cannot be parsed) the
    function does nothing.
    """
    import re as _re  # local alias: notebook cells may rebind the global name `re`
    import sys
    from types import ModuleType

    # Compare versions numerically; a plain string comparison orders
    # '0.9' after '0.24'.
    match = _re.match(r'(\d+)\.(\d+)', pd.__version__)
    if match is None or (int(match.group(1)), int(match.group(2))) >= (0, 24):
        return

    from pandas.core.internals import BlockManager
    pkg_name = 'pandas.core.internals.managers'
    if pkg_name not in sys.modules:
        m = ModuleType(pkg_name)
        m.BlockManager = BlockManager
        sys.modules[pkg_name] = m
patch_pandas_pickle()


In [157]:
import getpass
import os
import sys

database_name = 'com_md_eq_cn'
# Credentials are not stored in the notebook: they come from the environment,
# with an interactive prompt as fallback for the password.
user = os.environ.get('MONGO_USER', 'zhenyuy')
password = os.environ.get('MONGO_PASSWORD') or getpass.getpass('MongoDB password for %s: ' % user)

# Use the fully qualified option name; the short form 'max_columns' can match
# more than one option on newer pandas.
pd.set_option('display.max_columns', 200)
db1 = DB("192.168.10.178", database_name, user, password)

beta = db1.read_daily('mktbeta', 20201113, 20201228)

# Derive the exchange-prefixed symbol from skey:
#   1. strip the leading offset (2000000 when skey > 2000000, else 1000000),
#   2. left-pad the remainder with zeros to 6 characters,
#   3. prefix 'SZ' when the code sorts below '600000', otherwise 'SH'.
code = pd.Series(
    np.where(beta['skey'] > 2000000,
             (beta['skey'] - 2000000).astype(str),
             (beta['skey'] - 1000000).astype(str)),
    index=beta.index,
).str.rjust(6, '0')
beta['Symbol'] = np.where(code < '600000', 'SZ' + code, 'SH' + code)

In [159]:
# Unique StockIDs listed in the 202006 IF_in file.
IF_in = pd.read_csv(r'D:\work\project 4 event study - minute\IF_in_202006.csv')['StockID'].unique()

# Reload beta for 20200602-20200605 and rebuild the exchange-prefixed Symbol column.
beta = db1.read_daily('mktbeta', 20200602, 20200605)
skey_values = beta['skey']
# Strip the leading offset (2000000 or 1000000) and keep the remainder as text.
beta['Symbol'] = np.where(
    skey_values > 2000000,
    (skey_values - 2000000).astype(str),
    (skey_values - 1000000).astype(str),
)
# Left-pad to 6 characters with zeros.
beta['Symbol'] = beta['Symbol'].str.rjust(6, '0')
# Codes sorting below '600000' get 'SZ', all others 'SH'.
is_below_600000 = beta['Symbol'] < '600000'
beta['Symbol'] = np.where(is_below_600000, 'SZ' + beta['Symbol'], 'SH' + beta['Symbol'])

In [176]:
# Rows for SH600745 from 20200602 onwards.
is_sh600745 = beta['Symbol'] == 'SH600745'
from_20200602 = beta['date'] >= 20200602
beta[is_sh600745 & from_20200602]

Unnamed: 0,skey,name,date,time,beta_10d_IF,beta_60d_IF,beta_10d_IC,beta_60d_IC,beta_10d_CSI1000,beta_60d_CSI1000,alpha_10d_IF,alpha_60d_IF,alpha_10d_IC,alpha_60d_IC,alpha_10d_CSI1000,alpha_60d_CSI1000,Symbol
623,1600745,闻泰科技,20200602,213000000000,1.91067,2.0,1.907867,2.0,1.745551,2.0,-0.008914,-0.006603,-0.008258,-0.006331,-0.00962,-0.006235,SH600745
4723,1600745,闻泰科技,20200603,213000000000,1.829412,2.0,1.854297,2.0,1.655357,2.0,-0.003114,-0.004827,-0.0037,-0.004947,-0.004588,-0.004816,SH600745
8823,1600745,闻泰科技,20200604,213000000000,1.63074,2.0,1.582041,2.0,1.384675,2.0,0.002581,-0.005536,0.001044,-0.005071,0.0003,-0.004811,SH600745
12923,1600745,闻泰科技,20200605,213000000000,2.0,2.0,1.766283,2.0,1.517776,2.0,-0.001457,-0.005261,0.000712,-0.00467,0.000412,-0.004008,SH600745


In [173]:
# 60-day IF beta / alpha stored for SH601100 on 20200602.
sh601100_on_20200602 = (beta['Symbol'] == 'SH601100') & (beta['date'] == 20200602)
beta.loc[sh601100_on_20200602, ['skey', 'date', 'beta_60d_IF', 'alpha_60d_IF']]

Unnamed: 0,skey,date,beta_60d_IF,alpha_60d_IF
844,1601100,20200602,0.884635,0.003754


In [174]:
# Hand check: simple stock return minus beta times simple index return.
# 3983.5677 / 3971.3402 are the close / previous close shown for skey 1000300
# on 20200602 in the mdbar1d_tr output of this notebook; 0.884635 is the
# beta_60d_IF shown for SH601100 on that date.
# 69.7 / 73.2 are presumably SH601100's close and previous close (their ratio
# reproduces the s_returns value shown for that symbol) — TODO confirm.
(69.7/73.2 - 1) - (3983.5677 / 3971.3402 - 1) * 0.884635

-0.05053794168406369

In [171]:
# Simple return from 3971.3402 to 3983.5677 (the previous close and close
# shown for skey 1000300 on 20200602 in the mdbar1d_tr output).
3983.5677 / 3971.3402 - 1

0.0030789354183256012

In [170]:
# Log return between the same two values, for comparison with the simple return.
np.log(3983.5677) - np.log(3971.3402)

0.003074205203533964

In [166]:
# Simple return from 73.2 to 69.7; the result matches the s_returns value
# shown for SH601100 on 20200602 elsewhere in this notebook.
(69.7/73.2 - 1)

-0.04781420765027322

In [167]:
# Daily bars for 20200602; display the row whose skey is 1000300.
data = db1.read_daily('mdbar1d_tr', 20200602, 20200602)
data.loc[data['skey'] == 1000300]

Unnamed: 0,skey,date,time,name,trade_status,listed_days,open,high,low,close,closeL1,yclose,ztClose,dtClose,dayReturn,volume,amount,buy_volume,sell_volume,TORate,allZT,hasZT,isZT,allDT,hasDT,isDT,isST,VWAP,TWAP,marketValue,marketShares,totalShares,SW1_name,SW1_code,SW2_code,SW3_code,index_name,index_weight,exchange
1,1000300,20200602,180000000000,沪深300,1,4466,3969.9521,3991.7761,3958.9774,3983.5677,3971.3402,3971.3402,4368.47,3574.21,0.003079,14430306000,195563300000.0,94884268.0,49418792.0,0.0,0,0,0,0,0,0,0,1355.226315,3976.024841,0.0,0.0,0.0,0,0,0,0,,0.0,SSE


In [156]:
# Rows for the IF_in symbols on 20200602. This cell expects a `beta` frame
# with `Date` / `Symbol` columns. The former extra condition
# `Date <= 20200605` is implied by the equality test and has been dropped.
in_if_in = beta['Symbol'].isin(IF_in)
on_20200602 = beta['Date'] == 20200602
beta[in_if_in & on_20200602]

Unnamed: 0.1,Unnamed: 0,Date,Symbol,s_returns,m_returns,beta,alpha
1501556,1501556,20200602,SH600745,0.001456,0.003079,2.071309,-0.004921
1973212,1973212,20200602,SH601077,0.012245,0.003079,0.633332,0.010295
1983303,1983303,20200602,SH601100,-0.047814,0.003079,0.889827,-0.050554
2056210,2056210,20200602,SH601231,0.028332,0.003079,1.805481,0.022773
2144765,2144765,20200602,SH601658,0.0,0.003079,0.390528,-0.001202
2193853,2193853,20200602,SH601816,0.001585,0.003079,0.819437,-0.000938
2231228,2231228,20200602,SH601916,0.015424,0.003079,0.455547,0.014022
2460935,2460935,20200602,SH603369,-0.019379,0.003079,1.496644,-0.023987
2546548,2546548,20200602,SH603658,-0.050649,0.003079,1.165782,-0.054239
2831757,2831757,20200602,SZ000066,0.0,0.003079,1.390191,-0.00428


In [149]:
# Rebinds `beta` to the contents of a local CSV file.
# NOTE(review): cells in this notebook read `beta['Date']` in some places and
# `beta['date']` in others, so which cells work depends on whether `beta`
# currently holds this CSV or the database frame — confirm execution order.
beta = pd.read_csv('E:\\new_beta_300.csv', encoding="utf-8")

In [130]:
# Unique StockIDs of each 202012 in/out list, loaded in a fixed order.
(IF_in, IF_out,
 IC_in, IC_out,
 CSI1000_in, CSI1000_out,
 CSIRest_in, CSIRest_out) = (
    pd.read_csv(csv_path)['StockID'].unique()
    for csv_path in (
        r'E:\IF_in_202012.csv',
        r'E:\IF_out_202012.csv',
        r'E:\IC_in_202012.csv',
        r'E:\IC_out_202012.csv',
        r'E:\CSI1000_in_202012.csv',
        r'E:\CSI1000_out_202012.csv',
        r'E:\CSIRest_in_202012.csv',
        r'E:\CSIRest_out_202012.csv',
    )
)

# Split every list in two:
#   *_in2  = entrants that also appear in the "out" list of another index,
#   *_in1  = the remaining entrants,
#   *_out1 = leavers that also appear in the "in" list of another index,
#   *_out2 = the remaining leavers.
IC_in2 = list(set(IF_out) & set(IC_in))
IC_in1 = list(set(IC_in) - set(IC_in2))
IC_out1 = list(set(IC_out) & set(IF_in))
IC_out2 = list(set(IC_out) - set(IC_out1))

CSI1000_in2 = list((set(IF_out) & set(CSI1000_in)) | (set(IC_out) & set(CSI1000_in)))
CSI1000_in1 = list(set(CSI1000_in) - set(CSI1000_in2))
CSI1000_out1 = list((set(CSI1000_out) & set(IF_in)) | (set(CSI1000_out) & set(IC_in)))
CSI1000_out2 = list(set(CSI1000_out) - set(CSI1000_out1))

CSIRest_in2 = list(
    (set(IF_out) & set(CSIRest_in))
    | (set(IC_out) & set(CSIRest_in))
    | (set(CSI1000_out) & set(CSIRest_in))
)
CSIRest_in1 = list(set(CSIRest_in) - set(CSIRest_in2))
CSIRest_out1 = list(
    (set(CSIRest_out) & set(IF_in))
    | (set(CSIRest_out) & set(IC_in))
    | (set(CSIRest_out) & set(CSI1000_in))
)
CSIRest_out2 = list(set(CSIRest_out) - set(CSIRest_out1))


# Print the size of every group, one number per line.
for group in (
    IF_in, IF_out,
    IC_in, IC_in1, IC_in2,
    IC_out, IC_out1, IC_out2,
    CSI1000_in, CSI1000_in1, CSI1000_in2,
    CSI1000_out, CSI1000_out1, CSI1000_out2,
    CSIRest_in, CSIRest_in1, CSIRest_in2,
    CSIRest_out, CSIRest_out1, CSIRest_out2,
):
    print(len(group))

26
26
50
32
18
50
4
46
100
83
17
100
16
84
235
114
121
52
51
1


In [135]:
# (path, encoding) of every list; encoding None means the read_csv default.
symbol_sources = [
    (r'E:\预测沪深调入.csv', 'GBK'),
    (r'D:\work\project 3 event study\202012\IF_out_202012.csv', None),
    (r'E:\预测中证调入.csv', 'GBK'),
    (r'D:\work\project 3 event study\202012\IC_out_202012.csv', None),
    (r'E:\预测中证1000调入.csv', 'GBK'),
    (r'D:\work\project 3 event study\202012\CSI1000_out_202012.csv', None),
    (r'D:\work\project 3 event study\202012\CSIRest_out_202012.csv', None),
]
# Unique Symbol values of each file, in the order listed above.
IF_in, IF_out, IC_in, IC_out, CSI1000_in, CSI1000_out, CSIRest_out = [
    pd.read_csv(csv_path, encoding=csv_encoding)['Symbol'].unique()
    for csv_path, csv_encoding in symbol_sources
]

# *_in2 / *_out1: members that also appear in another index's out / in list;
# *_in1 / *_out2: the remaining members.
IC_in2 = list(set(IF_out) & set(IC_in))
IC_in1 = list(set(IC_in) - set(IC_in2))
IC_out1 = list(set(IC_out) & set(IF_in))
IC_out2 = list(set(IC_out) - set(IC_out1))

CSI1000_in2 = list((set(IF_out) & set(CSI1000_in)) | (set(IC_out) & set(CSI1000_in)))
CSI1000_in1 = list(set(CSI1000_in) - set(CSI1000_in2))
CSI1000_out1 = list((set(CSI1000_out) & set(IF_in)) | (set(CSI1000_out) & set(IC_in)))
CSI1000_out2 = list(set(CSI1000_out) - set(CSI1000_out1))

CSIRest_out1 = list(
    (set(CSIRest_out) & set(IF_in))
    | (set(CSIRest_out) & set(IC_in))
    | (set(CSIRest_out) & set(CSI1000_in))
)
CSIRest_out2 = list(set(CSIRest_out) - set(CSIRest_out1))

# Print the size of every group, one number per line.
for group in (
    IF_in, IF_out,
    IC_in, IC_in1, IC_in2,
    IC_out, IC_out1, IC_out2,
    CSI1000_in, CSI1000_in1, CSI1000_in2,
    CSI1000_out, CSI1000_out1, CSI1000_out2,
    CSIRest_out, CSIRest_out1, CSIRest_out2,
):
    print(len(group))

24
24
50
33
17
50
5
45
100
80
20
100
21
79
67
66
1


In [133]:
# Display the CSIRest leavers that are in none of the other "in" lists.
# NOTE(review): this cell's execution count is lower than that of the cell
# which last rebuilt CSIRest_out2, so the saved output may be stale — confirm.
CSIRest_out2

['SZ300151', 'SZ002169', 'SZ300577', 'SZ300064']

In [85]:
# Preview the first rows of `beta`.
beta.head()

Unnamed: 0,skey,name,date,time,beta_10d_IF,beta_60d_IF,beta_10d_IC,beta_60d_IC,beta_10d_CSI1000,beta_60d_CSI1000,alpha_10d_IF,alpha_60d_IF,alpha_10d_IC,alpha_60d_IC,alpha_10d_CSI1000,alpha_60d_CSI1000
0,1600000,浦发银行,20201113,213000000000,0.142741,0.397482,0.0,0.27788,0.0,0.202551,0.000681,-0.002243,0.001238,-0.001744,0.001201,-0.001716
1,1600004,白云机场,20201113,213000000000,0.516949,0.479728,0.477872,0.39359,0.440583,0.298983,0.013796,-0.000279,0.013606,0.000348,0.013386,0.000405
2,1600006,东风汽车,20201113,213000000000,1.537743,0.577649,1.603574,0.536416,0.9236,0.355582,-0.002261,0.004161,-0.002044,0.004936,0.000763,0.004974
3,1600007,中国国贸,20201113,213000000000,0.6766,0.628075,0.697921,0.655882,0.555787,0.535541,0.001669,-0.001311,0.001704,-0.000408,0.002379,-0.00028
4,1600008,首创股份,20201113,213000000000,0.327942,0.359055,0.423842,0.354489,0.34787,0.276258,0.004425,-0.001358,0.004025,-0.000857,0.004282,-0.000779


In [136]:
# For each symbol list and each date window: print the number of distinct
# symbols per date, then take the cross-sectional mean of alpha_60d_IF per
# date, sum those means over the window and scale by 10000.
# The accumulator is called `alpha_sums` (not `re`) so that this cell does not
# rebind the global name `re`, which DBObj.read_daily uses as the regex module.
startDate = [20201113, 20201120, 20201130, 20201207, 20201215, 20201222]
endDate = [20201119, 20201126, 20201204, 20201211, 20201221, 20201228]
for ll in [IF_in, IF_out]:
    print('IF')
    print(len(ll))
    date = []
    alpha_sums = []
    members = beta[beta['Symbol'].isin(ll)]  # filter by symbol once per list
    for sd, ed in zip(startDate, endDate):
        date.append(str(sd) + ' , ' + str(ed))
        by_date = members[(members['date'] >= sd) & (members['date'] <= ed)].groupby('date')
        print(by_date['Symbol'].nunique())
        alpha_sums.append((by_date['alpha_60d_IF'].mean().sum() * 10000).round(2))
    df = pd.DataFrame({'date': date, 're': alpha_sums})
    display(df.T)

IF
24
date
20201113    24
20201116    24
20201117    24
20201118    24
20201119    24
Name: Symbol, dtype: int64
date
20201120    24
20201123    24
20201124    24
20201125    24
20201126    24
Name: Symbol, dtype: int64
date
20201130    24
20201201    24
20201202    24
20201203    24
20201204    24
Name: Symbol, dtype: int64
date
20201207    24
20201208    24
20201209    24
20201210    24
20201211    24
Name: Symbol, dtype: int64
date
20201215    24
20201216    24
20201217    24
20201218    24
20201221    24
Name: Symbol, dtype: int64
date
20201222    24
20201223    24
20201224    24
20201225    24
20201228    24
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-66.9,-83.14,-68.99,-10.63,13.7,-6.01


IF
24
date
20201113    24
20201116    24
20201117    24
20201118    24
20201119    24
Name: Symbol, dtype: int64
date
20201120    24
20201123    24
20201124    24
20201125    24
20201126    24
Name: Symbol, dtype: int64
date
20201130    24
20201201    24
20201202    24
20201203    24
20201204    24
Name: Symbol, dtype: int64
date
20201207    24
20201208    24
20201209    24
20201210    24
20201211    24
Name: Symbol, dtype: int64
date
20201215    24
20201216    24
20201217    24
20201218    24
20201221    24
Name: Symbol, dtype: int64
date
20201222    24
20201223    24
20201224    24
20201225    24
20201228    24
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-42.43,-23.05,-11,-34.78,-49.23,-44.89


In [137]:
# For each symbol list and each date window: print the number of distinct
# symbols per date, then take the cross-sectional mean of alpha_60d_IC per
# date, sum those means over the window and scale by 10000.
# The accumulator is called `alpha_sums` (not `re`) so that this cell does not
# rebind the global name `re`, which DBObj.read_daily uses as the regex module.
startDate = [20201113, 20201120, 20201130, 20201207, 20201215, 20201222]
endDate = [20201119, 20201126, 20201204, 20201211, 20201221, 20201228]
for ll in [IC_in1, IC_in2, IC_out1, IC_out2]:
    print('IC')
    print(len(ll))
    date = []
    alpha_sums = []
    members = beta[beta['Symbol'].isin(ll)]  # filter by symbol once per list
    for sd, ed in zip(startDate, endDate):
        date.append(str(sd) + ' , ' + str(ed))
        by_date = members[(members['date'] >= sd) & (members['date'] <= ed)].groupby('date')
        print(by_date['Symbol'].nunique())
        alpha_sums.append((by_date['alpha_60d_IC'].mean().sum() * 10000).round(2))
    df = pd.DataFrame({'date': date, 're': alpha_sums})
    display(df.T)

IC
33
date
20201113    33
20201116    33
20201117    33
20201118    33
20201119    33
Name: Symbol, dtype: int64
date
20201120    33
20201123    33
20201124    33
20201125    33
20201126    33
Name: Symbol, dtype: int64
date
20201130    33
20201201    33
20201202    33
20201203    33
20201204    33
Name: Symbol, dtype: int64
date
20201207    33
20201208    33
20201209    33
20201210    33
20201211    33
Name: Symbol, dtype: int64
date
20201215    33
20201216    33
20201217    33
20201218    33
20201221    33
Name: Symbol, dtype: int64
date
20201222    33
20201223    33
20201224    33
20201225    33
20201228    33
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-20.55,-25.13,-5.84,47.02,61.16,53.58


IC
17
date
20201113    17
20201116    17
20201117    17
20201118    17
20201119    17
Name: Symbol, dtype: int64
date
20201120    17
20201123    17
20201124    17
20201125    17
20201126    17
Name: Symbol, dtype: int64
date
20201130    17
20201201    17
20201202    17
20201203    17
20201204    17
Name: Symbol, dtype: int64
date
20201207    17
20201208    17
20201209    17
20201210    17
20201211    17
Name: Symbol, dtype: int64
date
20201215    17
20201216    17
20201217    17
20201218    17
20201221    17
Name: Symbol, dtype: int64
date
20201222    17
20201223    17
20201224    17
20201225    17
20201228    17
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,5.22,22.76,40.32,22.81,10.3,25.7


IC
5
date
20201113    5
20201116    5
20201117    5
20201118    5
20201119    5
Name: Symbol, dtype: int64
date
20201120    5
20201123    5
20201124    5
20201125    5
20201126    5
Name: Symbol, dtype: int64
date
20201130    5
20201201    5
20201202    5
20201203    5
20201204    5
Name: Symbol, dtype: int64
date
20201207    5
20201208    5
20201209    5
20201210    5
20201211    5
Name: Symbol, dtype: int64
date
20201215    5
20201216    5
20201217    5
20201218    5
20201221    5
Name: Symbol, dtype: int64
date
20201222    5
20201223    5
20201224    5
20201225    5
20201228    5
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,38.4,7.6,-7.25,26.64,36.23,55.01


IC
45
date
20201113    45
20201116    45
20201117    45
20201118    45
20201119    45
Name: Symbol, dtype: int64
date
20201120    45
20201123    45
20201124    45
20201125    45
20201126    45
Name: Symbol, dtype: int64
date
20201130    45
20201201    45
20201202    45
20201203    45
20201204    45
Name: Symbol, dtype: int64
date
20201207    45
20201208    45
20201209    45
20201210    45
20201211    45
Name: Symbol, dtype: int64
date
20201215    45
20201216    45
20201217    45
20201218    45
20201221    45
Name: Symbol, dtype: int64
date
20201222    45
20201223    45
20201224    45
20201225    45
20201228    45
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-5.65,-19.33,-30.01,-43.53,-42.93,-27.72


In [138]:
# For each symbol list and each date window: print the number of distinct
# symbols per date, then take the cross-sectional mean of alpha_60d_CSI1000
# per date, sum those means over the window and scale by 10000.
# The accumulator is called `alpha_sums` (not `re`) so that this cell does not
# rebind the global name `re`, which DBObj.read_daily uses as the regex module.
startDate = [20201113, 20201120, 20201130, 20201207, 20201215, 20201222]
endDate = [20201119, 20201126, 20201204, 20201211, 20201221, 20201228]
for ll in [CSI1000_in1, CSI1000_in2, CSI1000_out1, CSI1000_out2]:
    print('CSI1000')
    print(len(ll))
    date = []
    alpha_sums = []
    members = beta[beta['Symbol'].isin(ll)]  # filter by symbol once per list
    for sd, ed in zip(startDate, endDate):
        date.append(str(sd) + ' , ' + str(ed))
        by_date = members[(members['date'] >= sd) & (members['date'] <= ed)].groupby('date')
        print(by_date['Symbol'].nunique())
        alpha_sums.append((by_date['alpha_60d_CSI1000'].mean().sum() * 10000).round(2))
    df = pd.DataFrame({'date': date, 're': alpha_sums})
    display(df.T)

CSI1000
80
date
20201113    80
20201116    80
20201117    80
20201118    80
20201119    80
Name: Symbol, dtype: int64
date
20201120    80
20201123    80
20201124    80
20201125    80
20201126    80
Name: Symbol, dtype: int64
date
20201130    80
20201201    80
20201202    80
20201203    80
20201204    80
Name: Symbol, dtype: int64
date
20201207    80
20201208    80
20201209    80
20201210    80
20201211    80
Name: Symbol, dtype: int64
date
20201215    80
20201216    80
20201217    80
20201218    80
20201221    80
Name: Symbol, dtype: int64
date
20201222    80
20201223    80
20201224    80
20201225    80
20201228    80
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-7.57,-17.83,-17.91,1.39,-3.54,-31.3


CSI1000
20
date
20201113    20
20201116    20
20201117    20
20201118    20
20201119    20
Name: Symbol, dtype: int64
date
20201120    20
20201123    20
20201124    20
20201125    20
20201126    20
Name: Symbol, dtype: int64
date
20201130    20
20201201    20
20201202    20
20201203    20
20201204    20
Name: Symbol, dtype: int64
date
20201207    20
20201208    20
20201209    20
20201210    20
20201211    20
Name: Symbol, dtype: int64
date
20201215    20
20201216    20
20201217    20
20201218    20
20201221    20
Name: Symbol, dtype: int64
date
20201222    20
20201223    20
20201224    20
20201225    20
20201228    20
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,18.47,11.49,-4.92,-6.1,15.46,54.87


CSI1000
21
date
20201113    21
20201116    21
20201117    21
20201118    21
20201119    21
Name: Symbol, dtype: int64
date
20201120    21
20201123    21
20201124    21
20201125    21
20201126    21
Name: Symbol, dtype: int64
date
20201130    21
20201201    21
20201202    21
20201203    21
20201204    21
Name: Symbol, dtype: int64
date
20201207    21
20201208    21
20201209    21
20201210    21
20201211    21
Name: Symbol, dtype: int64
date
20201215    21
20201216    21
20201217    21
20201218    21
20201221    21
Name: Symbol, dtype: int64
date
20201222    21
20201223    21
20201224    21
20201225    21
20201228    21
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-22.4,-19.94,-14.79,44.64,50.54,49.47


CSI1000
79
date
20201113    79
20201116    79
20201117    79
20201118    79
20201119    79
Name: Symbol, dtype: int64
date
20201120    79
20201123    79
20201124    79
20201125    79
20201126    79
Name: Symbol, dtype: int64
date
20201130    79
20201201    79
20201202    79
20201203    79
20201204    79
Name: Symbol, dtype: int64
date
20201207    79
20201208    79
20201209    79
20201210    79
20201211    79
Name: Symbol, dtype: int64
date
20201215    79
20201216    79
20201217    79
20201218    79
20201221    79
Name: Symbol, dtype: int64
date
20201222    79
20201223    79
20201224    79
20201225    79
20201228    79
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,52.47,49.77,52.74,3.91,-2.91,6.92


In [140]:
# For each symbol list and each date window: print the number of distinct
# symbols per date, then take the cross-sectional mean of alpha_60d_CSI1000
# per date, sum those means over the window and scale by 10000.
# NOTE(review): the CSIRest lists are evaluated with the CSI1000 alpha column,
# exactly as in the original cell — confirm this is intended.
# The accumulator is called `alpha_sums` (not `re`) so that this cell does not
# rebind the global name `re`, which DBObj.read_daily uses as the regex module.
startDate = [20201113, 20201120, 20201130, 20201207, 20201215, 20201222]
endDate = [20201119, 20201126, 20201204, 20201211, 20201221, 20201228]
for ll in [CSIRest_out1, CSIRest_out2]:
    print('CSIRest')
    print(len(ll))
    date = []
    alpha_sums = []
    members = beta[beta['Symbol'].isin(ll)]  # filter by symbol once per list
    for sd, ed in zip(startDate, endDate):
        date.append(str(sd) + ' , ' + str(ed))
        by_date = members[(members['date'] >= sd) & (members['date'] <= ed)].groupby('date')
        print(by_date['Symbol'].nunique())
        alpha_sums.append((by_date['alpha_60d_CSI1000'].mean().sum() * 10000).round(2))
    df = pd.DataFrame({'date': date, 're': alpha_sums})
    display(df.T)

CSIRest
66
date
20201113    66
20201116    66
20201117    66
20201118    66
20201119    66
Name: Symbol, dtype: int64
date
20201120    66
20201123    66
20201124    66
20201125    66
20201126    66
Name: Symbol, dtype: int64
date
20201130    66
20201201    66
20201202    66
20201203    66
20201204    66
Name: Symbol, dtype: int64
date
20201207    66
20201208    66
20201209    66
20201210    66
20201211    66
Name: Symbol, dtype: int64
date
20201215    66
20201216    66
20201217    66
20201218    66
20201221    66
Name: Symbol, dtype: int64
date
20201222    66
20201223    66
20201224    66
20201225    66
20201228    66
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,-7.55,-22.57,-23.35,13.68,15.75,3.22


CSIRest
1
date
20201113    1
20201116    1
20201117    1
20201118    1
20201119    1
Name: Symbol, dtype: int64
date
20201120    1
20201123    1
20201124    1
20201125    1
20201126    1
Name: Symbol, dtype: int64
date
20201130    1
20201201    1
20201202    1
20201203    1
20201204    1
Name: Symbol, dtype: int64
date
20201207    1
20201208    1
20201209    1
20201210    1
20201211    1
Name: Symbol, dtype: int64
date
20201215    1
20201216    1
20201217    1
20201218    1
20201221    1
Name: Symbol, dtype: int64
date
20201222    1
20201223    1
20201224    1
20201225    1
20201228    1
Name: Symbol, dtype: int64


Unnamed: 0,0,1,2,3,4,5
date,"20201113 , 20201119","20201120 , 20201126","20201130 , 20201204","20201207 , 20201211","20201215 , 20201221","20201222 , 20201228"
re,940.72,959.35,718.81,228.56,74.9,148.62


In [66]:
# This is the method used previously; it looks like I forgot to change it and
# I no longer remember the details. If necessary, regenerate the results. The
# two methods only differ when a stock is suspended on a given day.
# Here: sum alpha per symbol over the window, then average across symbols.
beta[(beta['Symbol'].isin(In1['StockID'])) & (beta['Date'] >= 20200608) & (beta['Date'] <= 20200612)].groupby('Symbol')['alpha'].sum().mean() * 10000

-70.94328928619402

In [67]:
# This is the method used originally, for 2019 and 2020.
# Here: average alpha across symbols per date, then sum over the dates.
beta[(beta['Symbol'].isin(In1['StockID'])) & (beta['Date'] >= 20200608) & (beta['Date'] <= 20200612)].groupby('Date')['alpha'].mean().sum() * 10000

-65.92370359426526