# Synthetic data examples

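In this notebook we will build synthetic data suitable for Alphalens analysis. This is useful for understanding how Alphalens expects its input to be formatted, and it is also a good testing environment for experimenting with Alphalens. Note that the cells below work with price and consensus-EPS data loaded from CSV files rather than with generated synthetic data.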

In [2]:
%matplotlib inline
    
from numpy import nan
import pandas as pd
import numpy as np
from pandas import (DataFrame, date_range)
import matplotlib.pyplot as plt
import pymysql
from data_tools.api import *
from utilscht.Data import *
import statsmodels.api as sm

from alphalens.performance import *
from alphalens.tears import ( create_returns_tear_sheet,
                      create_information_tear_sheet,
                      create_turnover_tear_sheet,
                      create_summary_tear_sheet,
                      create_full_tear_sheet,
                      create_event_returns_tear_sheet,
                      create_event_study_tear_sheet)

from alphalens.utils import get_clean_factor_and_forward_returns

import os

# Connection settings for the `wind` MySQL database.
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the notebook itself.
# NOTE(review): the literal fallbacks (including the password) are kept only so that
# existing setups keep working unchanged; the password has been committed in clear
# text and should be rotated, after which the fallbacks can be removed.
DB_INFO = dict(host=os.environ.get("WIND_DB_HOST", '192.168.1.234'),
               user=os.environ.get("WIND_DB_USER", 'winduser'),
               password=os.environ.get("WIND_DB_PASSWORD", '1qaz@WSX'),
               db=os.environ.get("WIND_DB_NAME", 'wind'))

# DictCursor makes every fetched row a dict keyed by column name.
conn = pymysql.connect(**DB_INFO, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

In [168]:
# Load the price data and the consensus-estimate (consensus EPS) data.
#
# One-off export that wrote open_price.csv from the database connection `conn`.
# It was already disabled in this cell and is kept here for reference only:
#
#   sql = """SELECT S_INFO_WINDCODE, TRADE_DT, S_DQ_ADJOPEN from ASHAREEODPRICES where TRADE_DT between '20170101' and '20191031'"""
#   df_open = pd.read_sql_query(sql, conn)
#   df_open = df_open.rename(columns={"S_INFO_WINDCODE": "sid", "TRADE_DT": "DataDate"})
#   df_open = df_open.sort_values(["sid", "DataDate"])
#   df_open.to_csv("open_price.csv", index=False)

# DataDate is read as text so that it can be sliced by position later on.
df_open = pd.read_csv("open_price.csv", dtype={"DataDate": str})
df_con_est_eps = pd.read_csv(
    r"/home/ywang/proj_3/data/con_eps_my.csv",
    dtype={"DataDate": str},
)
df_close = pd.read_csv(
    r"/home/ywang/proj_3/data/close_price.csv",
    dtype={"DataDate": str},
)

# Reshape from long to wide: one row per (DataDate, sid) and one column per
# REPORTING_PERIOD value, keeping only the con_est_eps figures.
df_con_est_eps = (
    df_con_est_eps
    .set_index(["DataDate", "sid", "REPORTING_PERIOD"])
    .unstack("REPORTING_PERIOD")["con_est_eps"]
    .reset_index()
)

In [174]:
# Build the consensus-estimate based factors
def con_ep_calcu(df):
    """Add consensus-EPS valuation and growth factors to the rows of one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows that all belong to the same year. Columns read here:
        ``"year"`` (four-digit year; only the first row is consulted),
        ``"DataDate"`` (``YYYYMMDD`` strings), ``"S_DQ_CLOSE"`` and three
        integer-labelled columns ``<year-1>1231``, ``<year>1231`` and
        ``<year+1>1231`` (presumably the consensus EPS per fiscal-year end;
        verify against the input file).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four extra columns: ``con_ep``,
        ``con_ep_rolling``, ``growth`` and ``con_PEG``. The input frame is
        left untouched, which keeps the function safe to use with
        ``groupby(...).apply``.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    """
    out = df.copy()
    if out.empty:
        # Nothing to compute; still return the expected columns.
        for col in ("con_ep", "con_ep_rolling", "growth", "con_PEG"):
            out[col] = np.nan
        return out

    # Positional access: a label lookup returns a Series when the index has duplicates.
    year = int(out["year"].iloc[0])
    prev_fy, curr_fy, next_fy = ((year + step) * 10000 + 1231 for step in (-1, 0, 1))

    close = out["S_DQ_CLOSE"]
    eps_prev = out[prev_fy]
    eps_curr = out[curr_fy]
    eps_next = out[next_fy]

    # Earnings yield on the current fiscal year's estimate.
    out["con_ep"] = eps_curr / close

    # Blend current- and next-year estimates, weighted by the number of months
    # left until year end (12 minus the month taken from DataDate).
    months_left = 12 - out["DataDate"].str[4:6].astype(int)
    rolling_eps = eps_curr * months_left / 12 + eps_next * (12 - months_left) / 12
    out["con_ep_rolling"] = rolling_eps / close

    # Annualised growth over the two years from prev_fy to next_fy.
    # A negative EPS ratio yields NaN here.
    out["growth"] = np.sqrt(eps_next / eps_prev) - 1

    out["con_PEG"] = 1 / (out["con_ep_rolling"] * out["growth"])

    return out

# con_ep_calcu divides by S_DQ_CLOSE, so the close prices have to be attached first.
# The guard keeps the cell re-runnable: merging a second time would create
# suffixed S_DQ_CLOSE_x / S_DQ_CLOSE_y columns instead of S_DQ_CLOSE.
# NOTE(review): assumes close_price.csv provides the S_DQ_CLOSE column - confirm.
if "S_DQ_CLOSE" not in df_con_est_eps.columns:
    df_con_est_eps=pd.merge(df_con_est_eps,df_close,on=["sid","DataDate"],how="left")
df_con_est_eps["year"]=df_con_est_eps["DataDate"].str[0:4]
# Iterate over the groups explicitly so that every group frame still carries the
# "year" column that con_ep_calcu reads; sort_index restores the original row order.
df_con_est_eps=pd.concat(
    [con_ep_calcu(year_rows) for _, year_rows in df_con_est_eps.groupby("year")]
).sort_index()

  if sys.path[0] == '':


In [None]:
# Market-cap and industry neutralisation of a factor cross-section
def Neutralize(df, factor_name=None):
    """Neutralise one cross-section of a factor against size and industry.

    The factor is regressed (OLS, no separate intercept) on ``mktcap`` and the
    one-hot encoded ``L1_INDUSTRY``. The residuals are clipped to three
    standard deviations around their mean and then standardised.

    Parameters
    ----------
    df : pandas.DataFrame
        One cross-section with the columns ``factor_name``, ``"L1_INDUSTRY"``
        and ``"mktcap"``.
    factor_name : str, optional
        Name of the factor column. When omitted, a notebook-level variable
        called ``factor_name`` is used, which is what earlier versions of this
        function always did.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column ``factor_name + "_n"``.

    Raises
    ------
    NameError
        If ``factor_name`` is neither passed nor defined at notebook level.
    """
    if factor_name is None:
        try:
            factor_name = globals()["factor_name"]
        except KeyError:
            raise NameError("Neutralize needs factor_name: pass it explicitly") from None

    out = df.copy()
    raw_factor = np.asarray(out[factor_name], dtype=float)
    industry_dummies = np.asarray(pd.get_dummies(out["L1_INDUSTRY"]), dtype=float)
    size = np.asarray(out["mktcap"], dtype=float)

    # The industry dummies already span a constant, so no intercept column is added.
    exog = np.column_stack([size, industry_dummies])
    resid = np.array(sm.OLS(raw_factor, exog).fit().resid, dtype=float)

    # Winsorise at three standard deviations, then z-score.
    mu = np.mean(resid)
    sigma = np.std(resid)
    resid = np.clip(resid, mu - 3 * sigma, mu + 3 * sigma)
    spread = np.std(resid)
    if spread > 0:
        out[factor_name + "_n"] = (resid - np.mean(resid)) / spread
    else:
        # No dispersion (or NaN inputs): the z-score is undefined.
        out[factor_name + "_n"] = np.nan

    return out


def Factor_Test(factor_name,df_con_est_eps,df_open,
                start_date="20160101",end_date="20191031",
                quantiles=10,periods=(3,5,10,20,30)):
    """Run an Alphalens summary tear sheet for one size/industry-neutral factor.

    Parameters
    ----------
    factor_name : str
        Column of ``df_con_est_eps`` holding the raw factor.
    df_con_est_eps : pandas.DataFrame
        Long table with ``"DataDate"``, ``"sid"`` and ``factor_name``.
    df_open : pandas.DataFrame
        Long table with ``"DataDate"``, ``"sid"`` and ``"S_DQ_ADJOPEN"``.
    start_date, end_date : str
        Range passed to ``query_table`` for the industry / market-cap data.
    quantiles : int
        Number of factor quantiles handed to Alphalens.
    periods : sequence of int
        Forward-return horizons handed to Alphalens.

    Returns
    -------
    None
        The tear sheet is rendered as a side effect.

    Raises
    ------
    ValueError
        If the factor data and the industry / market-cap data share no
        ``(date, sid)`` rows.
    """
    # Factor panel: dates x sids. Zeros and infinities count as missing and are
    # forward-filled; sids that still have gaps are dropped.
    factor_wide=pd.pivot_table(df_con_est_eps,factor_name,index="DataDate",columns="sid")
    factor_wide=factor_wide.replace([0,np.inf,-np.inf],np.nan).ffill()
    factor_wide=factor_wide.dropna(axis=1)
    # Shift the factor values one row (one date) forward and drop the now-empty first row.
    factor_wide=factor_wide.shift(1).dropna()

    # Price panel: dates x sids of adjusted open prices.
    price_data=pd.pivot_table(df_open,"S_DQ_ADJOPEN",index="DataDate",columns="sid")

    # Alphalens expects a datetime index named 'date' on both inputs.
    factor_wide.index=pd.to_datetime(factor_wide.index)
    price_data.index=pd.to_datetime(price_data.index)
    factor_wide.index.name='date'
    price_data.index.name='date'
    factor_long=factor_wide.stack().to_frame(factor_name).reset_index()

    # Attach industry and market-cap information.
    # NOTE(review): query_table is assumed to return "sid" and "DataDate" columns
    # next to the requested fields - confirm against data_tools.api.
    indus_size_df=query_table("DailyBar",start_date=start_date,end_date=end_date,fields=["L1_INDUSTRY","mktcap"])
    indus_size_df["date"]=pd.to_datetime(indus_size_df["DataDate"].astype(str))
    merged=pd.merge(factor_long,indus_size_df,on=["sid","date"])

    # Neutralise date by date. The factor name is passed explicitly so that the
    # argument of this function is used, not a notebook-level variable.
    daily_parts=[Neutralize(day_rows,factor_name) for _,day_rows in merged.groupby("date")]
    if not daily_parts:
        raise ValueError("no overlapping (date, sid) rows between the factor data and DailyBar")
    neutral=pd.concat(daily_parts)
    # Alphalens documents the factor input as a Series indexed by (date, asset).
    factor_series=neutral.set_index(["date","sid"])[factor_name+"_n"]

    # Alphalens factor analysis (quantile test and IC test).
    factor_price_data = get_clean_factor_and_forward_returns(
        factor_series,
        price_data,
        quantiles=quantiles,
        periods=periods)

    # create_summary_tear_sheet has no by_group parameter in the published Alphalens API.
    create_summary_tear_sheet(factor_price_data, long_short=False, group_neutral=False)