In [2]:
import pandas as pd
import numpy as np
import os.path as op

IMAGE_WIDTH = {5: 15, 20: 60, 60: 180}
IMAGE_HEIGHT = {5: 32, 20: 64, 60: 96}  


Load Labels

    'Date': The last day of the 20-day rolling window for the chart.
    'StockID': CRSP PERMNO that identifies the stock.
    'MarketCap': Market capitalization in dollars, recorded in thousands.
    'Ret_{t}d': t=5,20,60, next t-day holding period return.
    'Ret_month': Holding period return for the next month, from the current month-end to the next month-end.
    'EWMA_vol': Exponentially weighted volatility (based on squared daily returns) with alpha = 0.05. A one-day delay is included.



# Load Training Data

##################################################
##Run this section only the first time:
##it concatenates all yearly data into one file.
##It can be commented out after the first run.
##################################################

In [3]:
# Probe for the cached, already-concatenated training labels.
# `first_time` ends up True when the cache still has to be built.
try:
    Label_Train = pd.read_hdf('./Label_Train.h5')
    first_time = False
except (OSError, KeyError, ValueError):
    # Missing (FileNotFoundError is an OSError) or unreadable cache file.
    # Anything else (KeyboardInterrupt, a missing HDF5 backend, ...) is a
    # real problem and is allowed to propagate instead of being swallowed.
    first_time = True

first_time

False

In [4]:
if first_time:
    data_dir = "./img_data/monthly_20d"
    train_years = range(1993, 2000)

    # Read every yearly label file once and remember its row count; the row
    # counts give each year's position inside the concatenated image array.
    yearly_labels = []
    length_Train = pd.Series(dtype=float)
    for year in train_years:
        label_df = pd.read_feather(op.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather"))
        length_Train.loc[year] = len(label_df)
        yearly_labels.append(label_df)

    # DataFrame.append was removed in pandas 2.0; one concat over the list
    # yields the same frame (original indices kept) without quadratic copying.
    Label_Train = pd.concat(yearly_labels)

    # to_hdf requires a key; 'data' is the key the reload cell reads back.
    Label_Train.to_hdf('./Label_Train.h5', key='data')
    length_Train.to_hdf('./length_Train.h5', key='data')

    # Load images by year and concatenate them into the first year's file.
    for year in train_years:
        image_path = op.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_images.dat")

        if year == train_years[0]:
            # Opened with the shape of the full training set, so this one
            # file holds all years (numpy is expected to grow the file in
            # 'r+' mode when the requested shape needs more bytes).
            Images = np.memmap(
                        image_path,
                        dtype=np.uint8, mode='r+', order='C',
                        shape=(len(Label_Train), IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
        else:
            # The yearly source files are only read, so open them read-only.
            images = np.memmap(
                        image_path,
                        dtype=np.uint8, mode='r').reshape(
                                            (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))

            # length_Train holds floats; slice bounds must be integers.
            starting = int(length_Train.loc[:year].iloc[:-1].sum())
            end = int(length_Train.loc[:year].sum())
            Images[starting:end, :] = images

    # Make sure the copied rows reach the file on disk.
    Images.flush()

# Load Data AFTER 1st Time

In [5]:
# Reload the cached labels and per-year row counts, then map the image file
# named after the first training year as an array of
# (IMAGE_HEIGHT[20], IMAGE_WIDTH[20]) images.
Label_Train = pd.read_hdf('./Label_Train.h5', key='data')
length_Train = pd.read_hdf('./length_Train.h5', key='data')

year = 1993
train_image_path = op.join("./img_data/monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat")
train_image_shape = (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20])
Images = np.memmap(train_image_path, dtype=np.uint8, mode='r+').reshape(train_image_shape)
        


In [6]:
# Number of rows in the training label table.
len(Label_Train)

694871

# Check & Plot the sample images


In [None]:
# Check: the rows of the concatenated array `Images` that belong to `year`
# must match the first rows of that year's own image file.

year = 1993
images = np.memmap(
                    op.join("./img_data/monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat"),
                   dtype=np.uint8, mode='r+').reshape(
                                        (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
check1 = images[:10]


# Offset of `year` inside `Images` = number of rows of all earlier years.
# Summing the label slice gives 0 for the first year (no earlier entry), and
# int() is needed because length_Train stores floats.
offset = int(length_Train.loc[:year - 1].sum())
check2 = Images[offset:offset + 10]


for i in range(0, 10):
    # Widen before subtracting: uint8 arithmetic wraps around, which would
    # make the absolute difference meaningless.
    sss = np.sum(np.abs(check1[i].astype(np.int64) - check2[i].astype(np.int64)))
    print(sss)
    
    
    

In [None]:
from matplotlib import pyplot as plt
# For the first five samples, print the index and shape, then draw the image
# from the yearly file followed by the one from the concatenated array.
for i in range(5):
    print(i)
    print(check1[i].shape)
    for picture in (check1[i], check2[i]):
        plt.imshow(picture, cmap='gray')
        plt.show()


# Load Test Data

In [7]:
# Probe for the cached test labels. The result goes into Label_Test so the
# training labels in Label_Train are not overwritten.
try:
    Label_Test = pd.read_hdf('./Label_Test.h5')
    first_time = False
except (OSError, KeyError, ValueError):
    # Missing (FileNotFoundError is an OSError) or unreadable cache file.
    first_time = True
print(first_time)

if first_time:
    data_dir = "./img_data/monthly_20d"
    test_years = range(2000, 2020)

    # Read every yearly label file once and remember its row count.
    yearly_labels = []
    length_Test = pd.Series(dtype=float)
    for year in test_years:
        label_df = pd.read_feather(op.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_labels_w_delay.feather"))
        length_Test.loc[year] = len(label_df)
        yearly_labels.append(label_df)

    # DataFrame.append was removed in pandas 2.0; concat once instead.
    Label_Test = pd.concat(yearly_labels)

    # to_hdf requires a key; 'data' is the key later cells read back.
    Label_Test.to_hdf('./Label_Test.h5', key='data')
    length_Test.to_hdf('./length_Test.h5', key='data')

    # Concatenate the yearly image files into the first test year's file.
    # The target is Images_Test so the training array `Images` stays intact.
    for year in test_years:
        # Every year is read from the same directory as the label files.
        image_path = op.join(data_dir, f"20d_month_has_vb_[20]_ma_{year}_images.dat")

        if year == test_years[0]:
            Images_Test = np.memmap(
                        image_path,
                        dtype=np.uint8, mode='r+', order='C',
                        shape=(len(Label_Test), IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
        else:
            # The yearly source files are only read, so open them read-only.
            images = np.memmap(
                        image_path,
                        dtype=np.uint8, mode='r').reshape(
                                            (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))

            # length_Test holds floats; slice bounds must be integers.
            starting = int(length_Test.loc[:year].iloc[:-1].sum())
            end = int(length_Test.loc[:year].sum())
            Images_Test[starting:end, :] = images

    # Make sure the copied rows reach the file on disk.
    Images_Test.flush()

False


In [8]:
# Reload the cached test labels and map the image file named after the first
# test year as an array of (IMAGE_HEIGHT[20], IMAGE_WIDTH[20]) images.
year = 2000
Label_Test = pd.read_hdf('./Label_Test.h5')

test_image_path = op.join("./img_data/monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat")
test_image_shape = (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20])
Images_Test = np.memmap(test_image_path, dtype=np.uint8, mode='r+').reshape(test_image_shape)


# 1. shrink image sizes: for robustness check

In [None]:
!pip3 install opencv-python


In [None]:

import cv2
import matplotlib.pyplot as plt

# Pick one training image and display it before any resizing.
sample_index = 602000
img = Images[sample_index]
print("original image shape:", img.shape)
plt.imshow(img, cmap='gray')
plt.show()
 

def resize(img,scale_percent = 50):
    """Shrink an image vertically and push bright pixels to full white.

    Parameters
    ----------
    img : array
        2-D image; ``img.shape[0]`` is its height and ``img.shape[1]`` its width.
    scale_percent : number, default 50
        Target height as a percentage of the original height. The width is
        left unchanged.

    Returns
    -------
    array
        The resized image, in which every pixel at or above
        ``(255 * scale_percent / 100) / 2`` is replaced by 255 and all other
        pixels keep their resized value.
    """
    src_height, src_width = img.shape[0], img.shape[1]
    # cv2.resize takes the target size as (width, height).
    target_size = (src_width, int(src_height * scale_percent / 100))
    shrunk = cv2.resize(img, target_size)

    cutoff = (255 * scale_percent / 100) / 2
    return np.where(shrunk >= cutoff, 255, shrunk)



In [None]:
# Show the sample image resized to 50% and then 75% of its original height.
for pct in (50, 75):
    resized = resize(img, scale_percent=pct)
    print('Resized Dimensions : ', resized.shape)
    plt.imshow(resized, cmap='gray')
    plt.show()
 

In [None]:
# Write height-reduced copies of the training and test images to new files.
scale_percent = 50
height = int(IMAGE_HEIGHT[20] * scale_percent / 100)

# Both splits share the same steps; only the file suffix, the source array
# and the number of rows (taken from the label table) differ.
for split, source, n_rows in (("train", Images, len(Label_Train)),
                              ("test", Images_Test, len(Label_Test))):
    Images_re = np.memmap(
                    op.join(
                    "./img_data/monthly_20d",
                    f"20d_month_has_vb_[20]_ma_resized_"+str(
                        scale_percent)+f"_images_{split}.dat"),
                    dtype=np.uint8, mode='w+',order='C',
                    shape=(n_rows, height, IMAGE_WIDTH[20]))
    # Iterate over the array actually being resized: the test split must be
    # driven by len(Images_Test), not by the length of the training array.
    for i in range(len(source)):
        Images_re[i] = resize(source[i], scale_percent=scale_percent)
    # Make sure the resized rows reach the file on disk.
    Images_re.flush()


In [None]:
# Write height-reduced copies of the training and test images to new files.
scale_percent = 75
height = int(IMAGE_HEIGHT[20] * scale_percent / 100)

# Both splits share the same steps; only the file suffix, the source array
# and the number of rows (taken from the label table) differ.
for split, source, n_rows in (("train", Images, len(Label_Train)),
                              ("test", Images_Test, len(Label_Test))):
    Images_re = np.memmap(
                    op.join(
                    "./img_data/monthly_20d",
                    f"20d_month_has_vb_[20]_ma_resized_"+str(
                        scale_percent)+f"_images_{split}.dat"),
                    dtype=np.uint8, mode='w+',order='C',
                    shape=(n_rows, height, IMAGE_WIDTH[20]))
    # Iterate over the array actually being resized: the test split must be
    # driven by len(Images_Test), not by the length of the training array.
    for i in range(len(source)):
        Images_re[i] = resize(source[i], scale_percent=scale_percent)
    # Make sure the resized rows reach the file on disk.
    Images_re.flush()

    
    
    
# Compare one image before and after resizing. At this point Images_re holds
# the resized *test* images (the last array written above), so the original
# must come from Images_Test for the two pictures to show the same sample.
img = Images_Test[600000]
print("original image shape:",img.shape)
plt.imshow(img, cmap='gray')
plt.show()


img = Images_re[600000]
print("resize image shape:",img.shape)
plt.imshow(img, cmap='gray')
plt.show()


# 2. Normalized Returns for Extension

In [9]:
def uniformize(score,validity):
    """Turn each row of ``score`` into centred, rank-based values.

    Parameters
    ----------
    score : DataFrame
        Raw values; every row is processed independently.
    validity : DataFrame of bool
        Entries that are False are blanked out (NaN) before ranking.

    Returns
    -------
    DataFrame
        Per-row ranks shifted to start at 0, divided by the row's rank
        range, then shifted by the row median. Blanked entries stay NaN.
    """
    ranks = score.mask(~validity).rank(axis=1)

    # Shift so the lowest rank in every row becomes 0.
    shifted = ranks.sub(ranks.min(axis=1), axis=0)

    # Divide by the row's rank range.
    row_range = shifted.max(axis=1) - shifted.min(axis=1)
    scaled = shifted.divide(row_range, axis=0)

    # Centre each row on its median.
    row_median = scaled.median(axis=1)
    return scaled.sub(row_median, axis=0)

In [10]:
# Label column used as the prediction target in the cells below
# ('Ret_{t}d' is described above as the next t-day holding period return).
pred_field='Ret_20d'

In [25]:
# Reload both cached label tables and stack them (training rows first).
Label_Test=pd.read_hdf('./Label_Test.h5',key='data')
Label_Raw=pd.read_hdf('./Label_Train.h5',key='data')

# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
# gives the same result (original indices are kept).
label_data = pd.concat([Label_Raw, Label_Test])


  label_data = Label_Raw.append(Label_Test)


In [12]:
# Pivot the target column into a Date x StockID table and store it.
labels_by_date_stock = label_data.set_index(['Date','StockID'])
target_long = labels_by_date_stock[pred_field].sort_index()
return_pred = target_long.unstack()
return_pred.to_hdf('./return_pred.h5', key='return')


  check_attribute_name(name)


In [13]:
# Rank-normalise the target per date, using every non-null entry, and store it.
# (The file name is kept exactly as spelled; other cells write to the same file.)
return_is_valid = return_pred.notnull()
return_pred_normalized = uniformize(return_pred, return_is_valid)
return_pred_normalized.to_hdf('./predictor_varialbes.h5', key='normalized_return')

In [14]:
# Pivot market cap and EWMA volatility into Date x StockID tables.
labels_indexed = label_data.set_index(['Date','StockID'])
marketcap = labels_indexed['MarketCap'].sort_index().unstack()
vol = labels_indexed['EWMA_vol'].sort_index().unstack()

# return_pred moved down by one row (each date carries the previous row's value).
returns = return_pred.shift()

In [15]:
# Scale the target by the EWMA volatility taken from the following row
# (shift(-1) moves every value one row up), then rank-normalise and store it.
next_row_vol = vol.shift(-1)
sharpe_pred = return_pred / next_row_vol
sharpe_is_valid = sharpe_pred.notnull()
sharpe_pred_normalized = uniformize(sharpe_pred, sharpe_is_valid)

sharpe_pred_normalized.to_hdf('./predictor_varialbes.h5', key='normalized_sharpe')

In [16]:
#beta adjusted return

In [17]:
# Cap-weighted benchmark: keep market caps only where a target return exists,
# normalise them to sum to 1 per date, and take the weighted sum of returns.
has_return = return_pred.notnull()
marketcap_weight = marketcap.where(has_return)
total_cap_per_date = marketcap_weight.sum(axis=1)
marketcap_weight = marketcap_weight.div(total_cap_per_date, axis=0)
benchmark = marketcap_weight.mul(return_pred).sum(axis=1)

In [18]:
# Rolling beta of every stock against the benchmark:
# beta = corr(stock, benchmark) * std(stock) / std(benchmark),
# over 60-row windows that need at least 24 observations.
window, min_obs = 60, 24
rolling_endog = return_pred.rolling(window, min_periods=min_obs)
rolling_exog = benchmark.rolling(window, min_periods=min_obs)

rolling_corr = rolling_endog.corr(benchmark)
rolling_endog_std = rolling_endog.std()
rolling_exog_std = rolling_exog.std()

std_ratio = rolling_endog_std.div(rolling_exog_std, axis=0)
beta = rolling_corr * std_ratio
beta.stack().describe()

count    2.168390e+06
mean     1.028526e+00
std      9.483866e-01
min     -2.013754e+01
25%      4.230109e-01
50%      9.005747e-01
75%      1.461695e+00
max      4.101963e+01
dtype: float64

In [19]:
# Beta-adjusted return: subtract the previous row's beta times the benchmark
# return from the target, then rank-normalise per date.
lagged_beta = beta.shift()
benchmark_component = lagged_beta.multiply(benchmark, axis=0)
adj_return_next = return_pred - benchmark_component

adj_is_valid = adj_return_next.notnull()
adj_return_normalized = uniformize(adj_return_next, adj_is_valid)
adj_return_normalized

StockID,10001,10002,10003,10009,10010,10011,10012,10016,10018,10019,...,93427,93428,93429,93430,93431,93432,93433,93434,93435,93436
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-29,,,,,,,,,,,...,,,,,,,,,,
1993-02-26,,,,,,,,,,,...,,,,,,,,,,
1993-03-31,,,,,,,,,,,...,,,,,,,,,,
1993-04-30,,,,,,,,,,,...,,,,,,,,,,
1993-05-28,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-30,,,,,,,,,,,...,0.070014,,,,,,,-0.396141,,0.271873
2019-09-30,,,,,,,,,,,...,0.180209,,,,,,,-0.288817,,0.491754
2019-10-31,,,,,,,,,,,...,0.284318,,,,,,,-0.087583,,0.224048
2019-11-29,,,,,,,,,,,...,0.277643,,,,,,,-0.434423,,0.454197


In [20]:
# Store the normalised beta-adjusted return next to the other predictors.
adj_return_normalized.to_hdf('./predictor_varialbes.h5', key='normalized_adjusted_return')

In [21]:
# Attach the three normalised targets to the test labels and save the result.
Label_Test = Label_Test.reset_index().set_index(['Date','StockID'])

# (Date, StockID) pairs whose target value is present.
has_target = Label_Test[pred_field].notnull()
inds = Label_Test[has_target].index

# Each wide Date x StockID table is stacked back to long form and looked up
# at those pairs; assignment then aligns on the (Date, StockID) index.
for column_name, wide_table in (
        ('return_pred_normalized', return_pred_normalized),
        ('sharpe_pred_normalized', sharpe_pred_normalized),
        ('adj_return_normalized', adj_return_normalized)):
    Label_Test[column_name] = wide_table.stack().reindex(inds)

# Restore the original 'index' column as the index before saving.
Label_Test = Label_Test.reset_index().set_index('index')

Label_Test.to_hdf('./Label_Test_Addtional_Predictors.h5', key='data')

In [22]:
# Attach the three normalised targets to the training labels and save the result.
Label_Raw = Label_Raw.reset_index().set_index(['Date','StockID'])

# (Date, StockID) pairs whose target value is present.
has_target = Label_Raw[pred_field].notnull()
inds = Label_Raw[has_target].index

# Each wide Date x StockID table is stacked back to long form and looked up
# at those pairs; assignment then aligns on the (Date, StockID) index.
for column_name, wide_table in (
        ('return_pred_normalized', return_pred_normalized),
        ('sharpe_pred_normalized', sharpe_pred_normalized),
        ('adj_return_normalized', adj_return_normalized)):
    Label_Raw[column_name] = wide_table.stack().reindex(inds)

# Restore the original 'index' column as the index before saving.
Label_Raw = Label_Raw.reset_index().set_index('index')

Label_Raw.to_hdf('./Label_Raw_Addtional_Predictors.h5', key='data')