In [None]:
# Data handling and plotting libraries used by the cells below.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the built-in 'fivethirtyeight' matplotlib style to every figure.
plt.style.use('fivethirtyeight')

import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

import tqdm
# None removes the limit on how many DataFrame columns are displayed.
pd.set_option("display.max_columns",None)

In [None]:
# Load the merged dataset; index_col=0 makes the first CSV column the row index.
data2 = pd.read_csv('merge_data.csv',index_col=0)

In [None]:
# Import package
from sklearn.preprocessing import StandardScaler,MinMaxScaler

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet

from sklearn.metrics import r2_score


In [None]:
#The recursive performance evaluation scheme
# One R^2 slot per batch (30 batches); three independent lists are created.
train_score, validation_score, test_score = ([0] * 30 for _ in range(3))

In [None]:
#OLS+H: Huber-loss linear model fitted by SGD on an expanding training window
from sklearn.linear_model import SGDRegressor

# NOTE(review): `scaler` was used below without ever being instantiated in this
# notebook, so a fresh kernel failed with NameError. StandardScaler is assumed
# (MinMaxScaler is imported as well) -- confirm the intended scaler.
scaler = StandardScaler()     # feature scaler, refit on every training window
y_scaler = StandardScaler()   # dedicated target scaler, so scaling y never overwrites the feature fit

bestRSqr = float("-inf")
best_data1 = None
bestOLS_H = None
best_year = 0

for i in range(0,30):
    print('Batch {}:\nTrain data:1960-{}\nValuation data:{}-{}\nTest data:{}-{}'.format(i,1978+i,1978+i,1990+i,1990+i,1991+i))
    train_end = 197801+i*100      # first month of the validation window
    test_start = 199001+i*100     # first month of the test year
    data_train=data2[(data2.yyyymm>=196001)&(data2.yyyymm<train_end)]
    data_validation=data2[(data2.yyyymm>=train_end)&(data2.yyyymm<test_start)]
    # The test year follows the validation window (1990+i, as printed above).
    # The previous 1960+i window lay inside the training sample, so the "test"
    # score was effectively in-sample.
    data_test=data2[(data2.yyyymm>=test_start)&(data2.yyyymm<test_start+100)]

    #Normalize: scalers are fitted on the training window only and reused for the other splits.
    # NOTE(review): every column except excess_ret is a predictor, including
    # permno and yyyymm -- confirm that identifiers are meant to be features.
    X_train=scaler.fit_transform(data_train.drop("excess_ret",axis=1))
    X_validation=scaler.transform(data_validation.drop("excess_ret",axis=1))
    # ravel(): SGDRegressor expects a 1-D target rather than a column vector
    y_train=y_scaler.fit_transform(data_train[["excess_ret"]]).ravel()
    y_validation=y_scaler.transform(data_validation[["excess_ret"]]).ravel()

    # random_state pins the SGD shuffling so a re-run reproduces the same scores
    OLS_H = SGDRegressor(loss='huber',alpha=1e-3,epsilon=0.05,learning_rate='optimal',random_state=0)
    OLS_H.fit(X_train, y_train)
    y_train_pred=OLS_H.predict(X_train)
    train_score[i]=r2_score(y_train,y_train_pred)
    y_validation_pred=OLS_H.predict(X_validation)
    validation_score[i]=r2_score(y_validation,y_validation_pred)

    if data_test.empty:
        # No rows for this test year: record NaN instead of failing inside the scaler.
        test_score[i]=float("nan")
    else:
        X_test=scaler.transform(data_test.drop("excess_ret",axis=1))
        y_test=y_scaler.transform(data_test[["excess_ret"]]).ravel()
        y_test_pred=OLS_H.predict(X_test)
        test_score[i]=r2_score(y_test,y_test_pred)
        del X_test,y_test
    currentRSqr = test_score[i]

    # NaN never compares greater, so batches without test data are skipped here.
    if(currentRSqr>bestRSqr):
        bestRSqr=currentRSqr
        bestOLS_H = OLS_H
        best_year = test_start

    # Release the large per-batch objects before the next window is built.
    del data_train,data_validation,data_test,X_train,y_train,X_validation,y_validation

In [None]:
# PRINT RESULT
# Wrap the per-batch test scores in a single-column DataFrame (the plotting cell
# reads its .values) and show it as plain text.
test_score = pd.DataFrame(data=test_score)
print(test_score)

In [None]:
# Performance of OLS_H: per-batch test R^2 drawn against the years 1990-2019
year = np.arange(1990, 2020)
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("OLS_H Performance")
ax.plot(year, test_score.values)
ax.set_xlabel("year")
ax.set_ylabel("score--R-square")

In [None]:
#The Importance
# Take the one-year slice that starts at best_year and prepare per-feature result buffers.
X_test0 = data2.loc[(data2.yyyymm >= best_year) & (data2.yyyymm < best_year + 100)]
print(X_test0.shape)

a = X_test0.drop(columns="excess_ret").copy()
print(a.shape)

id = a.columns   # NOTE: shadows the builtin `id`; the following cells read this name
L = a.shape[1]
# Four independent zero-filled lists, one entry per predictor column.
train_score, validation_score, test_score1, importance_OLS_H = ([0] * L for _ in range(4))
L

In [None]:
#Variable importance: drop in R^2 on the best test year when one predictor is neutralised

# Local scaler instances: the notebook-level `scaler` is never instantiated in
# the visible cells, and refitting a shared scaler here would clobber it anyway.
# NOTE(review): StandardScaler is assumed -- confirm the intended scaler.
# NOTE(review): these are fitted on the test slice itself because the scaler of
# the winning training window is not retained; ideally that one would be reused.
imp_x_scaler = StandardScaler()
imp_y_scaler = StandardScaler()

# Scale once. Zeroing a raw column and refitting the scaler produces an all-zero
# scaled column and leaves the other columns unchanged, so the same effect is
# obtained by zeroing the column in the scaled matrix.
X_test_scaled = imp_x_scaler.fit_transform(X_test0.drop("excess_ret",axis=1))
y_test_scaled = imp_y_scaler.fit_transform(X_test0[["excess_ret"]]).ravel()

# The baseline is computed under exactly the same scaling as the perturbed
# scores. `bestRSqr` from the training loop used differently fitted scalers, so
# subtracting from it mixed a scaling effect into every importance value.
baseline_r2 = r2_score(y_test_scaled, bestOLS_H.predict(X_test_scaled))

for i in range(0,L):
    X_test = X_test_scaled.copy()
    X_test[:, i] = 0   # neutralise predictor id[i]

    y_test_pred=bestOLS_H.predict(X_test)
    test_score1[i] = r2_score(y_test_scaled,y_test_pred)
    importance_OLS_H[i] = baseline_r2 - test_score1[i]

    del X_test

In [None]:
# PLOT: the 20 largest absolute importances as a horizontal bar chart
importance_OLS_H = np.abs(importance_OLS_H)
importance_OLS_H1 = (
    pd.DataFrame(importance_OLS_H, index=id, columns=['Importance'])
    .sort_values(by='Importance', ascending=True)
    .tail(20)
)
importance_OLS_H1.plot.barh(figsize=(9, 7))

In [None]:
# Variable importance BY COE: absolute coefficients of the selected OLS+H model

a = data2.columns
id = a.drop("excess_ret").copy()   # predictor names, in the column order the model was fitted on

# Read the coefficients from `bestOLS_H`, the model the preceding importance
# cells analyse. `OLS_H` is only the most recently fitted estimator; the
# three-factor cell rebinds it to a 3-coefficient model, which would make this
# cell fail on the index length if it were re-run afterwards.
importance_OLS_H = np.abs(bestOLS_H.coef_)
coefs_OLS_H = pd.DataFrame(importance_OLS_H,columns=['Importance'], index=id)
coefs_OLS_H = coefs_OLS_H.sort_values(by='Importance',ascending=True)
coefs_OLS_H= coefs_OLS_H.tail(20)   # keep the 20 largest
coefs_OLS_H.plot(kind='barh', figsize=(9, 7))

In [None]:
#The recursive performance evaluation scheme
# Reset the per-batch R^2 buffers (30 batches); three independent lists are created.
train_score, validation_score, test_score = ([0] * 30 for _ in range(3))

In [None]:
# OLS3: the same Huber/SGD model restricted to three predictors (mom12m, bm, mvel1)
from sklearn.linear_model import SGDRegressor

# NOTE(review): `scaler` was used below without ever being instantiated in this
# notebook, so a fresh kernel failed with NameError. StandardScaler is assumed
# (MinMaxScaler is imported as well) -- confirm the intended scaler.
scaler = StandardScaler()     # feature scaler, refit on every training window
y_scaler = StandardScaler()   # dedicated target scaler, so scaling y never overwrites the feature fit

feature_cols = ['mom12m','bm','mvel1']

bestRSqr = float("-inf")
best_data1 = None
bestOLS_H3 = None
best_year = 0

for i in range(0,30):
    print('Batch {}:\nTrain data:1960-{}\nValuation data:{}-{}\nTest data:{}-{}'.format(i,1978+i,1978+i,1990+i,1990+i,1991+i))
    train_end = 197801+i*100      # first month of the validation window
    test_start = 199001+i*100     # first month of the test year
    data_train=data2[(data2.yyyymm>=196001)&(data2.yyyymm<train_end)]
    data_validation=data2[(data2.yyyymm>=train_end)&(data2.yyyymm<test_start)]
    # The test year follows the validation window (1990+i, as printed above).
    # The previous 1960+i window lay inside the training sample, so the "test"
    # score was effectively in-sample.
    data_test=data2[(data2.yyyymm>=test_start)&(data2.yyyymm<test_start+100)]

    #Normalize: scalers are fitted on the training window only and reused for the other splits.
    X_train=scaler.fit_transform(data_train[feature_cols])
    X_validation=scaler.transform(data_validation[feature_cols])
    # ravel(): SGDRegressor expects a 1-D target rather than a column vector
    y_train=y_scaler.fit_transform(data_train[["excess_ret"]]).ravel()
    y_validation=y_scaler.transform(data_validation[["excess_ret"]]).ravel()

    # random_state pins the SGD shuffling so a re-run reproduces the same scores
    OLS_H = SGDRegressor(loss='huber',alpha=1e-3,epsilon=0.05,learning_rate='optimal',random_state=0)
    OLS_H.fit(X_train, y_train)
    y_train_pred=OLS_H.predict(X_train)
    train_score[i]=r2_score(y_train,y_train_pred)
    y_validation_pred=OLS_H.predict(X_validation)
    validation_score[i]=r2_score(y_validation,y_validation_pred)

    if data_test.empty:
        # No rows for this test year: record NaN instead of failing inside the scaler.
        test_score[i]=float("nan")
    else:
        X_test=scaler.transform(data_test[feature_cols])
        y_test=y_scaler.transform(data_test[["excess_ret"]]).ravel()
        y_test_pred=OLS_H.predict(X_test)
        test_score[i]=r2_score(y_test,y_test_pred)
        del X_test,y_test
    currentRSqr = test_score[i]

    # NaN never compares greater, so batches without test data are skipped here.
    if(currentRSqr>bestRSqr):
        bestRSqr=currentRSqr
        bestOLS_H3 = OLS_H
        best_year = test_start

    # Release the large per-batch objects before the next window is built.
    del data_train,data_validation,data_test,X_train,y_train,X_validation,y_validation

In [None]:
# RESULT
# Wrap the per-batch test scores in a single-column DataFrame and show it as plain text.
test_score = pd.DataFrame(data=test_score)
print(test_score)

In [None]:
# Enet best parameter: per-batch grid search over (alpha, l1_ratio), selected on validation R^2
import numpy as np
from sklearn import linear_model
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import SGDRegressor

# store r2_score
rList=[]          # test R^2 of the selected model, one entry per batch
combineList = []  # selected (alpha, l1_ratio) pair, one entry per batch

# Fixed reference splits. They do not depend on the batch index and the search
# below never reads them, so they are built once here instead of being
# re-sliced and re-copied on each of the 30 iterations.
df_train = data2[(data2.yyyymm>=195703)&(data2.yyyymm<197503)].copy()
df_validation = data2[(data2.yyyymm>=197503)&(data2.yyyymm<198703)].copy()
df_test = data2[(data2.yyyymm>=198703)&(data2.yyyymm<201603)].copy()

trainingstart = 195703
non_feature_cols = ['excess_ret','permno', 'yyyymm']   # target plus identifier columns

for i in range(30):
    # Expanding training window, then a 12-year validation window, then a 1-year test window.
    trainingend = 197503 + i*100
    validend = trainingend + 1200
    testend = validend + 100

    trainingMask = (data2.yyyymm >= trainingstart) & (data2.yyyymm< trainingend)
    trainingData = data2.loc[trainingMask]

    validationMask = (data2.yyyymm >= trainingend) & (data2.yyyymm< validend)
    validationData = data2.loc[validationMask]

    testMask = (data2.yyyymm >= validend) & (data2.yyyymm < testend)
    testData = data2.loc[testMask]

    # NOTE(review): predictors are passed to SGD unscaled here, unlike the cells
    # that fit the recursive models -- confirm this is intended.
    trainingData_y = trainingData['excess_ret']
    trainingData_x = trainingData.drop(non_feature_cols, axis=1)

    validationData_y = validationData['excess_ret']
    validationData_x = validationData.drop(non_feature_cols, axis=1)

    testData_y = testData['excess_ret']
    testData_x = testData.drop(non_feature_cols, axis =1)

    bestRSqr = float("-inf")
    bestEnet = None
    bestCombine = None

    for alpha in [10e-04,10e-01]:   # i.e. 0.001 and 1.0
        for l1 in [0,0.25,0.5,0.75,1]:
            # max_iter must be an integer: scikit-learn's parameter validation
            # rejects the float 1e6.
            ENreg_H = SGDRegressor(loss='huber',penalty='elasticnet',alpha=alpha,  l1_ratio=l1, epsilon=0.05,max_iter=1000000,shuffle=False)
            ENreg_H.fit(trainingData_x,trainingData_y)

            pred = ENreg_H.predict(validationData_x)
            currentRSqr = r2_score(validationData_y.values, pred)

            # Keep the combination with the highest validation R^2.
            if(currentRSqr>bestRSqr):
                bestRSqr=currentRSqr
                bestEnet=ENreg_H
                bestCombine=(alpha,l1)

    # Score the selected model on the held-out test year of this batch.
    resultPred = bestEnet.predict(testData_x)
    resultRSqr = r2_score(testData_y.values, resultPred)
    print('result R square for batch',i,': ', resultRSqr, 'best combine:', bestCombine)
    rList.append(resultRSqr)
    combineList.append(bestCombine)

In [None]:
#The recursive performance evaluation scheme
# Per-batch R^2 buffers for the elastic-net run (30 batches); three independent lists.
train_score1, validation_score1, test_score1 = ([0] * 30 for _ in range(3))

In [None]:
# Enet: Huber-loss elastic-net model fitted by SGD on an expanding training window
from sklearn.linear_model import SGDRegressor

# NOTE(review): `scaler` was used below without ever being instantiated in this
# notebook, so a fresh kernel failed with NameError. StandardScaler is assumed
# (MinMaxScaler is imported as well) -- confirm the intended scaler.
scaler = StandardScaler()     # feature scaler, refit on every training window
y_scaler = StandardScaler()   # dedicated target scaler, so scaling y never overwrites the feature fit

bestRSqr = float("-inf")
bestEnet = None
best_year = 0

for i in range(0,30):
    print('Batch {}:\nTrain data:1960-{}\nValuation data:{}-{}\nTest data:{}-{}'.format(i,1978+i,1978+i,1990+i,1990+i,1991+i))
    train_end = 197801+i*100      # first month of the validation window
    test_start = 199001+i*100     # first month of the test year
    data_train=data2[(data2.yyyymm>=196001)&(data2.yyyymm<train_end)]
    data_validation=data2[(data2.yyyymm>=train_end)&(data2.yyyymm<test_start)]
    # The test year follows the validation window (1990+i, as printed above).
    # The previous 1960+i window lay inside the training sample, so the "test"
    # score was effectively in-sample.
    data_test=data2[(data2.yyyymm>=test_start)&(data2.yyyymm<test_start+100)]

    #Normalize: scalers are fitted on the training window only and reused for the other splits.
    # NOTE(review): every column except excess_ret is a predictor, including
    # permno and yyyymm -- confirm that identifiers are meant to be features.
    X_train=scaler.fit_transform(data_train.drop("excess_ret",axis=1))
    X_validation=scaler.transform(data_validation.drop("excess_ret",axis=1))
    # ravel(): SGDRegressor expects a 1-D target rather than a column vector
    y_train=y_scaler.fit_transform(data_train[["excess_ret"]]).ravel()
    y_validation=y_scaler.transform(data_validation[["excess_ret"]]).ravel()

    # max_iter must be an integer: scikit-learn's parameter validation rejects the float 1e6.
    ENreg_H = SGDRegressor(loss='huber',penalty='elasticnet',alpha=1e-3,l1_ratio=0.5, epsilon=0.05,max_iter=1000000,shuffle=False)
    ENreg_H.fit(X_train,y_train)
    y_train_pred=ENreg_H.predict(X_train)
    train_score1[i]=r2_score(y_train,y_train_pred)
    y_validation_pred=ENreg_H.predict(X_validation)
    validation_score1[i]=r2_score(y_validation,y_validation_pred)

    if data_test.empty:
        # No rows for this test year: record NaN instead of failing inside the scaler.
        test_score1[i]=float("nan")
    else:
        X_test=scaler.transform(data_test.drop("excess_ret",axis=1))
        y_test=y_scaler.transform(data_test[["excess_ret"]]).ravel()
        y_test_pred=ENreg_H.predict(X_test)
        test_score1[i]=r2_score(y_test,y_test_pred)
        del X_test,y_test
    currentRSqr = test_score1[i]

    # NaN never compares greater, so batches without test data are skipped here.
    if(currentRSqr>bestRSqr):
        bestRSqr=currentRSqr
        bestEnet=ENreg_H
        best_year = test_start

    # Release the large per-batch objects before the next window is built.
    del data_train,data_validation,data_test,X_train,y_train,X_validation,y_validation

In [None]:
# results
# Wrap the elastic-net per-batch test scores in a single-column DataFrame and show it as plain text.
test_score = pd.DataFrame(data=test_score1)
print(test_score)

In [None]:
# Performance of ENET-H: per-batch test R^2 drawn against the years 1990-2019
year = np.arange(1990, 2020)
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("ENT+H Performance")
ax.plot(year, test_score1)
ax.set_xlabel("year")
ax.set_ylabel("score--R-square")

In [None]:
#The Importance
# Take the one-year slice that starts at best_year and prepare per-feature result buffers.
X_test0 = data2.loc[(data2.yyyymm >= best_year) & (data2.yyyymm < best_year + 100)]
print(X_test0.shape)

a = X_test0.drop(columns="excess_ret").copy()
print(a.shape)

id = a.columns   # NOTE: shadows the builtin `id`; the following cells read this name
L = a.shape[1]
# Four independent zero-filled lists, one entry per predictor column.
train_score, validation_score, test_score1, importance_ENreg_H = ([0] * L for _ in range(4))
L

In [None]:
#Variable importance: drop in R^2 on the best test year when one predictor is neutralised

# Local scaler instances: the notebook-level `scaler` is never instantiated in
# the visible cells, and refitting a shared scaler here would clobber it anyway.
# NOTE(review): StandardScaler is assumed -- confirm the intended scaler.
# NOTE(review): these are fitted on the test slice itself because the scaler of
# the winning training window is not retained; ideally that one would be reused.
imp_x_scaler = StandardScaler()
imp_y_scaler = StandardScaler()

# Scale once. Zeroing a raw column and refitting the scaler produces an all-zero
# scaled column and leaves the other columns unchanged, so the same effect is
# obtained by zeroing the column in the scaled matrix.
X_test_scaled = imp_x_scaler.fit_transform(X_test0.drop("excess_ret",axis=1))
y_test_scaled = imp_y_scaler.fit_transform(X_test0[["excess_ret"]]).ravel()

# The baseline is computed under exactly the same scaling as the perturbed
# scores. `bestRSqr` from the training loop used differently fitted scalers, so
# subtracting from it mixed a scaling effect into every importance value.
baseline_r2 = r2_score(y_test_scaled, bestEnet.predict(X_test_scaled))

for i in range(0,L):
    X_test = X_test_scaled.copy()
    X_test[:, i] = 0   # neutralise predictor id[i]

    y_test_pred=bestEnet.predict(X_test)
    test_score1[i] = r2_score(y_test_scaled,y_test_pred)
    importance_ENreg_H[i] = baseline_r2 - test_score1[i]

    del X_test

In [None]:
#plot: the 20 largest absolute importances as a horizontal bar chart
importance_ENreg_H = np.abs(importance_ENreg_H)
importance_ENreg_H1 = (
    pd.DataFrame(importance_ENreg_H, index=id, columns=['Importance'])
    .sort_values(by='Importance', ascending=True)
    .tail(20)
)
importance_ENreg_H1.plot.barh(figsize=(9, 7))