In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [74]:
symbols = ['TSLA', 'AAPL', 'AMZN', 'GOOG','NFLX', 'FB']
# Number of leading rows of every stock used for training; the rest is the test split.
TRAIN_SIZE = 400
# Columns of the feature CSVs that are not fed to the model.
NON_FEATURE_COLUMNS = ['Adj Close','Unnamed: 0','Date']

# One standardized DataFrame per stock, in the same order as `symbols`.
train_df_list = []
test_df_list = []

for symbol in symbols:
    # read in all features for each stock
    df = pd.read_csv('./data/eod/{}_features.csv'.format(symbol))

    # drop non-feature columns
    df = df.drop(NON_FEATURE_COLUMNS, axis=1)

    # Fail early with a readable message instead of letting the scaler choke
    # on an empty test split further down.
    if len(df) <= TRAIN_SIZE:
        raise ValueError(
            '{} has {} feature rows; more than {} are needed for a train/test split'
            .format(symbol, len(df), TRAIN_SIZE)
        )

    # extract train/test set by row position
    # (assumes the rows are already in chronological order -- TODO confirm)
    train_part = df.iloc[:TRAIN_SIZE]
    test_part = df.iloc[TRAIN_SIZE:]

    # standardize per stock: the scaler is fitted on the training rows only and
    # the same mean/scale is then applied to the test rows
    sc = StandardScaler()
    train_scaled = sc.fit_transform(train_part)
    test_scaled = sc.transform(test_part)

    # append to the list (the scaler returns plain arrays, so the resulting
    # frames carry positional integer column labels)
    train_df_list.append(pd.DataFrame(train_scaled))
    test_df_list.append(pd.DataFrame(test_scaled))

# concat values for all stocks, stacked in `symbols` order
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

In [75]:
# Display the standardized training features concatenated across all symbols.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,-1.975349,-1.972181,-1.958970,-1.954464,0.673352,-2.190661,-2.441098,-0.400401,-0.563059,-0.011092,...,-0.377249,-0.467095,0.186811,-2.186840,-2.268832,-0.439522,-1.908294,0.751873,0.760454,-1.954464
1,-1.933996,-1.919503,-1.916159,-1.901530,1.256111,-2.024816,-2.334961,0.562690,-0.187977,0.026902,...,-0.211495,-0.420265,0.577741,-1.842577,-2.232505,0.226749,-1.894183,1.718167,1.680302,-1.901530
2,-1.903180,-1.911552,-1.901657,-1.915405,1.003937,-2.093718,-2.431337,-0.127531,-0.295718,-0.006133,...,-0.126017,-0.364469,0.684836,-1.628673,-2.154438,0.537454,-1.889509,-0.590105,-0.576692,-1.915405
3,-1.908705,-1.920375,-1.893848,-1.914231,-0.026875,-2.156176,-2.374860,0.069444,-0.301450,-0.015650,...,-0.063755,-0.306479,0.711043,-1.779503,-2.126540,0.164007,-1.884228,-0.098791,-0.078702,-1.914231
4,-1.907227,-1.922178,-1.912435,-1.929817,0.145328,-2.226232,-2.438002,-0.276057,-0.371192,-0.027519,...,-0.063347,-0.259999,0.573553,-1.832274,-2.116311,0.032479,-1.882934,-0.654602,-0.642819,-1.929817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0.676194,0.813558,0.702094,0.858871,-0.426853,-1.537343,-0.105748,-0.705240,-0.079168,1.376505,...,-0.478143,-0.731403,0.558799,0.137845,0.114279,0.086103,0.882628,1.214749,1.211081,0.858871
2396,0.852921,0.777869,0.809820,0.769343,-0.358189,-1.655931,-0.243519,-0.800680,-0.261129,0.242162,...,-0.491176,-0.694502,0.425091,-0.090821,0.069893,-0.262579,0.882567,-0.594046,-0.587814,0.769343
2397,0.764438,0.829607,0.812208,0.883700,-0.652529,-1.439048,-0.128568,-0.850629,-0.064393,0.297565,...,-0.426949,-0.651061,0.493644,-0.400117,-0.044152,-0.657636,0.884391,0.626612,0.633594,0.883700
2398,0.973689,1.097879,1.042710,1.144885,-0.248971,-1.162557,0.017670,-0.384906,0.450990,2.907313,...,-0.208279,-0.568915,0.899346,-0.471357,-0.153477,-0.630869,0.904562,1.487563,1.476594,1.144885


In [76]:
# A fractional n_components tells PCA to keep as many components as are needed
# to reach that share of explained variance (0.99 here).
pca = PCA(n_components=0.99)

# Fit the projection on the training features only, then reuse it for the test features.
train_projected = pca.fit_transform(train_df)
test_projected = pca.transform(test_df)

train_df_pca = pd.DataFrame(train_projected)
test_df_pca = pd.DataFrame(test_projected)

# Report how many components were retained.
print(pca.n_components_)

39


In [97]:
# Display the training features after projection onto the retained principal components.
train_df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,9.945235,-4.383702,-5.500023,-4.394858,1.295633,1.258256,-1.895179,-0.713912,-0.289074,-1.661544,...,-0.984436,0.789325,-0.114320,-0.483182,-0.849170,0.665671,-0.457233,0.681416,0.089157,0.813714
1,11.211483,-1.182106,-6.549181,-2.318488,2.270100,3.114686,-0.872561,-1.543707,-0.362016,-2.441519,...,-0.117916,0.033462,0.122299,-0.028795,-1.078183,0.690630,0.021099,0.482171,0.606678,0.957526
2,10.731581,-1.988550,-5.518597,-3.658372,-0.078595,2.869391,0.004242,-1.582396,-0.225947,-2.015567,...,0.211276,-0.204771,-0.214872,0.342892,-1.133936,0.804152,-0.279302,0.534929,0.025178,0.880931
3,10.602542,-1.893679,-5.128392,-4.433752,-0.331542,1.285424,-1.024170,-1.010405,-0.937394,-1.295700,...,0.182800,-0.197528,-0.224221,0.327034,-0.978247,0.573367,0.204623,0.194453,-0.113258,1.397027
4,9.994837,-3.295780,-3.859830,-5.085320,-0.880126,1.029396,-1.916436,-0.553850,-0.583626,-1.334696,...,-0.258896,0.023889,-0.568271,0.625754,-0.786211,0.617747,0.308127,0.220217,-0.141694,1.033200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,-3.455961,2.084894,-3.286213,2.150153,-1.193742,-1.263942,-2.385558,1.272925,0.364025,-0.518051,...,-0.405985,-1.051932,0.368994,0.434833,0.062539,0.124431,-0.113862,0.156491,0.069380,0.384773
2396,-3.720095,1.640322,-2.499124,0.588297,-2.412435,-1.462977,-1.867978,0.779998,-0.171901,-0.275767,...,-0.396860,-1.493849,0.565600,0.397016,-0.268918,0.581452,-0.055495,0.440253,0.209164,0.268575
2397,-3.451786,2.704194,-3.016126,0.667212,-1.265956,-1.700103,-2.457514,0.897131,-0.658437,-0.567966,...,-0.378603,-1.353396,0.613780,0.329313,0.036846,0.560642,0.014637,0.306956,0.268348,0.677559
2398,-2.517359,5.912208,-3.625448,2.006566,0.279470,-0.554503,-2.597231,0.561925,-0.932864,-1.713621,...,-1.248482,-0.962219,0.449956,0.085145,-0.275809,0.101808,-0.249950,0.462434,-0.085584,0.421412


In [84]:
symbols = ['TSLA', 'AAPL', 'AMZN', 'GOOG','NFLX', 'FB']
# Number of consecutive feature rows that make up one input sample.
WINDOW = 5
# Price columns of the raw per-stock CSV that are predicted for the row after each window.
TARGET_COLUMNS = ['Open','High','Low','Close']

X_train, y_train, X_test, y_test = [],[],[],[]

# Running row offsets into the concatenated PCA frames. The per-stock sizes are
# taken from the per-stock frames built during standardization rather than
# hard-coded, so a stock with a different number of rows cannot shift the
# slices of the stocks that follow it.
train_offset = 0
test_offset = 0

for idx, symbol in enumerate(symbols):
    n_train = len(train_df_list[idx])
    n_test = len(test_df_list[idx])

    # train/test set for each stock
    train = train_df_pca.iloc[train_offset:train_offset + n_train]
    test = test_df_pca.iloc[test_offset:test_offset + n_test]
    train_offset += n_train
    test_offset += n_test

    origin_df = pd.read_csv('./data/eod/{}.csv'.format(symbol))
    # NOTE(review): assumes row k of '{symbol}_features.csv' describes the same
    # day as row k of '{symbol}.csv' -- confirm against the feature-building step.
    if len(origin_df) < n_train + n_test:
        raise ValueError(
            '{}.csv has {} rows but the feature split covers {} rows'
            .format(symbol, len(origin_df), n_train + n_test)
        )

    # Convert once per stock instead of doing a pandas lookup per sample.
    train_values = train.to_numpy()
    test_values = test.to_numpy()
    prices = origin_df[TARGET_COLUMNS].to_numpy()

    # create time_series training set: rows i..i+WINDOW-1 -> prices of row i+WINDOW
    for i in range(len(train_values) - WINDOW):
        X_train.append(train_values[i:i + WINDOW])
        y_train.append(prices[i + WINDOW])

    # create time_series test set; test rows start right after the n_train
    # training rows in the raw price file
    for i in range(len(test_values) - WINDOW):
        X_test.append(test_values[i:i + WINDOW])
        y_test.append(prices[n_train + i + WINDOW])

In [99]:
# Shape of the stacked training windows: (samples, rows per window, PCA components).
np.shape(X_train)

(2370, 5, 39)

In [101]:
# Shape of the training targets: one Open/High/Low/Close row per training window.
np.shape(y_train)

(2370, 4)

In [102]:
# Shape of the stacked test windows: (samples, rows per window, PCA components).
np.shape(X_test)

(480, 5, 39)

In [103]:
# Shape of the test targets: one Open/High/Low/Close row per test window.
np.shape(y_test)

(480, 4)

In [108]:
# Persist the windowed inputs and their targets as .npy files under ./data.
# np.save converts each Python list to an array before writing it.
for out_path, samples in (
    ('./data/X_train.npy', X_train),
    ('./data/y_train.npy', y_train),
    ('./data/X_test.npy', X_test),
    ('./data/y_test.npy', y_test),
):
    np.save(out_path, samples)