In [54]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#### Read Data

In [55]:
# Tickers to process. The per-stock blocks built below are stacked in this order.
symbols = ['TSLA', 'AAPL', 'AMZN', 'GOOG','NFLX', 'FB']

# Folder holding the per-stock CSV files.
# NOTE(review): machine-specific absolute path -- adjust it here when running elsewhere.
EOD_DIR = 'C:/Users/KingO/JupyterProject/stock-market-prediction/data/eod'

# Rows [0, N_TRAIN) of every stock form its training split; the remainder is its test split.
N_TRAIN = 400

# Columns of the features file that are not model inputs.
NON_FEATURE_COLS = ['Adj Close', 'Unnamed: 0', 'Date']

train_df_list = []
test_df_list = []

for symbol in symbols:
    # read in all features for this stock and drop the non-feature columns
    df = pd.read_csv('{}/{}_features.csv'.format(EOD_DIR, symbol))
    df = df.drop(NON_FEATURE_COLS, axis=1)

    # Fail with a readable message instead of a scaler error on an empty test split.
    if len(df) <= N_TRAIN:
        raise ValueError(
            '{}: need more than {} rows to build a test split, got {}'.format(symbol, N_TRAIN, len(df))
        )

    # standardize: the scaler is fitted on the training rows only and then
    # applied unchanged to the test rows, so no test statistics leak into training
    sc = StandardScaler()
    train_scaled = sc.fit_transform(df.iloc[:N_TRAIN])
    test_scaled = sc.transform(df.iloc[N_TRAIN:])

    # Wrapping the arrays in fresh DataFrames replaces the original column
    # names with integer positions (0, 1, 2, ...).
    train_df_list.append(pd.DataFrame(train_scaled))
    test_df_list.append(pd.DataFrame(test_scaled))

# concat values for all stocks (stock blocks appear in `symbols` order)
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

In [56]:
# Inspect the stacked training matrix: every stock's rows were standardized
# with a scaler fitted on that stock's own training rows.
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,-1.975349,-1.972181,-1.958970,-1.954464,0.673352,-2.190661,-2.441098,-0.400401,-0.563059,-0.011092,...,-0.377249,-0.467095,0.186811,-2.186840,-2.268832,-0.439522,-1.908294,0.751873,0.760454,-1.954464
1,-1.933996,-1.919503,-1.916159,-1.901530,1.256111,-2.024816,-2.334961,0.562690,-0.187977,0.026902,...,-0.211495,-0.420265,0.577741,-1.842577,-2.232505,0.226749,-1.894183,1.718167,1.680302,-1.901530
2,-1.903180,-1.911552,-1.901657,-1.915405,1.003937,-2.093718,-2.431337,-0.127531,-0.295718,-0.006133,...,-0.126017,-0.364469,0.684836,-1.628673,-2.154438,0.537454,-1.889509,-0.590105,-0.576692,-1.915405
3,-1.908705,-1.920375,-1.893848,-1.914231,-0.026875,-2.156176,-2.374860,0.069444,-0.301450,-0.015650,...,-0.063755,-0.306479,0.711043,-1.779503,-2.126540,0.164007,-1.884228,-0.098791,-0.078702,-1.914231
4,-1.907227,-1.922178,-1.912435,-1.929817,0.145328,-2.226232,-2.438002,-0.276057,-0.371192,-0.027519,...,-0.063347,-0.259999,0.573553,-1.832274,-2.116311,0.032479,-1.882934,-0.654602,-0.642819,-1.929817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0.676194,0.813558,0.702094,0.858871,-0.426853,-1.537343,-0.105748,-0.705240,-0.079168,1.376505,...,-0.478143,-0.731403,0.558799,0.137845,0.114279,0.086103,0.882628,1.214749,1.211081,0.858871
2396,0.852921,0.777869,0.809820,0.769343,-0.358189,-1.655931,-0.243519,-0.800680,-0.261129,0.242162,...,-0.491176,-0.694502,0.425091,-0.090821,0.069893,-0.262579,0.882567,-0.594046,-0.587814,0.769343
2397,0.764438,0.829607,0.812208,0.883700,-0.652529,-1.439048,-0.128568,-0.850629,-0.064393,0.297565,...,-0.426949,-0.651061,0.493644,-0.400117,-0.044152,-0.657636,0.884391,0.626612,0.633594,0.883700
2398,0.973689,1.097879,1.042710,1.144885,-0.248971,-1.162557,0.017670,-0.384906,0.450990,2.907313,...,-0.208279,-0.568915,0.899346,-0.471357,-0.153477,-0.630869,0.904562,1.487563,1.476594,1.144885


In [57]:
# Rotate the standardized features onto their principal axes.
# The number of components is taken from the data (one per input column)
# instead of a literal, so the cell keeps working if the feature set changes.
# Nothing is discarded here: this is a change of basis, not a reduction.
# NOTE(review): in the recorded test output the last two components are on the
# order of 1e-15, i.e. numerically zero -- the inputs look rank-deficient, so
# fewer components would carry the same information.
pca = PCA(n_components=train_df.shape[1])

# The projection is learned from the training rows only and re-used for the test rows.
train_df_pca = pd.DataFrame(pca.fit_transform(train_df))
test_df_pca = pd.DataFrame(pca.transform(test_df))
print(pca.n_components_)

91


In [58]:
# Inspect the test rows after projection onto the principal components fitted on the training data.
test_df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,-8.394511,3.788347,-4.709784,2.118773,-3.016938,-1.664326,0.712490,-1.153952,0.483889,-0.387339,...,0.017575,0.009237,0.007303,0.002182,0.011828,0.006044,0.003450,0.000649,-2.188093e-15,-2.607557e-17
1,-8.358329,3.770295,-4.486423,1.356248,-3.044092,-1.889418,0.572163,-1.229672,0.330572,-0.280761,...,0.001893,0.015537,-0.005409,0.001748,0.012095,0.006026,0.008193,0.000860,-2.075224e-16,5.894501e-16
2,-8.592697,3.272314,-4.018176,0.553891,-2.680083,-2.289498,-0.395035,-0.740836,0.241300,-0.433873,...,-0.006909,0.013690,-0.004071,0.003023,0.012800,0.006447,0.008115,0.000130,-7.269900e-16,5.742309e-16
3,-7.023689,8.928867,-5.906660,4.709379,1.357341,0.600026,-1.854043,0.043921,0.275941,-2.453596,...,0.021124,0.005301,-0.001563,0.007645,0.018080,0.007560,0.005336,-0.000179,4.420639e-15,1.367934e-16
4,-8.031387,7.082808,-3.057256,1.246194,-4.022364,-0.414199,-0.443308,-0.925327,0.849820,-1.092529,...,0.009224,-0.004559,0.009406,0.008249,0.017288,0.009012,0.005765,0.000290,-3.412059e-15,1.063818e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,7.485932,-16.448687,2.404343,3.573531,0.365219,0.632121,0.403639,2.029047,-0.667084,-1.284011,...,-0.010080,-0.022410,-0.040318,-0.030964,0.026095,0.003897,-0.004653,0.000377,-9.613966e-15,4.728150e-16
506,7.144278,-18.341475,2.884121,3.264465,-1.827469,1.339940,2.556923,2.391720,-2.026323,-1.444798,...,-0.008263,-0.026053,-0.041456,-0.027462,0.028192,0.003485,-0.004656,0.000378,-8.565974e-15,2.232865e-15
507,7.731665,-19.580497,3.556512,4.975118,-4.109274,3.109665,2.524333,1.480042,-2.276264,-0.555287,...,-0.001230,-0.027654,-0.050814,-0.031896,0.029021,0.003901,-0.004768,0.000453,-1.011921e-14,1.889504e-15
508,12.012551,-9.158129,-2.941580,17.604955,7.266777,5.609825,-3.176060,3.905656,4.727919,-1.285041,...,-0.010928,0.003287,-0.086324,-0.017650,0.028043,0.003016,-0.003084,-0.000577,-1.587378e-14,3.375231e-16


In [59]:
# Tag every row with the integer id of the stock it came from (0 for the first
# symbol, 1 for the second, ...). This is a single ordinal column, not a
# one-hot encoding.
# The ids are repeated according to the actual length of each per-stock block
# that was concatenated, so no row counts are hard-coded and blocks of unequal
# length are labelled correctly.
encode = np.repeat(np.arange(len(train_df_list)), [len(block) for block in train_df_list])
train_df_pca['encode'] = encode

encode = np.repeat(np.arange(len(test_df_list)), [len(block) for block in test_df_list])
test_df_pca['encode'] = encode

In [60]:
# Number of consecutive rows fed to the model per sample; the target is the row right after them.
WINDOW = 5

# Price columns that are standardized and used as prediction targets.
PRICE_COLS = ['Open','High','Low','Close']

# Folder with the raw per-stock EOD files.
# NOTE(review): machine-specific absolute path -- adjust it here when running elsewhere.
RAW_EOD_DIR = 'C:/Users/KingO/JupyterProject/stock-market-prediction/data/eod'


def _make_windows(features, targets, window):
    """Pair each run of `window` feature rows with the target row that follows it.

    Parameters
    ----------
    features : pandas.DataFrame
        Rows of one stock, in time order.
    targets : numpy.ndarray
        2-D array whose row k is the target belonging to feature row k.
    window : int
        Number of consecutive feature rows per sample.

    Returns
    -------
    (list, list)
        Feature windows of shape (window, n_columns) and the matching target rows.
        Both lists are empty when `features` has `window` rows or fewer.
    """
    windows, labels = [], []
    for start in range(len(features) - window):
        windows.append(features.iloc[start:start + window].to_numpy())
        labels.append(targets[start + window])
    return windows, labels


# Must list the stocks in the same order in which their rows were stacked,
# because position `idx` is matched against the 'encode' column below.
symbols = ['TSLA', 'AAPL', 'AMZN', 'GOOG','NFLX', 'FB']
X_train, y_train, X_test, y_test = [],[],[],[]
sc_params = []
for idx,symbol in enumerate(symbols):
    # train/test set for each stock, selected through the stock id column
    # rather than through hard-coded row offsets
    train = train_df_pca[train_df_pca['encode'] == idx]
    test = test_df_pca[test_df_pca['encode'] == idx]
    n_train = len(train)

    # origin_df contains original EOD features
    # NOTE(review): assumes the rows of {symbol}.csv line up one-to-one with the
    # rows of {symbol}_features.csv -- confirm no rows were dropped when the
    # features were generated.
    origin_df = pd.read_csv('{}/{}.csv'.format(RAW_EOD_DIR, symbol))
    origin_prices = origin_df[PRICE_COLS]

    # standardize origin_df: scaler fitted on the training rows only
    sc = StandardScaler()
    origin_train_df = pd.DataFrame(sc.fit_transform(origin_prices.iloc[:n_train]), columns=PRICE_COLS)
    origin_test_df = pd.DataFrame(sc.transform(origin_prices.iloc[n_train:]), columns=PRICE_COLS)

    # Every feature row needs a matching target row.
    if len(origin_train_df) < len(train) or len(origin_test_df) < len(test):
        raise ValueError('{}: raw EOD file has fewer rows than the feature matrix'.format(symbol))

    # Kept so predictions can be mapped back to prices later. Note that the
    # second entry is the variance, not the standard deviation.
    sc_params.append([sc.mean_,sc.var_])

    # create time_series training set
    stock_windows, stock_targets = _make_windows(train, origin_train_df.to_numpy(), WINDOW)
    X_train.extend(stock_windows)
    y_train.extend(stock_targets)

    # create time_series test set
    stock_windows, stock_targets = _make_windows(test, origin_test_df.to_numpy(), WINDOW)
    X_test.extend(stock_windows)
    y_test.extend(stock_targets)

In [61]:
# Training inputs stacked as (windows, rows per window, columns);
# the columns are the PCA components plus the 'encode' stock id.
np.array(X_train).shape

(2370, 5, 92)

In [62]:
# Training targets: one standardized (Open, High, Low, Close) row per window.
np.array(y_train).shape

(2370, 4)

In [63]:
# Test inputs, same (windows, rows per window, columns) layout as X_train.
np.array(X_test).shape

(480, 5, 92)

In [64]:
# Test targets: one standardized (Open, High, Low, Close) row per test window.
np.array(y_test).shape

(480, 4)

In [65]:
# Scaler statistics laid out as (stock, [mean, variance], price column).
np.array(sc_params).shape

(6, 2, 4)

In [66]:
# Destination folder for the prepared arrays.
# NOTE(review): machine-specific absolute path -- adjust it here when running elsewhere.
OUT_DIR = 'C:/Users/KingO/JupyterProject/stock-market-prediction/data/pca_features'

# Create the folder on first use so np.save does not fail with FileNotFoundError.
os.makedirs(OUT_DIR, exist_ok=True)

# File stem -> object to persist; each one is written to <OUT_DIR>/<stem>.npy.
arrays_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'sc_params': sc_params,
}

for stem, values in arrays_to_save.items():
    # The Python lists are converted to ndarrays explicitly before writing.
    np.save('{}/{}.npy'.format(OUT_DIR, stem), np.asarray(values))