### Import Package

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from stockstats import StockDataFrame

from sklearn.model_selection import train_test_split
from sklearn import linear_model

### Set the data source path

In [2]:
# Set the data source path
interval = "daily"
region = "us"
ex_product = "nasdaq stocks"
section = "1"
stock = "aapl"
data_path = "test_data/"+interval+"/"+region+"/"+ex_product+"/"+section+"/"+stock+"."+region+".txt"

column_to_use = ["OPEN","LOW","HIGH","CLOSE"]

### Load the stock data

In [3]:
# Load the data
ori_data = pd.read_csv(data_path, sep=",")

# Rename the column names
ori_data.columns = [colname[1:-1] for colname in ori_data.columns]

# Drop the unnecessary
ori_data.index = ori_data["DATE"]
ori_data = ori_data.drop(columns=['DATE','PER','TIME', 'TICKER', 'OPENINT'])
ori_data.columns = ["open","high","low","close","volume"]

In [4]:
x = StockDataFrame(ori_data)
data = x[['open','high','low','close','volume',
          'boll', 'boll_ub', 'boll_lb',
          'macd', 'macdh', 'macds',
          'rsi_11', 'rsi_14', 'rsi_21']]

### Split the train and test data

In [5]:
def custom_split(data,start,end):
    train = (data.index >= start) & (data.index <= end)
    train_X = data[train]
    
    return train_X

In [6]:
train_X = custom_split(data,start = 20130101,end = 20171031)
valid_X = custom_split(data,start = 20171101,end = 20181231)
test_X = custom_split(data,start = 20190101,end = 20201231)

### Label the target result (opening price on 11th day)

In [7]:
# Assume we use 10 days price data to predict closing price of the 11th day
num_day_to_predict = 10

In [8]:
def produce_result_target_price(X,num_day,result_col_name = "result_price"):
    y = pd.DataFrame(np.nan, index=X.index, columns=[result_col_name])
    for i in range(len(X)-num_day):
        y.iloc[i+10,0] = X.iloc[i+num_day,0]
    return y

In [9]:
train_y = produce_result_target_price(train_X,num_day_to_predict)
valid_y = produce_result_target_price(valid_X,num_day_to_predict)
test_y = produce_result_target_price(test_X,num_day_to_predict)

### Transform the 10-day data into one vector

In [10]:
def transform_x_data_to_one_vector(X,num_day = 10):
    col_name = []
    for i in range(num_day):
        for j in ["OPEN","LOW","HIGH","CLOSE"]:
            col_name.append(j+"-"+str(i+1))
    new_X = pd.DataFrame(np.nan, index=X.index, columns=col_name)
    
    result_range = range(X.index[num_day-1],X.index[-1]+1)
    for i in result_range:
        for col in col_name:
            split_list = col.split("-")
            new_X.loc[i,col] = X.loc[i-num_day+int(split_list[1]),split_list[0]]
    
    return new_X

In [11]:
train_X = transform_x_data_to_one_vector(train_X,num_day_to_predict)
test_X = transform_x_data_to_one_vector(test_X,num_day_to_predict)

KeyError: 20130106

### Drop out rows with NaN

In [None]:
def drop_nan_row(X,y,num_day = 10):
    drop_list = [X.index[0] + i for i in range(num_day-1)]
    drop_list.append(X.index[-1])
    return (X.drop(drop_list),y.drop(drop_list))

In [None]:
train_X,train_y = drop_nan_row(train_X,train_y,num_day_to_predict)
test_X,test_y = drop_nan_row(test_X,test_y,num_day_to_predict)

old_train_X,old_train_y = train_X,train_y
old_test_X,old_test_y = test_X,test_y

### Normalize data row by row

In [None]:
def normalize_data_by_row(X,y):
    norm_X = X.sub(X.mean(axis=1), axis=0).div(X.std(axis=1), axis=0)
    norm_y = y.sub(X.mean(axis=1), axis=0).div(X.std(axis=1), axis=0)
    return (norm_X,norm_y)

In [None]:
#train_X,train_y = normalize_data_by_row(train_X,train_y)
#test_X,test_y = normalize_data_by_row(test_X,test_y)

### Model Training and Fitting

In [None]:
# Use Linear Regression to fit the data
weight = [i/10 for i in range(len(train_X))]

lm_reg = linear_model.LinearRegression()
lm_reg.fit(train_X, train_y, weight)

In [None]:
print(lm_reg.coef_)

### Model Testing

In [None]:
pred_y = lm_reg.predict(test_X)
pred_y[-10:]

In [None]:
test_y.tail(10)

### Plot the graph

In [None]:
plt.plot(test_y.index, test_y["result_price"], label = "Actual", color = 'Black')
plt.plot(test_y.index, pred_y, label = "Predicted", color = 'Orange')
plt.xlabel("timestamp")
plt.ylabel("Price (USD)")
plt.title("Prediction of "+stock.upper()+" using SVR")

plt.legend()
plt.savefig("plot/LinearModel/"+stock.upper()+"-day("+str(num_day_to_predict)+").jpg",
            dpi=600)
plt.show()




