In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [7]:
# Load the AAPL daily price history from CSV.
df = pd.read_csv('aapl.us.csv')

# Keep only rows dated 2005-01-01 or later. No date parsing is requested from
# read_csv, so 'Date' is compared against the string literal directly.
on_or_after_2005 = df['Date'] >= '2005-01-01'
df = df.loc[on_or_after_2005]
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
5125,2005-01-03,4.1529,4.1685,4.0083,4.0519,193279926,0
5126,2005-01-04,4.0825,4.1915,4.0314,4.0940,306165093,0
5127,2005-01-05,4.0940,4.1775,4.0940,4.1300,189804833,0
5128,2005-01-06,4.1491,4.1556,4.0545,4.1324,196766476,0
5129,2005-01-07,4.1646,4.4578,4.1469,4.4336,623374174,0
...,...,...,...,...,...,...,...
8359,2017-11-06,171.7500,174.3600,171.1000,173.6300,34901241,0
8360,2017-11-07,173.2900,174.5100,173.2900,174.1800,24424877,0
8361,2017-11-08,174.0300,175.6100,173.7100,175.6100,24451166,0
8362,2017-11-09,174.4800,175.4600,172.5200,175.2500,29533086,0


In [8]:
# Daily window: five consecutive rows (close price and volume) form the
# features X; the close price of the row that follows them is the target y.

df_day_5_1 = pd.DataFrame()

# For the window starting at row i, row i is labelled "t-5", row i+1 "t-4",
# ... and row i+4 "t-1". A label "t-k" is therefore the source column shifted
# up by (5 - k) rows. Volume columns are added first, then close columns.
for feature in ('Volume', 'Close'):
    for steps_ahead in range(5):
        label = '{} (t-{})'.format(feature, 5 - steps_ahead)
        source = df[feature]
        # The first column of each group is the unshifted series itself.
        df_day_5_1[label] = source if steps_ahead == 0 else source.shift(-steps_ahead)

# y-variable (close price): the row immediately after the five-row window.
df_day_5_1['Close (t)'] = df['Close'].shift(-5)

# The trailing rows have no complete window/target after shifting; drop them.
df_day_5_1 = df_day_5_1.dropna()

# y-variable (movement): 1 when the target close is strictly above the last
# close in the window, otherwise 0.
closed_higher = df_day_5_1['Close (t)'] > df_day_5_1['Close (t-1)']
df_day_5_1['movement'] = np.where(closed_higher, 1, 0)

df_day_5_1.to_csv('df_day_5_1.csv')

In [13]:
# Split the data into train and test sets

def split(filename, n, train_frac=0.7):
    """Split a lagged-feature CSV into train and test frames by row position.

    The file is read with ``pd.read_csv`` and cut at a single row: the first
    ``round(len(df) * train_frac)`` rows become the training frame and the
    remaining rows become the test frame. Rows are not shuffled, so the
    order found in the file is preserved in both outputs.

    Parameters
    ----------
    filename : str or path-like
        CSV file holding the feature columns plus ``'Close (t)'`` and
        ``'movement'``.
    n : int
        Feature-set selector. ``1`` selects the single column ``'t-1'``;
        any other value selects the five ``Volume`` and five ``Close`` lag
        columns.
    train_frac : float, default 0.7
        Fraction of rows (from the top of the file) placed in the training
        frame. Must lie in ``[0, 1]``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. Each frame contains the selected feature
        columns followed by ``'Close (t)'`` and ``'movement'``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    KeyError
        If a required column is missing from the file.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError('train_frac must be between 0 and 1, got {!r}'.format(train_frac))

    df = pd.read_csv(filename)

    if n == 1:
        t_n = ['t-1']
    else:
        t_n = ['Volume (t-5)', 'Volume (t-4)', 'Volume (t-3)', 'Volume (t-2)', 'Volume (t-1)',
               'Close (t-5)', 'Close (t-4)', 'Close (t-3)', 'Close (t-2)', 'Close (t-1)']

    # Features and both targets are cut at the same row, so a single column
    # selection followed by one positional cut yields the same frames as
    # slicing each piece separately and concatenating them again.
    selected = df[t_n + ['Close (t)', 'movement']]
    train_size = round(len(selected) * train_frac)

    # .copy() hands back independent frames rather than slices of `selected`.
    train_df = selected.iloc[:train_size].copy()
    test_df = selected.iloc[train_size:].copy()
    return train_df, test_df

def split2(filename, n, test_size=0.3, random_state=42):
    """Split a lagged-feature CSV into train and test frames via ``train_test_split``.

    The file is read with ``pd.read_csv``; the selected feature columns and
    the two targets (``'Close (t)'`` and ``'movement'``) are passed to
    scikit-learn's ``train_test_split`` in a single call, so every row lands
    on the same side of the split for the features and both targets.

    NOTE: ``train_test_split`` shuffles rows by default, so the train and
    test frames are not contiguous blocks of the file.

    Parameters
    ----------
    filename : str or path-like
        CSV file holding the feature columns plus ``'Close (t)'`` and
        ``'movement'``.
    n : int
        Feature-set selector. ``1`` selects the single column ``'t-1'``;
        any other value selects the five ``Volume`` and five ``Close`` lag
        columns.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. Each frame contains the selected feature
        columns followed by ``'Close (t)'`` and ``'movement'``.

    Raises
    ------
    KeyError
        If a required column is missing from the file.
    """
    df = pd.read_csv(filename)

    if n == 1:
        t_n = ['t-1']
    else:
        t_n = ['Volume (t-5)', 'Volume (t-4)', 'Volume (t-3)', 'Volume (t-2)', 'Volume (t-1)',
               'Close (t-5)', 'Close (t-4)', 'Close (t-3)', 'Close (t-2)', 'Close (t-1)']
    X = df[t_n]
    Y_t = df["Close (t)"]
    Y_mov = df["movement"]

    # One call returns a train/test pair per input, in input order. The
    # previous two-call version bound the price targets to `y_train` /
    # `y_test` but then read `y_train_t` / `y_test_t`, raising NameError.
    (X_train, X_test,
     y_train_t, y_test_t,
     y_train_movement, y_test_movement) = train_test_split(
        X, Y_t, Y_mov, test_size=test_size, random_state=random_state)

    train_df = pd.concat([X_train, y_train_t, y_train_movement], axis=1)
    test_df = pd.concat([X_test, y_test_t, y_test_movement], axis=1)
    return train_df, test_df

In [10]:
# Split the saved lagged-feature file with `split` and write each part to
# its own CSV, leaving the DataFrame index out of the files.
train, test = split('df_day_5_1.csv', 5)
for frame, destination in ((train, 'train_df_day_5_1.csv'), (test, 'test_df_day_5_1.csv')):
    frame.to_csv(destination, index = False)

In [14]:
# Write a second ("v2") pair of train/test CSVs, leaving the DataFrame index
# out of the files.
# NOTE(review): this cell calls `split` with the same arguments as the cell
# that writes train_df_day_5_1.csv / test_df_day_5_1.csv, so the v2_ files
# contain the same rows as those files. The train2/test2 and v2_ names
# suggest `split2` was intended here -- confirm before switching the call.
train2, test2 = split('df_day_5_1.csv', 5)
for frame, destination in ((train2, 'v2_train_df_day_5_1.csv'), (test2, 'v2_test_df_day_5_1.csv')):
    frame.to_csv(destination, index = False)