# Full Process

In [1]:
import pandas as pd

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

----

In [3]:
df_AAPL = get_data('files/datasets/AAPL.csv', 2016)
df_MSFT = get_data('files/datasets/MSFT.csv', 2016)
df_AMZN = get_data('files/datasets/AMZN.csv', 2016)
df_TSLA = get_data('files/datasets/TSLA.csv', 2016)
df_GOOGL = get_data('files/datasets/GOOGL.csv', 2016)
df_BRKB = get_data('files/datasets/BRK-B.csv', 2016)
df_UNH = get_data('files/datasets/UNH.csv', 2016)
df_GOOG = get_data('files/datasets/GOOG.csv', 2016)
df_XOM = get_data('files/datasets/XOM.csv', 2016)
df_JNJ = get_data('files/datasets/JNJ.csv', 2016)

AAPL - Apple
<br> MSFT - Microsoft 
<br> AMZN - Amazon
<br> TSLA - Tesla
<br> GOOGL - Alphabet (Class A)
<br> BRK-B - Berkshire Hathaway (Class B)
<br> UNH - UnitedHealth Group
<br> GOOG - Alphabet (Class C)
<br> XOM - Exxon Mobil
<br> JNJ - Johnson and Johnson

### Process 1:

In [3]:
def get_data(path, year):
    import pandas as pd
    df = pd.read_csv(path)
    
    # Convert date into datetime data type
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Get 2019 data only and fix the index
    df_idx = df[(df['Date'].dt.year >= year)].copy()
    df_idx.index = range(len(df_idx))
    
    return df_idx

### Process 2:

In [5]:
def ma_indicators(dataframe):
    
    # Create exponential moving average
    dataframe['EMA_9'] = dataframe['Close'].ewm(9).mean().shift()
    
    # Create 4 simple moving average using the values below
    for n in [5, 10, 15, 30]:
        dataframe[f'SMA_{n}'] = dataframe['Close'].rolling(n).mean().shift()
        
    return dataframe

#df = ma_indicators(df)

### Process 3:

In [6]:
def add_rsi_indicator(dataframe, n=14):
    def relative_strength_idx():
        close = dataframe['Close']
        delta = close.diff()
        delta = delta[1:]
        pricesUp = delta.copy()
        pricesDown = delta.copy()
        pricesUp[pricesUp < 0] = 0
        pricesDown[pricesDown > 0] = 0
        rollUp = pricesUp.rolling(n).mean()
        rollDown = pricesDown.abs().rolling(n).mean()
        rs = rollUp / rollDown
        rsi = 100.0 - (100.0 / (1.0 + rs))
        return rsi.fillna(0)

    dataframe['RSI'] = relative_strength_idx()
    return dataframe

### Process 4:

In [7]:
def macd_indicator(dataframe):
    
    # Define the two moving average that MACD will use
    EMA_12 = pd.Series(dataframe['Close'].ewm(span=12, min_periods=12).mean())
    EMA_26 = pd.Series(dataframe['Close'].ewm(span=26, min_periods=26).mean())
    
    # Create MACD Indicator
    dataframe['MACD'] = pd.Series(EMA_12 - EMA_26)
    dataframe['MACD_signal'] = pd.Series(dataframe.MACD.ewm(span=9, min_periods=9).mean())
    
    return dataframe

### Process 5:

In [8]:
def label_shift(dataframe):
    dataframe['Close'] = dataframe['Close'].shift(-1)
    return dataframe

### Process 6:

In [9]:
def drop_invalid_samples(dataframe):
    dataframe = dataframe.iloc[33:] # Because of moving averages and MACD line
    dataframe = dataframe[:-1]      # Because of shifting close price
    
    dataframe.index = range(len(dataframe))
    
    return dataframe

In [10]:
pipe_clean = (df.pipe(ma_indicators)
                .pipe(add_rsi_indicator)
                .pipe(macd_indicator)
                .pipe(label_shift)
                .pipe(drop_invalid_samples)
             )
pipe_clean

NameError: name 'df' is not defined

# STOP HERE

----

In [None]:
test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(pipe_clean.shape[0] * (1-test_size))
valid_split_idx = int(pipe_clean.shape[0] * (1-(valid_size+test_size)))

train_df  = pipe_clean.loc[:valid_split_idx].copy()
valid_df  = pipe_clean.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = pipe_clean.loc[test_split_idx+1:].copy()

In [None]:


new_data = label_shift(new_data)

In [None]:
drop_invalid_samples(new_data)

In [None]:
# Shift Labels

df_2019['Close'] = df_2019['Close'].shift(-1)
df_2019['Close'].head()

In [None]:
# Drop Invalid Samples

df_2019 = df_2019.iloc[33:] # Because of moving averages and MACD line
df_2019 = df_2019[:-1]      # Because of shifting close price

df_2019.index = range(len(df_2019))

df_2019

In [None]:
# Split Data into Train, Valid and Test Sets

test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(df_2019.shape[0] * (1-test_size))
valid_split_idx = int(df_2019.shape[0] * (1-(valid_size+test_size)))

train_df  = df_2019.loc[:valid_split_idx].copy()
valid_df  = df_2019.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = df_2019.loc[test_split_idx+1:].copy()

In [None]:
# Drop unecessary columns

drop_cols = ['Date', 'Volume', 'Open', 'Low', 'High', 'Adj Close']

train_df = train_df.drop(drop_cols, 1)
valid_df = valid_df.drop(drop_cols, 1)
test_df  = test_df.drop(drop_cols, 1)

In [None]:
y_train = train_df['Close'].copy()
X_train = train_df.drop(['Close'], 1)

y_valid = valid_df['Close'].copy()
X_valid = valid_df.drop(['Close'], 1)

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(['Close'], 1)

X_train.info()

In [None]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


parameters = {
    'n_estimators': [300, 400, 500, 600, 700],
    'learning_rate': [0.01, 0.05, 0.08, 0.10, 0.15],
    'max_depth': [5, 8, 10, 12, 15, 18],
    'gamma': [0.01, 0.02, 0.05, 0.08],
    'random_state': [42]
}

eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbosity = 0, booster='gblinear')
clf = GridSearchCV(model, parameters)

clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

In [None]:
%%time

model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror', 
                         verbosity = 0, 
                         booster='gblinear')
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

In [None]:
import numpy as np

y_pred = model.predict(X_test)
print(f'y_true = {np.array(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


predicted_prices = df_2019.loc[test_split_idx+1:].copy()
predicted_prices['Close'] = y_pred

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(x=df.Date, y=df.Close,
                         name='Truth',
                         marker_color='LightSkyBlue'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=predicted_prices.Close,
                         name='Prediction',
                         marker_color='MediumPurple'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_test,
                         name='Truth',
                         marker_color='LightSkyBlue',
                         showlegend=False), row=2, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_pred,
                         name='Prediction',
                         marker_color='MediumPurple',
                         showlegend=False), row=2, col=1)

fig.show()

In [None]:
from sklearn.metrics import mean_squared_error

print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')

# SHET AHAHAH IM SO PROUD

In [None]:
y_pred_df = pd.DataFrame(y_pred)

In [None]:
date_preds = pd.DataFrame(predicted_prices.Date)

In [None]:
result = pd.concat([predicted_prices.Date, y_pred_df], axis=1)

In [None]:
y_pred_df.to_csv("AAPL_predictions.csv", index=False)

In [None]:
date_preds.to_csv("AAPL_date_preds.csv", index=False)