In [1]:
import pandas as pd
import yfinance as yf

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

---
# Full Process

In [2]:
# Process 1

def yfinance_data(ticker_symbol, start_date):
    
    # Import the necessary libraries
    import yfinance as yf
    import pandas as pd
    
    # Load data from Yahoo Finance
    ticker = yf.Ticker(ticker_symbol)
    ticker_df = ticker.history(start=start_date)
    
    # Remove the time from the index
    ticker_df.index = pd.to_datetime(ticker_df.index)
    ticker_df.index = ticker_df.index.date
    
    return ticker_df

df = yfinance_data('AAPL', '2019-1-1')
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits
2019-01-02,37.374017,38.329542,37.214762,38.105137,148158800,0.0,0.0
2019-01-03,34.741504,35.161357,34.263743,34.309589,365248800,0.0,0.0
2019-01-04,34.874217,35.84422,34.698073,35.774242,234428400,0.0,0.0
2019-01-07,35.880404,35.911774,35.20478,35.694607,219111200,0.0,0.0
2019-01-08,36.087921,36.633247,35.836977,36.375061,164101200,0.0,0.0


In [3]:
# Process 5

# Create RSI Indicator
def relative_strength_idx(df, n=14):
    close = df['Close']
    delta = close.diff()
    delta = delta[1:]
    pricesUp = delta.copy()
    pricesDown = delta.copy()
    pricesUp[pricesUp < 0] = 0
    pricesDown[pricesDown > 0] = 0
    rollUp = pricesUp.rolling(n).mean()
    rollDown = pricesDown.abs().rolling(n).mean()
    rs = rollUp / rollDown
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi

df['RSI'] =  relative_strength_idx(df).fillna(0)

---

In [4]:
# Process 1

def simple_ma(dataframe, sma_vals):
    for i in sma_vals:
        dataframe[f'SMA_{i}'] = dataframe['Close'].rolling(i).mean().shift()
    return dataframe

In [5]:
# Process 2

def exponential_ma(dataframe, ema_vals):
    for i in ema_vals:
        dataframe[f'EMA_{i}'] = dataframe['Close'].ewm(i).mean().shift()
    return dataframe


In [6]:
# Process 3

# Create MACD Indicator
def macd(dataframe, ema_low, ema_high, macd_value):
    EMA_low = pd.Series(dataframe['Close'].ewm(span=ema_low, min_periods=ema_low).mean())
    EMA_high = pd.Series(dataframe['Close'].ewm(span=ema_high, min_periods=ema_high).mean())
    dataframe['MACD'] = pd.Series(EMA_low - EMA_high)
    dataframe['MACD_signal'] = pd.Series(dataframe.MACD.ewm(span=macd_value, min_periods=macd_value).mean())
    return dataframe

In [7]:
# Process 4

# Shift Labels
def move_up_data(dataframe, move_value):
    dataframe['Close'] = dataframe['Close'].shift(move_value)
    return dataframe

In [8]:
# Process 5

# Remove Rows
def remove_rows(dataframe, first_n_row):
    dataframe = dataframe.iloc[first_n_row:]
    dataframe = dataframe[:-1] # Remove last row
    
    return dataframe

In [9]:
# Process 6

# Delete Columns
def delete_columns(dataframe, cols_list):
    new_dataframe = dataframe.drop(columns = cols_list, axis=1)
    return new_dataframe

In [10]:
del_cols = ['Dividends', 'Stock Splits', 'Volume', 'Open', 'Low', 'High']

# Pandas Pipe
piped_df = (df.pipe(simple_ma, [5,10,15,30])
              .pipe(exponential_ma, [5,9,18])
              .pipe(macd, 12, 26, 9)
              .pipe(move_up_data, -1)
              .pipe(remove_rows, 33).pipe(delete_columns, del_cols))

piped_df.head()

Unnamed: 0,Close,RSI,SMA_5,SMA_10,SMA_15,SMA_30,EMA_5,EMA_9,EMA_18,MACD,MACD_signal
2019-02-20,41.452774,70.567749,41.351955,41.443122,40.88608,38.964359,40.996018,40.416987,39.699453,0.851216,0.906074
2019-02-21,41.915615,64.822103,41.407206,41.409046,41.177045,39.164133,41.111553,40.547706,39.823902,0.814685,0.885598
2019-02-22,42.22094,68.071882,41.449858,41.350018,41.282308,39.33339,41.16852,40.640536,39.924846,0.811535,0.869393
2019-02-25,42.245178,61.340416,41.555028,41.416901,41.399285,39.497485,41.293212,40.770983,40.047076,0.822796,0.859386
2019-02-26,42.37603,53.299345,41.739681,41.50947,41.53533,39.667816,41.448015,40.918979,40.179388,0.823989,0.851895


```Python
df = simple_ma(df, [5,10,15,30])
df = exponential_ma(df, [5,9,18])
df = macd(df, 12, 26, 9)
df = move_up_data(df, -1)
df = remove_rows(df, 33)
df = delete_columns(df, ['Dividends', 'Stock Splits', 'Volume', 
                         'Open', 'Low', 'High'])
```

---

## New Set of Functions

In [29]:
# Split data for XGBoost
def xgb_datasplit(dataframe, valid_size, test_size):

    test_split_idx  = int(dataframe.shape[0] * (1-test_size))
    valid_split_idx = int(dataframe.shape[0] * (1-(valid_size+test_size)))

    train_df  = df.iloc[:valid_split_idx].copy()
    valid_df  = df.iloc[valid_split_idx+1:test_split_idx].copy()
    test_df   = df.iloc[test_split_idx+1:].copy()
        
    return train_df, valid_df, test_df

df_train, df_valid, df_test = xgb_datasplit(piped_df, 0.15, 0.15)

In [32]:
for i in [df_train, df_valid, df_test]:
    print(i.isnull().sum())

Open             0
High             0
Low              0
Close            0
Volume           0
Dividends        0
Stock Splits     0
RSI              1
SMA_5            5
SMA_10          10
SMA_15          15
SMA_30          30
EMA_5            1
EMA_9            1
EMA_18           1
MACD            25
MACD_signal     33
dtype: int64
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
RSI             0
SMA_5           0
SMA_10          0
SMA_15          0
SMA_30          0
EMA_5           0
EMA_9           0
EMA_18          0
MACD            0
MACD_signal     0
dtype: int64
Open            0
High            0
Low             0
Close           1
Volume          0
Dividends       0
Stock Splits    0
RSI             0
SMA_5           0
SMA_10          0
SMA_15          0
SMA_30          0
EMA_5           0
EMA_9           0
EMA_18          0
MACD            0
MACD_signal     0
dtype: int64


In [12]:
def xy_split(dataframe, y_column):
    features = dataframe.loc[:, dataframe.columns != y_column]
    target = dataframe[y_column]
    
    # X_train, y_train
    return features, target

X_train, y_train = xy_split(df_train, 'Close')
X_valid, y_valid = xy_split(df_valid, 'Close')
X_test, y_test = xy_split(df_test, 'Close')

----

## NEW MODEL

In [13]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


parameters = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.001, 0.005, 0.01, 0.05],
    'max_depth': [8, 10, 12, 15],
    'gamma': [0.001, 0.005, 0.01, 0.02],
    'random_state': [42]
}

eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbosity = 0, booster='gblinear')
clf = GridSearchCV(model, parameters)

clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

Best params: {'gamma': 0.01, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 400, 'random_state': 42}
Best validation score = 0.7054185153167308
Wall time: 1min 14s


In [None]:
best_params = {'gamma': 0.001, 
               'learning_rate': 0.05, 
               'max_depth': 12, 
               'n_estimators': 100, 
               'random_state': 42}

In [14]:
%%time

model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror', 
                         verbosity = 0, 
                         booster='gblinear')
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

Wall time: 168 ms


In [15]:
import numpy as np

y_pred = model.predict(X_test)
print(f'y_true = {np.array(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')

y_true = [164.32659912 166.64608765 166.47685242 165.67050171 161.06135559]
y_pred = [167.77864 165.53763 165.49802 166.18657 166.14642]


In [19]:
piped_df.isnull().sum()

Close          0
RSI            0
SMA_5          0
SMA_10         0
SMA_15         0
SMA_30         0
EMA_5          0
EMA_9          0
EMA_18         0
MACD           0
MACD_signal    0
dtype: int64

In [21]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


predicted_prices = piped_df.loc[test_split_idx+1:].copy()
predicted_prices['Close'] = y_pred

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(x=df.Date, y=df.Close,
                         name='Truth',
                         marker_color='LightSkyBlue'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=predicted_prices.Close,
                         name='Prediction',
                         marker_color='MediumPurple'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_test,
                         name='Truth',
                         marker_color='LightSkyBlue',
                         showlegend=False), row=2, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_pred,
                         name='Prediction',
                         marker_color='MediumPurple',
                         showlegend=False), row=2, col=1)

fig.show()

TypeError: cannot do slice indexing on Index with these indexers [828] of type int

In [25]:
def null_cols(dataframe):
    null_columns = dataframe.columns[dataframe.isna().any()]
    return null_columns.tolist()

In [18]:
from sklearn.metrics import mean_squared_error

print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')

ValueError: Input contains NaN.

In [None]:
SHET AHAHAH IM SO PROUD