<a href="https://colab.research.google.com/github/yunjiangster/trading/blob/main/notebooks/logistic_trading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import datetime
import numpy as np
import pandas as pd
import sklearn
from pandas_datareader.data import DataReader
# from pandas.io.data import DataReader
from sklearn.linear_model import LogisticRegression
# from sklearn.lda import LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
# from sklearn.qda import QDA

In [None]:
# Scratch frame: one integer column named 'a' holding 1, 2, 3.
a = pd.DataFrame(data={'a': [1, 2, 3]})

In [None]:
# Relabel the rows of `a` with the strings '2', '3', '4'.
a.index = list('234')

In [None]:
stock_ticker = '^GSPC'
start_date, end_date = '2001-01-10', '2005-12-31'
# Convert the dates to Unix seconds inline. The date_str_to_ts helper is only
# defined in a later cell, so calling it here raises NameError on a fresh
# kernel. Timestamp.timestamp() reads these timezone-naive dates as UTC.
start, end = (int(pd.Timestamp(day).timestamp()) for day in (start_date, end_date))
url = ('https://query1.finance.yahoo.com/v7/finance/download/' +
       stock_ticker + '?period1=' + str(start) + '&period2=' + str(end) +
       '&interval=1d&events=history')
# Network fetch of the daily history as CSV.
# NOTE(review): confirm this Yahoo endpoint still serves unauthenticated requests.
ts = pd.read_csv(url)

In [None]:
import numpy as np
import pandas as pd
import os, math, sys, re
import requests
# The column names for the ETHUSDT data file are stored in a separate CSV;
# only its first line is used.
header_response = requests.get(
    'https://raw.githubusercontent.com/yunjiangster/trading/main/data/eth/header.csv',
    timeout=30)
# Fail here on a 404/5xx instead of parsing an error page into column names.
header_response.raise_for_status()
# splitlines() also drops a trailing '\r', which split('\n') would leave
# attached to the last column name if the file uses CRLF line endings.
header = header_response.content.decode().splitlines()[0].split(',')

# The data file itself has no header row; attach the names fetched above.
# Assigning .columns raises if the number of names and columns disagree.
df = pd.read_csv('https://raw.githubusercontent.com/yunjiangster/trading/main/data/eth/ETHUSDT-201708xx-20220921.csv', header=None)
df.columns = header

In [None]:
# create_lagged_series reads prices from an 'Adj Close' column; fill it from 'Close'.
df['Adj Close'] = df.loc[:, 'Close']

In [None]:
from datetime import datetime as dt
# 'Open time' is treated as epoch milliseconds. Render it as a 'YYYY-MM-DD'
# string in UTC, vectorized, so the dates do not depend on the machine's
# local timezone.
df['Date'] = pd.to_datetime(df['Open time'], unit='ms').dt.strftime('%Y-%m-%d')

In [None]:
def date_str_to_ts(date_str):
  """Convert a date string, or a list/tuple of them, to Unix time in whole seconds.

  Args:
    date_str: a value `pd.to_datetime` can parse (e.g. '2001-01-10'), or a
      list/tuple of such values. Timezone-naive inputs are read as UTC.

  Returns:
    A Python int for a single input; a list of Python ints for a list/tuple.
  """
  def _epoch_seconds(stamp):
    # Timestamp.timestamp() gives float seconds since the epoch regardless of
    # the platform's default integer width or the datetime resolution pandas
    # picked; floor-divide to discard any sub-second fraction.
    return int(stamp.timestamp() // 1)

  if isinstance(date_str, (list, tuple)):
    return [_epoch_seconds(stamp) for stamp in pd.to_datetime(list(date_str))]
  return _epoch_seconds(pd.to_datetime(date_str))

In [None]:
def create_lagged_series(symbol, start_date, end_date, lags=5, ts=None):
    """Build a DataFrame of daily percentage returns plus lagged returns.

    Args:
        symbol: ticker inserted into the Yahoo Finance download URL. Only
            used when `ts` is None.
        start_date: start of the download window, and the inclusive lower
            bound of the rows returned. Rows are kept where the Date index
            compares >= start_date, so both are assumed to be comparable
            (e.g. 'YYYY-MM-DD' strings).
        end_date: end of the download window. Only used when `ts` is None.
        lags: number of lagged-return columns (Lag1..Lag<lags>) to create.
        ts: optional pre-loaded price frame with 'Date', 'Adj Close' and
            'Volume' columns. It is not modified.

    Returns:
        A DataFrame indexed by Date with columns Volume, Today (percentage
        return of 'Adj Close'), Lag1..Lag<lags> (the same return shifted by
        1..lags rows) and Direction (sign of Today).
    """
    if ts is None:
        # Download the daily history from Yahoo Finance for the given window.
        # Note the window starts at start_date itself, so the first rows of
        # the result have NaN lags.
        start = date_str_to_ts(start_date)
        end = date_str_to_ts(end_date)
        url = ('https://query1.finance.yahoo.com/v7/finance/download/' +
               symbol + '?period1=' + str(start) + '&period2=' + str(end) +
               '&interval=1d&events=history')
        ts = pd.read_csv(url)

    # Index by Date on a new frame (the Date column is kept) so the caller's
    # DataFrame is left untouched.
    ts = ts.set_index('Date', drop=False)
    adj_close = ts["Adj Close"]

    tsret = pd.DataFrame(index=ts.index)
    tsret["Volume"] = ts["Volume"]

    # Percentage return for the current row. Returns with magnitude below
    # 0.0001 are set to +0.0001 (stops issues with the QDA model in
    # scikit-learn); NaN entries are left as NaN.
    today = adj_close.pct_change() * 100.0
    tsret["Today"] = today.mask(today.abs() < 0.0001, 0.0001)

    # Lagged percentage returns: the return of the price series shifted by
    # 1..lags rows. These are not clamped the way "Today" is.
    for lag in range(1, lags + 1):
        tsret["Lag%d" % lag] = adj_close.shift(lag).pct_change() * 100.0

    # Direction is +1 or -1 for an up/down day (NaN where Today is NaN).
    tsret["Direction"] = np.sign(tsret["Today"])

    return tsret[tsret.index >= start_date]

In [None]:
def fit_model(name, model, X_train, y_train, X_test, pred):
    """Train `model`, predict on the test set and print the resulting hit rate.

    The predictions are written into `pred` under the column `name`, and a
    per-row correctness indicator under "<name>_Correct". `pred` must already
    carry an "Actual" column. Nothing is returned; the hit rate is printed.
    """
    correct_col = "%s_Correct" % name

    # Train on the training split, then store the test-set predictions.
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    pred[name] = predicted

    # For +1/-1 labels the product prediction*actual is +1 on a hit and -1 on
    # a miss, so (1 + product) / 2 maps hits to 1 and misses to 0.
    agreement = pred[name] * pred["Actual"]
    pred[correct_col] = (1.0 + agreement) / 2.0

    # The mean of the 0/1 indicator is the hit rate.
    hit_rate = np.mean(pred[correct_col])
    print("%s: %.3f" % (name, hit_rate))

In [None]:
if __name__ == "__main__":
    # Build the lagged-return table from the ETH frame `df` prepared in the
    # earlier cells. The S&P 500 download variant is kept for reference:
    # snpret = create_lagged_series("^GSPC", '2001-01-10', '2005-12-31', lags=5)
    lags = 5
    snpret = create_lagged_series("ETH", '2017-08-17', '2022-09-21', lags=lags, ts=df)

    # Predictors are all `lags` lagged returns; the response is the direction.
    X = snpret[["Lag%d" % k for k in range(1, lags + 1)]]
    y = snpret["Direction"]

    # Train strictly before 2022-01-01 and test from that date onwards.
    start_test = '2022-01-01'
    # The first lags + 1 rows hold NaN lagged returns. Training starts
    # strictly after this date to skip them (assumes one row per calendar day
    # starting 2017-08-17).
    start_train = '2017-08-%d' % (17 + lags)

    # X and y share snpret's index, so one pair of masks selects matching rows
    # from both and the two can never drift apart.
    train_mask = (snpret.index < start_test) & (snpret.index > start_train)
    test_mask = snpret.index >= start_test

    # Create training and test sets
    X_train = X[train_mask]
    X_test = X[test_mask]
    y_train = y[train_mask]
    y_test = y[test_mask]

    # Prediction DataFrame: starts with the actual directions; fit_model adds
    # each model's predictions and correctness columns.
    pred = pd.DataFrame(index=y_test.index)
    pred["Actual"] = y_test

    # Create and fit the three models
    print("Hit Rates:")
    models = [("LR", LogisticRegression()), ("LDA", LDA()), ("QDA", QDA())]
    for name, model in models:
        fit_model(name, model, X_train, y_train, X_test, pred)