<a href="https://colab.research.google.com/github/yunjiangster/trading/blob/main/notebooks/basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os, math, sys, re
import requests

In [None]:
# Show the notebook's current working directory (displayed as the cell output).
os.getcwd()

In [None]:
# Both files live in the same directory; the data file is read with header=None,
# so its column names come from the separate header.csv.
DATA_URL_PREFIX = 'https://raw.githubusercontent.com/yunjiangster/trading/main/data/eth/'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed as a (wrong-length) header below.
header_response = requests.get(DATA_URL_PREFIX + 'header.csv', timeout=30)
header_response.raise_for_status()
# Only the first line holds the column names; rstrip('\r') guards against a
# CRLF line ending leaking into the last column name.
header = header_response.content.decode().split('\n')[0].rstrip('\r').split(',')

df = pd.read_csv(DATA_URL_PREFIX + 'ETHUSDT-201708xx-20220921.csv', header=None)
# Assigning the names (rather than passing names=) fails loudly if the number
# of columns does not match the header.
df.columns = header

In [None]:
# Derive a timestamp column from the epoch-millisecond 'Open time' values.
df['Time'] = pd.to_datetime(df['Open time'], unit='ms')

# Draw the open price and the volume against time.
df.plot(y=['Open', 'Volume'], x='Time')

# Last expression of the cell: display the new column.
df['Time']

In [None]:
from typing import Callable, Dict

# TODO: expand later to output predictions.
MetricsType = Dict[str, float]
def cross_validate(
    df,
    train_algo: Callable[[pd.DataFrame], Callable[[pd.DataFrame], MetricsType]],
    fold=2, forward_only=False):
  """Average a model's test metrics over contiguous folds of df.

  The rows of df are cut, in order, into `fold` contiguous pieces of
  len(df) // fold rows each; any remainder rows at the end are not used.
  Each evaluated fold serves once as the test set.

  Args:
    df: the full dataset, one sample per row.
    train_algo: called with a training frame; must return a function that
      takes a test frame and returns a dict of metric name -> value.
    fold: number of folds; must be at least 2.
    forward_only: if True, each fold is tested against a model trained only on
      the folds that precede it (fold 0 is never tested because nothing
      precedes it). If False, the model is trained on all other folds.

  Returns:
    A dict with the same keys as the first fold's metrics, each value being
    the mean of that metric over the evaluated folds.

  Raises:
    ValueError: if fold is smaller than 2 (there would be nothing to train on).
  """
  if fold < 2:
    raise ValueError('fold must be at least 2, got %r' % (fold,))

  n = len(df.index)
  m = n // fold
  dfs = [df.iloc[i * m: (i + 1) * m] for i in range(fold)]

  test_metrics = []
  first_test_fold = 1 if forward_only else 0
  for j in range(first_test_fold, fold):
    if forward_only:
      train_parts = dfs[:j]
    else:
      train_parts = dfs[:j] + dfs[j + 1:]
    train_df = pd.concat(train_parts, axis=0, ignore_index=True)
    # drop=True so the test frame has the same columns as the training frame;
    # a plain reset_index() would add the old index as an extra 'index' column.
    test_df = dfs[j].reset_index(drop=True)
    model = train_algo(train_df)
    test_metrics.append(model(test_df))

  # fold >= 2 guarantees at least one evaluated fold, so test_metrics[0] exists.
  return {
      k: sum(tm[k] for tm in test_metrics) / len(test_metrics)
      for k in test_metrics[0]
  }

In [None]:
from functools import partial
def flat_forward(train_df):
  """Training-free baseline: guess a row closes up iff the previous row closed down.

  train_df is accepted only to match the train_algo signature; it is not used.

  Returns:
    A function mapping a test frame (with Open and Close columns) to
    {'accuracy': fraction of rows where the guess equals the outcome}.
    The first row has no predecessor, so its guess is always "not up".
  """
  # Alternative (disabled) variant that compares consecutive opens instead:
  #   label = df.Open > df.Open.shift(1)
  #   feats = df.Open.shift(2) > df.Open.shift(1)
  def algo(test_df):
    closes, opens = test_df['Close'], test_df['Open']
    prev_closed_down = closes.shift(1) < opens.shift(1)
    closed_up = closes > opens
    hits = prev_closed_down == closed_up
    return {'accuracy': sum(hits) / len(test_df.index)}
  return algo

# Score the baseline with two folds, testing only on the later fold.
cross_validate(df, flat_forward, fold=2, forward_only=True)

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm

def gbdt(train_df, valid_frac=0.2):
  """Train a LightGBM binary classifier on train_df and return its evaluator.

  Label (per row): 1.0 if Close is greater than the previous row's Close,
  0.0 otherwise, NaN where there is no previous value to compare against.
  Features (per row): the previous row's Close/Open/High/Low/Volume, and the
  change of each of those between the two preceding rows.

  Args:
    train_df: frame with Close, Open, High, Low and Volume columns.
    valid_frac: fraction of train_df, taken from the end in row order, that is
      held out as the validation set used for early stopping.

  Returns:
    A function mapping a test frame to {'accuracy': ...}, where a prediction
    above 0.5 counts as "up". Rows whose label is NaN never compare equal to
    a prediction, so they count as incorrect.
  """
  n = len(train_df.index)
  b = int(n * (1 - valid_frac))
  def featurize(df):
    # Vectorized form of rolling(2).apply(lambda x: x.iloc[1] > x.iloc[0]):
    # same 1.0/0.0/NaN values without calling a Python lambda per row.
    close_delta = df.Close.diff()
    label = (close_delta > 0).astype(float).where(close_delta.notna())
    # label = ((df.Close - df.Open) > 0).astype(float)
    features = {}

    for k in ['Close', 'Open', 'High', 'Low', 'Volume']:
      prev = df[k].shift(1)
      features['Prev_%s' % k] = prev
      features['Prev_Delta_%s' % k] = prev - df[k].shift(2)
    feats = pd.DataFrame(features)
    return feats, label
  feats, label = featurize(train_df)
  # Chronological split: the first b rows train, the remaining rows validate.
  train_label, valid_label = label.iloc[:b], label.iloc[b:]
  train_feats, valid_feats = feats.iloc[:b], feats.iloc[b:]
  train_data = lightgbm.Dataset(train_feats, label=train_label)
  valid_data = lightgbm.Dataset(valid_feats, label=valid_label)

  # NOTE(review): 'application' is documented as an alias of 'objective', and
  # max_depth=2 allows at most 4 leaves per tree, so num_leaves=20 cannot be
  # reached -- confirm whether these settings are intentional.
  parameters = {
      'application': 'binary',
      'objective': 'binary',
      'metric': 'auc',
      'is_unbalance': 'true',
      'boosting': 'gbdt',
      'num_leaves': 20,
      'max_depth': 2,
      'feature_fraction': 0.5,
      'bagging_fraction': 0.5,
      'bagging_freq': 20,
      'learning_rate': 0.5,
      'verbose': 0
  }
  # Early stopping goes through the callback API: the early_stopping_rounds
  # keyword of lightgbm.train() is not accepted by LightGBM 4.x.
  model = lightgbm.train(parameters,
                         train_data,
                         valid_sets=[valid_data],
                         num_boost_round=100,
                         callbacks=[lightgbm.early_stopping(stopping_rounds=2)])

  def algo(test_df):
    test_feats, test_label = featurize(test_df)
    test_preds = model.predict(test_feats)
    correct = (test_preds > 0.5) == test_label
    return {'accuracy': sum(correct) / len(test_df.index)}

  return algo

# Score the gradient-boosted model with two folds, testing only on the later fold.
cross_validate(df, gbdt, fold=2, forward_only=True)

In [None]:
import sympy

# Prime factorization of the number of rows in df.
sympy.factorint(len(df))

In [None]:
# Plot each row's open minus the following row's open.
df['Open'].sub(df['Open'].shift(-1)).plot()

In [None]:
from datetime import datetime as dt
# Let long Series/DataFrames print up to 1000 rows before being truncated.
pd.set_option('display.max_rows', 1000)

# Format the epoch-millisecond open times as text. Converting through pandas
# interprets them as UTC, consistent with the "Time" column, whereas
# datetime.fromtimestamp() without a tz would use this machine's local
# timezone. It also avoids a Python-level call per row.
pd.to_datetime(df['Open time'], unit='ms').dt.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Plot the per-row difference between the open and the close.
df['Open'].sub(df['Close']).plot()