In [19]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import os
import neat
from pathlib import Path
# Prefer a local checkout of the vn-stock-data repository, looked up two
# directory levels above the current working directory; when that file is not
# present, read the same CSV from GitHub instead.
notebook_path = os.getcwd()
algo_dir = Path(notebook_path).parent.parent
csv_file = f"{algo_dir}/vn-stock-data/VN30ps/VN30F1M_5minutes.csv"
is_file = os.path.isfile(csv_file)
raw_data = pd.read_csv(
    csv_file if is_file else "https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv",
    index_col='Date',
    parse_dates=True,
)

In [20]:
def cal_first_close(tick):
  """Return the first value of ``tick``, or ``None`` when ``tick`` is empty.

  Used as a per-day aggregator: ``tick`` is the Series of values that fall
  into one resampled bucket.

  Args:
    tick: pandas Series (any index).

  Returns:
    The first element by position, or ``None`` for an empty Series.
  """
  if tick.empty:
    return None
  # Positional access must go through .iloc: plain ``tick[0]`` on a Series with
  # a non-integer (e.g. datetime) index relies on a deprecated positional
  # fallback of Series.__getitem__.
  return tick.iloc[0]


def cal_high_before(tick):
  """Return the maximum of ``tick`` strictly between 09:10 and 13:55.

  The time of day is encoded as the integer ``100 * hour + minute`` taken
  from the Series' datetime index; both bounds are exclusive.
  """
  hhmm = 100 * tick.index.hour + tick.index.minute
  in_window = (hhmm > 910) & (hhmm < 1355)
  return tick[in_window].max()

def cal_high_after(tick):
  """Return the maximum of ``tick`` strictly between 13:55 and 14:30.

  The time of day is encoded as the integer ``100 * hour + minute`` taken
  from the Series' datetime index; both bounds are exclusive.
  """
  hhmm = 100 * tick.index.hour + tick.index.minute
  in_window = (hhmm > 1355) & (hhmm < 1430)
  return tick[in_window].max()


def cal_low_before(tick):
  """Return the minimum of ``tick`` strictly between 09:10 and 13:55.

  The time of day is encoded as the integer ``100 * hour + minute`` taken
  from the Series' datetime index; both bounds are exclusive.
  """
  hhmm = 100 * tick.index.hour + tick.index.minute
  in_window = (hhmm > 910) & (hhmm < 1355)
  return tick[in_window].min()


def cal_low_after(tick):
  """Return the minimum of ``tick`` strictly between 13:55 and 14:30.

  The time of day is encoded as the integer ``100 * hour + minute`` taken
  from the Series' datetime index; both bounds are exclusive.
  """
  hhmm = 100 * tick.index.hour + tick.index.minute
  in_window = (hhmm > 1355) & (hhmm < 1430)
  return tick[in_window].min()


def cal_price(tick):
  """Return the value of the bar stamped 13:55, or ``None`` if there is none.

  The time of day is matched as ``100 * hour + minute == 1355`` on the
  Series' datetime index.

  Args:
    tick: pandas Series with a datetime index.

  Returns:
    The first value whose timestamp is 13:55, or ``None`` when no such row
    exists.
  """
  at_1355 = tick[100 * tick.index.hour + tick.index.minute == 1355]
  if at_1355.empty:
    return None
  # .iloc for positional access: ``series[0]`` on a datetime-indexed Series
  # relies on a deprecated positional fallback of Series.__getitem__.
  return at_1355.iloc[0]

def cal_close(tick):
  """Return the value of the bar stamped 14:25, or ``None`` if there is none.

  The time of day is matched as ``100 * hour + minute == 1425`` on the
  Series' datetime index.

  Args:
    tick: pandas Series with a datetime index.

  Returns:
    The first value whose timestamp is 14:25, or ``None`` when no such row
    exists.
  """
  at_1425 = tick[100 * tick.index.hour + tick.index.minute == 1425]
  if at_1425.empty:
    return None
  # .iloc for positional access: ``series[0]`` on a datetime-indexed Series
  # relies on a deprecated positional fallback of Series.__getitem__.
  return at_1425.iloc[0]

In [21]:
# Duplicate Close/High/Low under new column names so that every copy can be
# given its own daily aggregator below (a dict-style agg takes one function
# per column).
data = raw_data.assign(
    first_close=raw_data.Close,
    price=raw_data.Close,
    prev_high=raw_data.High,
    prev_low=raw_data.Low,
    next_high=raw_data.High,
    next_low=raw_data.Low,
)

# One row per calendar day; days for which any aggregate is missing are dropped.
price = (
    data.resample("D")
    .agg({
        'first_close': cal_first_close,
        'prev_high': cal_high_before,
        'prev_low': cal_low_before,
        'next_high': cal_high_after,
        'next_low': cal_low_after,
        'price': cal_price,
        'Close': cal_close,
    })
    .dropna()
)

In [22]:
# Derived daily columns (each row of `price` is one day):
#   percent : percentage change from the previous row's 14:25 value (`Close`)
#             to this row's 13:55 value (`price`)
#   returns : move from the day's first value to the 13:55 value, divided by
#             the high-low range observed between 09:10 and 13:55 (exclusive)
#   return  : per-mille change from the 13:55 value to the 14:25 value
prev_close = price.Close.shift(1)
price['percent'] = 100 * (price.price - prev_close) / prev_close
price['returns'] = (price.price - price.first_close) / (price.prev_high - price.prev_low)
price['return'] = 1000 * (price.Close - price.price) / price.price

# Keep only rows with -30 < return < 30. The explicit .copy() makes the result
# an independent frame, so adding columns to it later is not a chained
# assignment on a slice of the unfiltered frame.
price = price[(price['return'] > -30) & (price['return'] < 30)].copy()

In [27]:
def group_data(r):
    """Label one daily row: 0 = do nothing, 1 = long, -1 = short.

    A row is labelled 0 when the value moved more than 3 above AND more than 3
    below ``r['price']`` (using ``next_high`` / ``next_low``). Otherwise the
    label is 1 when ``r['return']`` is positive and -1 in every other case.

    Args:
        r: Mapping-like row providing 'next_high', 'next_low', 'price' and
            'return'.

    Returns:
        int: 0, 1 or -1.
    """
    upside = r['next_high'] - r['price']
    downside = r['price'] - r['next_low']
    if upside > 3 and downside > 3:
        # Do nothing group
        return 0
    # Long group when the return is positive, short group otherwise
    return 1 if r['return'] > 0 else -1
# Label every row, then discard rows that still contain NaN (for example the
# first row, whose `percent` has no previous row to compare against).
price['group'] = price.apply(group_data, axis=1)
price = price.dropna()
price

Unnamed: 0_level_0,first_close,prev_high,prev_low,next_high,next_low,price,Close,percent,returns,return,group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-08-14,954.9,958.6,952.1,961.8,959.0,959.5,960.1,0.565978,0.707692,0.625326,1
2018-08-15,958.7,962.3,954.7,959.3,949.9,959.2,951.2,-0.093740,0.065789,-8.340284,-1
2018-08-16,942.0,943.5,935.5,947.3,942.6,943.0,944.5,-0.862069,0.125000,1.590668,1
2018-08-17,953.0,954.6,946.4,949.9,945.0,946.5,947.0,0.211752,-0.792683,0.528262,1
2018-08-20,945.8,951.5,944.1,946.5,942.6,946.5,944.7,-0.052798,0.094595,-1.901743,-1
...,...,...,...,...,...,...,...,...,...,...,...
2024-07-04,1306.7,1313.3,1306.1,1309.5,1303.5,1308.3,1308.0,0.214477,0.222222,-0.229305,-1
2024-07-05,1309.7,1314.5,1307.2,1316.5,1307.6,1308.8,1316.2,0.061162,-0.123288,5.654034,1
2024-07-08,1316.1,1316.8,1305.6,1313.0,1307.0,1310.7,1312.0,-0.417870,-0.482143,0.991836,1
2024-07-09,1315.1,1320.3,1304.5,1321.9,1314.4,1318.2,1319.5,0.472561,0.196203,0.986193,0


In [33]:
from sklearn.model_selection import train_test_split

# Network inputs are (percent, returns) tuples; expected outputs are 1-tuples
# holding the group label.
_inputs = list(price[['percent', 'returns']].itertuples(index=False, name=None))
_outputs = list(price[['group']].itertuples(index=False, name=None))

# 80% training and 20% test. random_state is fixed so that the same rows land
# in each split on every run; without it the split changes on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    _inputs, _outputs, test_size=0.2, random_state=42
)

In [53]:
y_pred = []


def eval_genomes(genomes, config):
    """Assign a fitness to every genome based on its error on the training set.

    Each genome starts from 4000 and loses the squared difference between the
    network's first output and the expected value for every training sample
    in the module-level ``X_train`` / ``y_train``.

    Args:
        genomes: Iterable of ``(genome_id, genome)`` pairs.
        config: NEAT configuration used to build each network.
    """
    for _, genome in genomes:
        score = 4000
        network = neat.nn.FeedForwardNetwork.create(genome, config)
        for features, target in zip(X_train, y_train):
            prediction = network.activate(features)
            score -= (prediction[0] - target[0]) ** 2
        genome.fitness = score


def run(config_file, generations=300, verbose=False):
    """Evolve a NEAT population and print the winner's outputs on the test split.

    Args:
        config_file: Path to the NEAT configuration file.
        generations: Maximum number of generations to run (default 300).
        verbose: When True, attach NEAT's stdout and statistics reporters so
            progress is printed while evolving. Off by default.

    Side effects:
        Prints the best genome and, for every sample in the module-level
        ``X_test`` / ``y_test``, the input, the expected output and the
        network output.
    """
    # Load configuration.
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_file)

    # Create the population, which is the top-level object for a NEAT run.
    population = neat.Population(config)

    if verbose:
        # Show progress in the terminal / cell output.
        population.add_reporter(neat.StdOutReporter(True))
        population.add_reporter(neat.StatisticsReporter())

    # Run for up to `generations` generations.
    winner = population.run(eval_genomes, generations)

    # Display the winning genome.
    print('\nBest genome:\n{!s}'.format(winner))

    # Show output of the most fit genome against the held-out TEST data
    # (X_test / y_test), not the training data.
    print('\nOutput:')
    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)
    for xi, xo in zip(X_test, y_test):
        output = winner_net.activate(xi)
        print("input {!r}, expected output {!r}, got {!r}".format(xi, xo, output))

In [54]:
# Locate the NEAT configuration file ('neat-config') in the current working
# directory. NOTE: this depends on the kernel's working directory, so the
# notebook has to be started from the folder that contains 'neat-config'.
notebook_path = os.getcwd()
local_dir = Path(notebook_path)
config_path = os.path.join(local_dir, 'neat-config')
is_file = os.path.isfile(config_path)
if is_file:
    run(config_path)
else:
    # Report the missing file instead of silently skipping the training run.
    print("NEAT config file not found: {}".format(config_path))


Best genome:
Key: 2
Fitness: 2269.474590499535
Nodes:
	0 DefaultNodeGene(key=0, bias=-0.5445511157984552, response=1.0, activation=tanh, aggregation=sum)
Connections:
	DefaultConnectionGene(key=(-2, 0), weight=-0.15521385631507464, enabled=True)
	DefaultConnectionGene(key=(-1, 0), weight=0.892434783052413, enabled=True)

Output:
input (-0.19807041083636248, -0.17877094972067203), expected output (-1,), got [-0.9395175653579383]
input (0.273134491423577, 0.33070866141731625), expected output (1,), got [-0.7065782955024852]
input (-1.7401574803149535, -0.4612903225806437), expected output (0,), got [-0.9999202441404506]
input (-0.0639211030955911, 0.17391304347826517), expected output (1,), got [-0.9172601607887473]
input (0.17251877410189217, -0.15789473684208322), expected output (1,), got [-0.7236210173898461]
input (-0.28810659944179756, -0.850467289719614), expected output (0,), got [-0.9320992820349095]
input (-0.35996284254528826, 0.11764705882352351), expected output (1,), got [

In [43]:
# from sklearn import metrics
# # Model Accuracy, how often is the classifier correct?
# print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# print("R2_score:", metrics.r2_score(y_test, y_pred))