<a href="https://colab.research.google.com/github/tonchan1216/bitbot/blob/master/notebook/q_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import random
from google.colab import drive
import copy
import math

from datetime import datetime
import pickle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.optimizers import RMSprop
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.utils import Progbar

In [14]:
# Run configuration: `mode` selects which price CSV is read (btc_<mode>.csv) and
# is later passed to Environment/Main; `level` 2 switches on a commission (see the
# final cell) and tags the experiment name so its outputs do not overwrite level 1.
mode = 'train'
name = 'qlearning'
level = 1
if level == 2:
    # Append the suffix once. The previous `name += name + 'lv2'` duplicated the
    # base name and produced 'qlearningqlearninglv2'.
    name += 'lv2'

# Google Drive holds the input price data as well as the saved models / result CSVs.
drive.mount('/content/drive/')
nov_dir = 'Colab Notebooks/training_data/'
nov_path = '/content/drive/My Drive/' + nov_dir + f'btc_{mode}.csv'

exp_dir = 'Colab Notebooks/'
mdl_dir = '/content/drive/My Drive/' + exp_dir + 'models'
# NOTE: `csv_path` is read as a module-level global by Main._save_csv.
csv_path = '/content/drive/My Drive/' + exp_dir + f'csv_data/{name}_{mode}.csv'

# Load the price series and parse the 'Date' column (YYYY-MM-DD) into datetimes.
df = pd.read_csv(nov_path)
df['Date'] = pd.to_datetime(df['Date'], format = '%Y-%m-%d')
df.head()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Unnamed: 0,Date,BTC
0,2015-06-26,32000
1,2015-06-27,31700
2,2015-06-28,31000
3,2015-06-29,30870
4,2015-06-30,31550


In [15]:
class Environment:
    """Trading environment that replays the 'BTC' price column of a DataFrame.

    Each call to `step` advances one row, applies the chosen action
    (0 = buy, 1 = hold, 2 = sell) and returns the new state, the reward, the
    terminal flag and an info dict. The state is a length-3 float array:
    [held position, current price, cash in hand].

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'BTC' column; rows with missing values are dropped.
    initial_money : number
        Cash available at the start of every episode.
    mode : str
        Stored on the instance; not used by any method of this class.
    commission : number
        Per-trade commission, expressed in units of the asset (it is multiplied
        by the current price whenever cash is adjusted).
    min_trade_btc : number
        Minimum position size that must be affordable for a buy to execute.
    """

    def __init__(self, df, initial_money=100000, mode = 'test', commission = 0, min_trade_btc = 0.005):
        # Renumber rows after dropping NaNs so `now_step` can be used with .loc.
        self.df = df.dropna().reset_index()

        self.df_total_steps  = len(self.df)-1
        self.initial_money   = initial_money
        self.mode            = mode
        self.commission      = commission
        self.min_trade_btc   = min_trade_btc
        self.trade_time      = None
        self.trade_win       = None
        self.before_buy_cash = None
        self.action_space    = np.array([0, 1, 2]) # buy, hold, sell
        self.hold_a_position = None
        self.now_price       = None
        self.cash_in_hand    = None
        self.sell_price      = None
        self.buy_price       = None

        self.reset()

    def reset(self):
        """Rewind to the first row, restore the initial cash and return the state."""
        self.trade_time      = 0
        self.trade_win       = 0
        self.before_buy_cash = 0
        self.end_step        = self.df_total_steps
        self.now_step        = 0
        self.hold_a_position = 0.0
        self.now_price       = self._get_now_price()
        self.cash_in_hand    = self.initial_money
        self.sell_price      = 0
        self.buy_price       = 0

        return self._get_now_state()

    def step(self, action):
        """Advance one row, execute `action` and return (state, reward, done, info).

        The reward is the profit/loss ratio of a round trip,
        (sell value - buy value) / buy value, emitted on the step where the
        position is sold through the sell action; it is 0 on every other step.
        `info` carries 'cur_revenue' (position value plus cash), 'trade_time'
        and 'trade_win'.
        """
        self.now_step += 1                      # move to the next date
        self.now_price = self._get_now_price()  # refresh the rate

        done = (self.end_step == self.now_step)  # episode ends on the last row

        self.sell_price = 0
        self._trade(action, done)
        reward = 0

        if self.sell_price > 0 and self.buy_price > 0:
            reward = (self.sell_price - self.buy_price) / self.buy_price
            # Always clear the cost basis once the position is closed. It used to
            # be kept after a break-even sale and then leaked into the next trade.
            self.buy_price = 0
        cur_revenue = self._get_revenue()

        info = { 'cur_revenue' : cur_revenue , 'trade_time' : self.trade_time, 'trade_win' : self.trade_win }

        return self._get_now_state(), reward, done, info

    def _get_now_price(self):
        """Return the 'BTC' value of the current row."""
        return self.df.loc[self.now_step, 'BTC']

    def _get_now_state(self):
        """Return [held position, current rate, cash in hand] as a float array."""
        state = np.empty(3)
        state[0] = self.hold_a_position  # position currently held
        state[1] = self.now_price        # current rate
        state[2] = self.cash_in_hand     # cash held (JPY per the original comment)
        return state

    def _get_revenue(self):
        """Return total equity: position marked at the current price plus cash."""
        return self.hold_a_position * self.now_price + self.cash_in_hand

    def _trade(self, action,lastorder = False):
        """Apply `action` to the portfolio; on the last step liquidate instead.

        A buy spends (almost) all cash on a position rounded down to 4 decimals;
        a sell closes the whole position. A trade counts as a win when cash after
        closing exceeds the cash recorded just before the matching buy.
        """
        if lastorder:
            # Final step: close any open position regardless of `action`.
            # NOTE(review): this path neither sets `sell_price` (so the last step
            # always yields reward 0) nor charges commission - confirm intended.
            if self.hold_a_position != 0:
                self.cash_in_hand += self.now_price * self.hold_a_position
                self.hold_a_position = 0
                self.trade_time += 1
                if self.cash_in_hand > self.before_buy_cash:
                    self.trade_win += 1
            return

        # Buy: only when flat.
        if self.action_space[0] == action and self.hold_a_position == 0:
            self.before_buy_cash = self.cash_in_hand
            min_cost = self.now_price * self.min_trade_btc + self.commission * self.now_price
            if self.cash_in_hand < min_cost:
                return

            # Largest affordable size after commission, floored to 4 decimals.
            self.hold_a_position = math.floor(((self.cash_in_hand/self.now_price) - self.commission) * 10000) / 10000
            # Assign (not accumulate): this is the cost basis of this position only.
            self.buy_price = self.now_price * self.hold_a_position
            self.cash_in_hand -= self.now_price * self.hold_a_position + self.commission * self.now_price

        # Sell: only when holding a position.
        if self.action_space[2] == action and self.hold_a_position != 0:
            self.sell_price += self.now_price * self.hold_a_position
            self.cash_in_hand += self.now_price * self.hold_a_position - self.commission * self.now_price
            self.hold_a_position = 0
            self.trade_time += 1
            if self.cash_in_hand > self.before_buy_cash:
                self.trade_win += 1


In [16]:
class Brain:
    """Q-value approximator: a small dense network mapping a 3-feature state
    to one Q-value per action (3 outputs), trained with MSE and RMSprop."""

    def __init__(self):
        # Discount factor applied to the best next-state Q-value in `train`.
        self.gamma = 0.9
        optimizer = RMSprop()

        model = Sequential()
        model.add(Dense(3, input_shape=(3,)))
        model.add(ReLU())
        model.add(Dense(3))
        model.add(ReLU())
        model.add(Dense(3))
        model.compile(loss="mse", optimizer=optimizer)

        # summary() prints the table itself and returns None, so wrapping it in
        # print() only added a stray "None" line to the cell output.
        model.summary()
        self.model = model

    def train(self, state, action, reward, next_state, done):
        """Run one Q-learning update for a single transition.

        The target equals the current prediction except at `action`, where it is
        `reward` for a terminal transition and
        `reward + gamma * max(Q(next_state))` otherwise.
        `state` / `next_state` are passed to the model as-is, so they must
        already be batched (the caller passes arrays of shape (1, 3)).
        """
        # predict_on_batch skips the predict() loop machinery (and its progress
        # output), which matters because this runs once per environment step.
        q = self.model.predict_on_batch(state)
        next_q = self.model.predict_on_batch(next_state)
        target = np.copy(q)
        if done:
            target[:, action] = reward
        else:
            target[:, action] = reward + self.gamma*np.max(next_q, axis=1)
        self.model.train_on_batch(state, target)

    def predict(self, state):
        """Return the model's Q-values for a batched `state`."""
        return self.model.predict_on_batch(state)

    def load(self, name):
        """Load model weights from the file path `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file path `name`."""
        self.model.save_weights(name)

In [17]:
class Agent(Brain):
    """Epsilon-greedy policy on top of the Q-network provided by `Brain`."""

    def __init__(self):

        super().__init__()
        self.epsilon = 1.0        # current exploration probability
        self.epsilon_min = 0.01   # decay stops once epsilon is no longer above this
        self.r = 0.995            # multiplicative decay applied per `act` call

    def act(self, state):
        """Pick an action index (0..2) for a batched `state`.

        With probability `epsilon` a uniformly random action is returned,
        otherwise the argmax of the predicted Q-values. Epsilon is decayed on
        every call while it is above `epsilon_min`.
        """
        explore = np.random.rand() <= self.epsilon
        # Decay before returning. The decay used to sit after the exploration
        # `return`; since rand() < 1.0 always holds, epsilon stayed at 1.0
        # forever and the greedy branch was never reached.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.r
        if explore:
            return np.random.choice(3)
        act_values = self.predict(state)
        return np.argmax(act_values[0])

In [18]:
class Main:
    """Drives episodes of `agent` against `env` and records per-episode results.

    Parameters
    ----------
    env : object providing `reset()`, `step(action)`, `action_space`,
        `df_total_steps` (as used below).
    agent : object providing `act`, `train`, `load`, `save` and `epsilon`.
    mdl_dir : str
        Directory where '<name>.h5' (weights) and '<name>.pkl' (scaler) live.
    name : str
        Base file name for the saved artifacts.
    episodes_times : int
        Number of episodes to play.
    mode : str
        'train' trains and saves the agent; 'test' loads saved weights/scaler
        and fixes epsilon at 0.01.
    """

    def __init__(self, env, agent, mdl_dir, name, episodes_times = 1000, mode = 'test'):
        self.env = env
        self.agent = agent
        self.mdl_dir = mdl_dir
        self.scaler = self._standard_scaler(self.env)
        self.episodes_times = episodes_times
        self.mode = mode
        self.name = name

        self.df_rec = pd.DataFrame(index=[], columns=['FixedProfit','TradeTimes','TradeWin'])

        if self.mode == 'test':
            # Replaces the freshly fitted scaler with the one saved at training time.
            self._load()
            self.agent.epsilon = 0.01

    def play_game(self):
        """Play `episodes_times` episodes, then save the model (train mode) and the CSV.

        Progress is shown with one Progbar per group of 10 episodes whose
        displayed metrics are the means over the episodes of that group.
        """
        metrics_names = ['FixedProfit','TradeTimes','TradeWin']
        records = []

        for episode in range(self.episodes_times):

            if (episode % 10 == 0):
                # The last group may be shorter than 10. Deriving its size from
                # the remaining count also works for fewer than 10 episodes,
                # where the former string slicing raised ValueError.
                remaining = self.episodes_times - episode
                pb_i = Progbar(min(10, remaining), stateful_metrics=metrics_names)
                p_mean, trade_time, win_time = [], [], []

            state = self.env.reset()
            state = self.scaler.transform([state])
            done = False

            while not done:
                action = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = self.scaler.transform([next_state])
                reward = self._reward_clipping(reward)

                if self.mode == 'train':
                    self.agent.train(state, action, reward, next_state, done)

                state = next_state

            # `info` is the dict returned by the terminal step of this episode.
            records.append([info['cur_revenue'], info['trade_time'], info['trade_win']])
            p_mean.append(info['cur_revenue'])
            trade_time.append(info['trade_time'])
            win_time.append(info['trade_win'])
            values=[('FixedProfit',int(np.mean(p_mean))), ('TradeTimes',int(np.mean(trade_time))), ('TradeWin',int(np.mean(win_time)))]
            pb_i.add(1, values=values)

        if records:
            # Build the frame once: DataFrame.append was removed in pandas 2.0 and
            # copied the whole frame on every episode.
            new_rows = pd.DataFrame(records, columns=self.df_rec.columns)
            if self.df_rec.empty:
                self.df_rec = new_rows
            else:
                self.df_rec = pd.concat([self.df_rec, new_rows], ignore_index=True)

        if self.mode == 'train':
            self._save()
        self._save_csv()

    def _standard_scaler(self, env):
        """Fit a StandardScaler on states visited by one random-action rollout."""
        # Start from the first row so the rollout does not depend on whatever
        # position the environment was left in.
        env.reset()
        states = []
        for _ in range(env.df_total_steps):
            action = np.random.choice(env.action_space)
            state, reward, done, info = env.step(action)
            states.append(state)
            if done:
                break

        scaler = StandardScaler()
        scaler.fit(states)
        return scaler

    def _reward_clipping(self, val):
        """Return the sign of `val`: 1 if positive, 0 if zero, -1 if negative."""
        result = 1 if val > 0 else 0 if val == 0 else -1
        return result

    def _load(self):
        """Load the pickled scaler and the agent weights from `mdl_dir`."""
        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # scaler files that this notebook itself produced.
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'rb') as f:
            self.scaler = pickle.load(f)
        self.agent.load('{}/{}.h5'.format(self.mdl_dir, self.name))

    def _save(self):
        """Save the agent weights and the pickled scaler into `mdl_dir`."""
        self.agent.save('{}/{}.h5'.format(self.mdl_dir, self.name))
        with open('{}/{}.pkl'.format(self.mdl_dir, self.name), 'wb') as f:
            pickle.dump(self.scaler, f)

    def _save_csv(self):
        """Write the per-episode records to the module-level `csv_path`."""
        # NOTE: `csv_path` is a notebook global defined in the configuration cell.
        self.df_rec.to_csv(csv_path)

In [None]:
# Experiment settings: starting cash, number of episodes, and the commission,
# which is only charged when `level` is not 1.
initial_money = 100000
episodes_times = 100
if level == 1:
    commission = 0
else:
    commission = 0.002

# Wire agent, environment and runner together, then play all episodes.
agent = Agent()
env = Environment(
    df,
    initial_money=initial_money,
    mode=mode,
    commission=commission,
)
main = Main(env, agent, mdl_dir, name, episodes_times, mode)
main.play_game()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 3)                 12        
                                                                 
 re_lu_2 (ReLU)              (None, 3)                 0         
                                                                 
 dense_4 (Dense)             (None, 3)                 12        
                                                                 
 re_lu_3 (ReLU)              (None, 3)                 0         
                                                                 
 dense_5 (Dense)             (None, 3)                 12        
                                                                 
Total params: 36
Trainable params: 36
Non-trainable params: 0
_________________________________________________________________
None