In [21]:
import pandas as pd
import numpy as np

from stable_baselines import A2C, DQN
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

from dmarket.environments import SingleAgentTrainingEnv
from dmarket.agents import UniformRandomAgent, GymRLAgent
from dmarket.info_settings import OfferInformationSetting
from dmarket.engine import MarketEngine

Let's create a training environment for stable-baselines with 5 fixed sellers, 5 fixed buyers, and 1 RL agent (a buyer) to be trained.

In [22]:
# Fixed opponents: five sellers created first, then five buyers. The numeric
# argument is presumably each agent's reservation price (90 for sellers,
# 110 for buyers) -- confirm against UniformRandomAgent.
fixed_agents = (
    [UniformRandomAgent('seller', 90) for _ in range(5)]
    + [UniformRandomAgent('buyer', 110) for _ in range(5)]
)

# The agent to be trained acts as a buyer.
rl_agent = GymRLAgent('buyer', 110, discretization=20)

# Information setting shared by training and evaluation.
setting = OfferInformationSetting(5)

def get_env(rl_agent, fixed_agents, setting):
    """Build the single-agent training environment.

    Args:
        rl_agent: the agent to be trained.
        fixed_agents: the other agents taking part in the environment.
        setting: the information setting handed to the environment.

    Returns:
        A new ``SingleAgentTrainingEnv`` built from the three arguments.
    """
    training_env = SingleAgentTrainingEnv(rl_agent, fixed_agents, setting)
    return training_env

# Wrap it for baselines: DummyVecEnv takes a list of zero-argument
# environment factories, hence the lambda around get_env.
env = DummyVecEnv([
    lambda: get_env(rl_agent, fixed_agents, setting),
])

Initialize a deep Q-network (DQN) model with an MLP policy and a high learning rate:

In [23]:
# DQN with the built-in MLP policy; the learning rate is deliberately high.
model = DQN(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.05,
)

In [24]:
%%time
# Train the DQN model for 10000 environment steps. learn() is the cell's last
# expression, so its return value (the model object) is shown as the output.
model.learn(total_timesteps=10000)

--------------------------------------
| % time spent exploring  | 48       |
| episodes                | 100      |
| mean 100 episode reward | 6.3      |
| steps                   | 527      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 5        |
| episodes                | 200      |
| mean 100 episode reward | 5.8      |
| steps                   | 967      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 300      |
| mean 100 episode reward | 7.3      |
| steps                   | 1124     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 400      |
| mean 100 episode reward | 10.5     |
| steps                   | 1380     |
--------------------------------------
--------------------------------------
| % time spent exploring 

<stable_baselines.deepq.dqn.DQN at 0x7f7fc838f490>

Let's put this agent to the test

In [25]:
# Attach the trained model to the same agent object that was handed to the
# training environment, so it can be used in the evaluation games below
# (presumably consulted by the agent's get_offer -- confirm in GymRLAgent).
rl_agent.model = model

In [26]:
def get_reward(agent, deals):
    """Return the surplus ``agent`` earned from ``deals``.

    Args:
        agent: object exposing ``name``, ``role`` and ``reservation_price``.
        deals: mapping from agent name to the price that agent dealt at.

    Returns:
        0 when the agent's name is not in ``deals``. Otherwise
        ``reservation_price - price`` for a ``'buyer'`` and
        ``price - reservation_price`` for any other role.
    """
    if agent.name not in deals:
        return 0

    price = deals[agent.name]
    reserve = agent.reservation_price
    return (reserve - price) if agent.role == 'buyer' else (price - reserve)

def play_games(agents, setting, n_games=100, max_steps=30):
    """Play ``n_games`` market games and record every agent's reward.

    Args:
        agents: agents exposing ``name``, ``role`` (``'buyer'`` or
            ``'seller'``) and ``get_offer(observation)``.
        setting: information setting; ``setting.get_states(ids, market)`` must
            return a mapping from agent name to that agent's observation.
        n_games: number of games to play.
        max_steps: forwarded to ``MarketEngine`` as ``max_steps``.

    Returns:
        A DataFrame with one row per game and one column per agent name
        (buyers first, then sellers, each in the order given in ``agents``).
        Each cell holds the reward from the last step in which the agent was
        still unmatched: its deal surplus, or 0 if it never made a deal.
    """
    buyer_ids = [agent.name for agent in agents if agent.role == 'buyer']
    seller_ids = [agent.name for agent in agents if agent.role == 'seller']

    # Ordered, de-duplicated names give a reproducible column order; a bare
    # set would make the column order vary from one kernel to the next.
    ordered_ids = list(dict.fromkeys(buyer_ids + seller_ids))
    # The set form is kept for the comparison against ``market.done``.
    ids = set(ordered_ids)
    market = MarketEngine(buyer_ids, seller_ids, max_steps=max_steps)

    rewards = pd.DataFrame(0, index=np.arange(n_games), columns=ordered_ids)
    for game_idx in range(n_games):
        while market.done != ids:
            observations = setting.get_states(ids, market)
            unmatched_agents = [
                agent for agent in agents
                if agent.name not in market.done
            ]
            offers = {
                agent.name: agent.get_offer(observations[agent.name])
                for agent in unmatched_agents
            }
            deals = market.step(offers)
            for agent in unmatched_agents:
                # Single-step label-based write. Chained indexing
                # (``rewards[col][row] = ...``) may assign to a temporary copy
                # and is a no-op under pandas Copy-on-Write.
                rewards.at[game_idx, agent.name] = get_reward(agent, deals)
        market.reset()
    return rewards

In this simple situation, the agent can get decent rewards:

In [27]:
# Ten evaluation games: the fixed agents plus the trained RL buyer.
play_games([*fixed_agents, rl_agent], setting, n_games=10)

Unnamed: 0,Unif_S90_da0e,Unif_B110_8441,Unif_S90_78b0,Unif_S90_5c86,Unif_B110_ed92,GymR_B110_9e2b,Unif_B110_1200,Unif_S90_d60b,Unif_S90_499d,Unif_B110_5ed6,Unif_B110_8e30
0,11,14,5,10,9,15,8,6,4,0,13
1,11,0,15,5,8,14,4,9,5,10,14
2,6,11,8,9,13,15,1,4,18,10,0
3,10,0,10,6,9,16,13,3,10,9,9
4,11,10,8,3,13,16,8,9,6,0,11
5,7,10,6,7,12,15,12,9,4,13,0
6,10,3,12,7,7,0,9,4,16,12,15
7,10,6,13,6,0,13,9,9,5,14,10
8,11,15,7,4,5,0,12,11,14,8,8
9,15,5,10,3,16,16,4,14,3,0,9


In [28]:
# Summary statistics over 100 evaluation games with the trained RL buyer.
play_games([*fixed_agents, rl_agent], setting, n_games=100).describe()

Unnamed: 0,Unif_S90_da0e,Unif_B110_8441,Unif_S90_78b0,Unif_S90_5c86,Unif_B110_ed92,GymR_B110_9e2b,Unif_B110_1200,Unif_S90_d60b,Unif_S90_499d,Unif_B110_5ed6,Unif_B110_8e30
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,8.36,6.9,8.7,8.83,7.72,13.39,8.06,7.99,8.92,7.76,7.61
std,3.825803,5.244526,3.597137,3.819289,4.813869,4.334254,4.737173,3.445081,4.046897,4.807654,4.18727
min,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,5.75,0.0,6.0,5.75,5.0,14.0,5.0,5.0,6.0,4.0,5.0
50%,8.0,8.0,9.0,9.0,9.0,15.0,9.0,8.0,9.0,9.0,9.0
75%,11.0,11.0,11.0,11.25,11.0,15.0,11.0,10.0,11.25,11.25,10.25
max,17.0,18.0,18.0,18.0,19.0,16.0,18.0,19.0,17.0,16.0,17.0


Let's see if it can act as a seller as well:

In [29]:
# Reuse the buyer-trained model inside a seller agent. discretization is
# passed explicitly so it matches the value rl_agent was created with (20)
# when the model was trained, instead of relying on the constructor default.
rl_seller = GymRLAgent('seller', 90, model=model, name='myseller', discretization=20)

In [30]:
# Summary statistics over 100 evaluation games with the model acting as a seller.
play_games([*fixed_agents, rl_seller], setting, n_games=100).describe()

Unnamed: 0,Unif_S90_da0e,Unif_B110_8441,Unif_S90_78b0,myseller,Unif_S90_5c86,Unif_B110_ed92,Unif_B110_1200,Unif_S90_d60b,Unif_S90_499d,Unif_B110_5ed6,Unif_B110_8e30
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,7.36,9.29,7.08,11.67,6.78,9.29,9.05,8.05,7.07,9.77,9.4
std,4.639575,3.382352,4.423719,3.853517,4.569508,3.343303,3.854356,4.515685,5.19762,3.979379,3.200379
min,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,1.0
25%,3.75,7.0,5.0,12.0,3.0,7.0,6.0,5.0,0.75,6.0,7.0
50%,8.0,9.0,8.0,13.0,7.0,9.0,9.0,9.0,8.0,10.0,9.0
75%,10.25,11.0,10.0,14.0,10.0,12.0,11.25,11.0,11.25,13.0,11.0
max,16.0,18.0,14.0,15.0,19.0,17.0,18.0,18.0,17.0,18.0,19.0


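We see it performs slightly worse as a seller (mean reward 11.67) than as a buyer (mean reward 13.39), but still better than the fixed agents.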