In [23]:
## Taken from https://github.com/marload/DeepRL-TensorFlow2 ##

import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, concatenate
# import tensorflow_federated as tff

In [24]:
import gym
import argparse
import numpy as np
import random
from collections import deque

tf.keras.backend.set_floatx('float64')

# parser = argparse.ArgumentParser()
# parser.add_argument('--gamma', type=float, default=0.99)
# parser.add_argument('--actor_lr', type=float, default=0.0005)
# parser.add_argument('--critic_lr', type=float, default=0.001)
# parser.add_argument('--batch_size', type=int, default=64)
# parser.add_argument('--tau', type=float, default=0.05)
# parser.add_argument('--train_start', type=int, default=2000)

# args = parser.parse_args()



# class Args:
#     gamma = 0.99
#     actor_lr = 0.0005
#     critic_lr = 0.0005
#     batch_size = 64
#     tau = 0.05
#     train_start = 400
#     episodes = 10
#     N = 3
#     epochs = 100

# args = Args()

In [25]:
class ReplayBuffer:
    def __init__(self, capacity=20000):
        self.buffer = deque(maxlen=capacity)
    
    def put(self, state, action, reward, next_state, done):
        self.buffer.append([state, action, reward, next_state, done])
    
    def sample(self):
        sample = random.sample(self.buffer, wandb.config.batch_size)
        states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
        states = np.array(states).reshape(wandb.config.batch_size, -1)
        next_states = np.array(next_states).reshape(wandb.config.batch_size, -1)
        return states, actions, rewards, next_states, done
    
    def size(self):
        return len(self.buffer)

In [26]:
class Actor:
    def __init__(self, state_dim, action_dim, action_bound):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.actor_lr)

    def create_model(self):
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(wandb.config.actor['layer1'], activation='relu'),
            Dense(wandb.config.actor['layer2'], activation='relu'),
            Dense(self.action_dim, activation='tanh'),
            Lambda(lambda x: x * self.action_bound)
        ])

    def train(self, states, q_grads):
        with tf.GradientTape() as tape:
            grads = tape.gradient(self.model(states), self.model.trainable_variables, -q_grads)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
    
    def predict(self, state):
        return self.model.predict(state)

    def get_action(self, state):
        state = np.reshape(state, [1, self.state_dim])
        return self.model.predict(state)[0]

In [27]:
class Critic:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(wandb.config.critic_lr)

    def create_model(self):
        state_input = Input((self.state_dim,))
        s1 = Dense(wandb.config.critic['state1'], activation='relu')(state_input)
        s2 = Dense(wandb.config.critic['state2'], activation='relu')(s1)
        action_input = Input((self.action_dim,))
        a1 = Dense(wandb.config.critic['actor1'], activation='relu')(action_input)
        c1 = concatenate([s2, a1], axis=-1)
        c2 = Dense(wandb.config.critic['cat1'], activation='relu')(c1)
        output = Dense(1, activation='linear')(c2)
        return tf.keras.Model([state_input, action_input], output)
    
    def predict(self, inputs):
        return self.model.predict(inputs)
    
    def q_grads(self, states, actions):
        actions = tf.convert_to_tensor(actions)
        with tf.GradientTape() as tape:
            tape.watch(actions)
            q_values = self.model([states, actions])
            q_values = tf.squeeze(q_values)
        return tape.gradient(q_values, actions)

    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, actions, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model([states, actions], training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [28]:
class Agent:
    def __init__(self, env, iden = 0):
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.action_bound = self.env.action_space.high[0]

        self.buffer = ReplayBuffer()

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.critic = Critic(self.state_dim, self.action_dim)
        
        self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
        self.target_critic = Critic(self.state_dim, self.action_dim)

        actor_weights = self.actor.model.get_weights()
        critic_weights = self.critic.model.get_weights()
        self.target_actor.model.set_weights(actor_weights)
        self.target_critic.model.set_weights(critic_weights)
        
        self.iden = iden
        
    
    def target_update(self):
        actor_weights = self.actor.model.get_weights()
        t_actor_weights = self.target_actor.model.get_weights()
        critic_weights = self.critic.model.get_weights()
        t_critic_weights = self.target_critic.model.get_weights()

        for i in range(len(actor_weights)):
            t_actor_weights[i] = wandb.config.tau * actor_weights[i] + (1 - wandb.config.tau) * t_actor_weights[i]

        for i in range(len(critic_weights)):
            t_critic_weights[i] = wandb.config.tau * critic_weights[i] + (1 - wandb.config.tau) * t_critic_weights[i]
        
        self.target_actor.model.set_weights(t_actor_weights)
        self.target_critic.model.set_weights(t_critic_weights)


    def td_target(self, rewards, q_values, dones):
        targets = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                targets[i] = rewards[i]
            else:
                targets[i] = wandb.config.gamma * q_values[i]
        return targets

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch
    
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (mu-x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)
    
    def replay(self):
        for _ in range(10):
            states, actions, rewards, next_states, dones = self.buffer.sample()
            target_q_values = self.target_critic.predict([next_states, self.target_actor.predict(next_states)])
            td_targets = self.td_target(rewards, target_q_values, dones)
            
            self.critic.train(states, actions, td_targets)
            
            s_actions = self.actor.predict(states)
            s_grads = self.critic.q_grads(states, s_actions)
            grads = np.array(s_grads).reshape((-1, self.action_dim))
            self.actor.train(states, grads)
            self.target_update()

    def train(self, max_episodes=1000):
        for ep in range(max_episodes):      # train a bunch of episodes
            episode_reward, done = 0, False

            state = self.env.reset()
            bg_noise = np.zeros(self.action_dim)
            while not done:    # run till done by hitting the action that's done
#                 self.env.render()
                action = self.actor.get_action(state)   # pick an action, add noise, clip the action
                noise = self.ou_noise(bg_noise, dim=self.action_dim)
                action = np.clip(action + noise, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                self.buffer.put(state, action, (reward+8)/8, next_state, done)
                bg_noise = noise     # why does the noise wander in such a weird way
                episode_reward += reward
                state = next_state
                
            if self.buffer.size() >= wandb.config.batch_size and self.buffer.size() >= wandb.config.train_start:    # update the states if enough
                self.replay()                
            print('EP{} EpisodeReward={}'.format(ep, episode_reward))
            wandb.log({'Reward' + str(self.iden): episode_reward})
            
        return episode_reward
    


In [29]:
# def main():
if __name__ == "__main__":
    
    try: wandb.finish()
    except: pass
    
    ####configurations
    wandb.init(name='DDPG-long-single', project="deep-rl-tf2")
    env_name = 'Pendulum-v0'

    
    wandb.config.gamma = 0.99
    wandb.config.actor_lr = 0.001
    wandb.config.critic_lr = 0.0001
    wandb.config.batch_size = 64
    wandb.config.tau = 0.005
    wandb.config.train_start = 400
    wandb.config.episodes = 5
    wandb.config.num = 1
    wandb.config.epochs = 400

    wandb.config.actor = {'layer1': 128, 'layer2' : 128}
    wandb.config.critic = {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}
    
    print(wandb.config)
    
    # main run    
    N = wandb.config.num
    agents = []
    
    # set up the agent
    for i in range(N):
        env_t = gym.make(env_name)
        agents.append(Agent(env_t, i))

    # start the training
    for z in range(wandb.config.epochs):

        reward = 0
        # train the agent
        for j in range(len(agents)):
            print('Training Agent {}'.format(agents[j].iden))
            reward += agents[j].train(wandb.config.episodes)
    
        reward = reward / N
        print('Epoch={}\t Average reward={}'.format(z, reward))
        wandb.log({'batch': z, 'Epoch': reward})


        # get the average - actor and critic
        critic_avg = []
        actor_avg = []

        for i in range(len(agents[0].actor.model.get_weights())):
            
            actor_t = agents[0].actor.model.get_weights()[i]

            for j in range(1, N):
                actor_t += agents[j].actor.model.get_weights()[i]

            actor_t = actor_t / N
            actor_avg.append(actor_t)


        for i in range(len(agents[0].critic.model.get_weights())):
            critic_t = agents[0].critic.model.get_weights()[i]

            for j in range(1, N):
                critic_t += agents[j].critic.model.get_weights()[i]

            critic_t = critic_t / N
            critic_avg.append(critic_t)


        # set the average
        for j in range(N):
            agents[j].actor.model.set_weights(actor_avg)
            agents[j].critic.model.set_weights(critic_avg)


    # wrtie things out
    for j in range(N):
        with open("agent{}-actor.txt".format(j), "w") as f:
            f.write(str(agents[j].actor.model.get_weights()))
            f.close()
        wandb.save("agent{}-actor.txt".format(j))
        
        
        with open("agent{}-critic.txt".format(j), "w") as f:
            f.write(str(agents[j].critic.model.get_weights()))
            f.close()
        wandb.save("agent{}-critic.txt".format(j))

    
    wandb.finish()
    
# if __name__ == "__main__":
#     main()
    

{'gamma': 0.99, 'actor_lr': 0.001, 'critic_lr': 0.0001, 'batch_size': 64, 'tau': 0.005, 'train_start': 400, 'episodes': 5, 'num': 1, 'epochs': 200, 'actor': {'layer1': 128, 'layer2': 128}, 'critic': {'state1': 256, 'state2': 128, 'actor1': 128, 'cat1': 64}}
Training Agent 0
EP0 EpisodeReward=-1322.7037942237268
EP1 EpisodeReward=-1168.1260875273176
EP2 EpisodeReward=-1043.6276087472104
EP3 EpisodeReward=-1577.6668855669561
EP4 EpisodeReward=-1523.592785954304
Epoch=0	 Average reward=-1523.592785954304
Training Agent 0
EP0 EpisodeReward=-1538.7762605577761
EP1 EpisodeReward=-1556.510539357346
EP2 EpisodeReward=-1583.1704520809099
EP3 EpisodeReward=-1584.8621237550403
EP4 EpisodeReward=-1520.0664558755143
Epoch=1	 Average reward=-1520.0664558755143
Training Agent 0
EP0 EpisodeReward=-1637.6092407501499
EP1 EpisodeReward=-1617.8484384568048
EP2 EpisodeReward=-1478.1804375334257
EP3 EpisodeReward=-1412.6448458876316
EP4 EpisodeReward=-1554.3231358117396
Epoch=2	 Average reward=-1554.323135

EP0 EpisodeReward=-1532.1253877122117
EP1 EpisodeReward=-1505.4308261595365
EP2 EpisodeReward=-1507.7122721184487
EP3 EpisodeReward=-1628.3214412777613
EP4 EpisodeReward=-1516.9848309065228
Epoch=32	 Average reward=-1516.9848309065228
Training Agent 0
EP0 EpisodeReward=-1178.9830469131023
EP1 EpisodeReward=-1562.897262070943
EP2 EpisodeReward=-1174.7399568632422
EP3 EpisodeReward=-1269.5140641438766
EP4 EpisodeReward=-1609.1487418962145
Epoch=33	 Average reward=-1609.1487418962145
Training Agent 0
EP0 EpisodeReward=-1548.5629850745925
EP1 EpisodeReward=-1100.7691588501173
EP2 EpisodeReward=-1557.167151526689
EP3 EpisodeReward=-1625.6729039585516
EP4 EpisodeReward=-1345.7809236809687
Epoch=34	 Average reward=-1345.7809236809687
Training Agent 0
EP0 EpisodeReward=-1589.3973779163807
EP1 EpisodeReward=-1515.9625435225769
EP2 EpisodeReward=-1499.9361724804598
EP3 EpisodeReward=-1529.7072022997456
EP4 EpisodeReward=-1355.544361191
Epoch=35	 Average reward=-1355.544361191
Training Agent 0
EP

EP0 EpisodeReward=-1635.7512411060477
EP1 EpisodeReward=-1186.22066311141
EP2 EpisodeReward=-1269.5818128052258
EP3 EpisodeReward=-1191.9576275891905
EP4 EpisodeReward=-1509.5575400201758
Epoch=65	 Average reward=-1509.5575400201758
Training Agent 0
EP0 EpisodeReward=-1144.791821946767
EP1 EpisodeReward=-1393.7906601658115
EP2 EpisodeReward=-1502.2845004223768
EP3 EpisodeReward=-985.6591893977715
EP4 EpisodeReward=-1231.2479467626636
Epoch=66	 Average reward=-1231.2479467626636
Training Agent 0
EP0 EpisodeReward=-1195.8839690719406
EP1 EpisodeReward=-1288.1853414635098
EP2 EpisodeReward=-1350.0241818814545
EP3 EpisodeReward=-1365.022100201039
EP4 EpisodeReward=-1204.2790504892296
Epoch=67	 Average reward=-1204.2790504892296
Training Agent 0
EP0 EpisodeReward=-1505.7866785446668
EP1 EpisodeReward=-1135.1895461900103
EP2 EpisodeReward=-1168.8396976707168
EP3 EpisodeReward=-1390.9086212833865
EP4 EpisodeReward=-1651.8067711692677
Epoch=68	 Average reward=-1651.8067711692677
Training Agent

EP0 EpisodeReward=-1164.0732897003654
EP1 EpisodeReward=-1327.4525601396776
EP2 EpisodeReward=-1377.6658913054564
EP3 EpisodeReward=-1439.8608305569885
EP4 EpisodeReward=-1442.1895995123975
Epoch=98	 Average reward=-1442.1895995123975
Training Agent 0
EP0 EpisodeReward=-1501.3292258478123
EP1 EpisodeReward=-1500.2427892282112
EP2 EpisodeReward=-1496.8963019718708
EP3 EpisodeReward=-1532.2306614261372
EP4 EpisodeReward=-1446.6840893971512
Epoch=99	 Average reward=-1446.6840893971512
Training Agent 0
EP0 EpisodeReward=-1450.9712495421188
EP1 EpisodeReward=-1394.5773197329872
EP2 EpisodeReward=-1386.0276142441298
EP3 EpisodeReward=-1479.6606884912644
EP4 EpisodeReward=-1331.0278756780494
Epoch=100	 Average reward=-1331.0278756780494
Training Agent 0
EP0 EpisodeReward=-889.4078448694921
EP1 EpisodeReward=-883.2858591438846
EP2 EpisodeReward=-1376.0892434750856
EP3 EpisodeReward=-1231.0329445792818
EP4 EpisodeReward=-1205.7064613612843
Epoch=101	 Average reward=-1205.7064613612843
Training 

EP4 EpisodeReward=-1435.002268353166
Epoch=130	 Average reward=-1435.002268353166
Training Agent 0
EP0 EpisodeReward=-1531.310336614398
EP1 EpisodeReward=-1628.4245735393433
EP2 EpisodeReward=-1463.3424503897113
EP3 EpisodeReward=-1648.760714871811
EP4 EpisodeReward=-1511.1124490568254
Epoch=131	 Average reward=-1511.1124490568254
Training Agent 0
EP0 EpisodeReward=-1616.6621472639838
EP1 EpisodeReward=-1436.7923175430492
EP2 EpisodeReward=-1546.9494577304063
EP3 EpisodeReward=-1556.0655941309803
EP4 EpisodeReward=-1546.9713490093436
Epoch=132	 Average reward=-1546.9713490093436
Training Agent 0
EP0 EpisodeReward=-1538.9541096345863
EP1 EpisodeReward=-1498.5160746205895
EP2 EpisodeReward=-1560.189029309362
EP3 EpisodeReward=-1562.0394058928937
EP4 EpisodeReward=-1492.2199404813891
Epoch=133	 Average reward=-1492.2199404813891
Training Agent 0
EP0 EpisodeReward=-1491.647083826834
EP1 EpisodeReward=-1378.4780096383583
EP2 EpisodeReward=-1342.1791026125088
EP3 EpisodeReward=-1458.31609175

EP2 EpisodeReward=-1435.464981822563
EP3 EpisodeReward=-1422.1433953932456
EP4 EpisodeReward=-1501.180246420052
Epoch=163	 Average reward=-1501.180246420052
Training Agent 0
EP0 EpisodeReward=-1518.263390497734
EP1 EpisodeReward=-1457.851878097258
EP2 EpisodeReward=-1447.145718424395
EP3 EpisodeReward=-1526.8538977685064
EP4 EpisodeReward=-1555.230040762797
Epoch=164	 Average reward=-1555.230040762797
Training Agent 0
EP0 EpisodeReward=-1474.505730248403
EP1 EpisodeReward=-1464.500830882294
EP2 EpisodeReward=-1635.2329645397438
EP3 EpisodeReward=-1385.0652627758527
EP4 EpisodeReward=-1561.737359131706
Epoch=165	 Average reward=-1561.737359131706
Training Agent 0
EP0 EpisodeReward=-1516.666567367618
EP1 EpisodeReward=-1652.9524100402346
EP2 EpisodeReward=-1489.5129692017588
EP3 EpisodeReward=-1470.8800327502017
EP4 EpisodeReward=-1518.0210537840667
Epoch=166	 Average reward=-1518.0210537840667
Training Agent 0
EP0 EpisodeReward=-1519.9925389160894
EP1 EpisodeReward=-1614.9745040475834
E

EP0 EpisodeReward=-1528.3005759660389
EP1 EpisodeReward=-1527.1124983747459
EP2 EpisodeReward=-1503.3122661737964
EP3 EpisodeReward=-1199.0681960146796
EP4 EpisodeReward=-1223.4798154718562
Epoch=196	 Average reward=-1223.4798154718562
Training Agent 0
EP0 EpisodeReward=-1509.1391380847497
EP1 EpisodeReward=-1589.8859207679927
EP2 EpisodeReward=-1203.069437080587
EP3 EpisodeReward=-1455.442991218841
EP4 EpisodeReward=-1634.3634034413599
Epoch=197	 Average reward=-1634.3634034413599
Training Agent 0
EP0 EpisodeReward=-1580.4281665335286
EP1 EpisodeReward=-1568.1379216782775
EP2 EpisodeReward=-1160.47869575608
EP3 EpisodeReward=-1543.6386560815056
EP4 EpisodeReward=-1524.496858561221
Epoch=198	 Average reward=-1524.496858561221
Training Agent 0
EP0 EpisodeReward=-1452.2057774009354
EP1 EpisodeReward=-1545.912842699872
EP2 EpisodeReward=-1507.0758644998489
EP3 EpisodeReward=-1414.1748460976828
EP4 EpisodeReward=-1374.496612724475
Epoch=199	 Average reward=-1374.496612724475


0,1
Reward0,-1374.49661
_step,1199.0
_runtime,8324.0
_timestamp,1605730101.0
batch,199.0
Epoch,-1374.49661


0,1
Reward0,▂▁▄▂▅▃▃▂█▂▃▄▃█▂▄▃▄▃▄▄▃▃▃▁▂▂▃▃▆▁▃▃▃▂▄▂▂▃▂
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
batch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Epoch,▃▃▃▃▄▁▃▅▅▃▃▄█▆▂▃▄▄▃█▆▃▃▂▄▃▃▃▃▂▄▃▃▃▄▃▃▃▃▅


In [17]:
act_name = "agent{}-actor.h5".format(0)
crit_name = "agent{}-critic.h5".format(0)

agents[0].saver(act_name, crit_name)

# def saver(model, na):
#     model.save(na)
    
# agents[0].actor.model.get_weights()
# saver(agents[0].actor.model, name)

AttributeError: 'Agent' object has no attribute 'saver'

In [35]:
# import pandas
wandb.finish()
api = wandb.Api()
run = api.run("victor-qin/deep-rl-tf2/1s1ac3wo")
temp = run.history()
# print(run.scan_history())
# enumerate(run.history())

In [26]:
if run.state == "finished":
    for i, row in enumerate(run.scan_history()):
        try: print(row["_runtime"],'\t', row["Epoch"])
        except: pass

204 	 -1397.4979261457054
411 	 -1314.97677473863
616 	 -1675.7039031101483
819 	 -1728.7964805138092
1026 	 -1527.2705949310778
1237 	 -1578.1663064042016
1447 	 -1562.6616137080212
1656 	 -1570.4740155175275
1866 	 -1766.325876451435
2070 	 -1645.5299593648642
2285 	 -1418.3971006402744
2502 	 -1555.6624956007024
2718 	 -1214.3700774544943
2932 	 -1561.5437283128983
3144 	 -1403.9980968619263
3407 	 -1367.411475604592
3623 	 -1393.962258191116
3830 	 -1406.6523690831484
4029 	 -1101.6098018066943
4231 	 -1269.6366791527905
4449 	 -1498.5555164487466
4673 	 -1298.3502109839903
4894 	 -1638.4867399099965
5118 	 -1412.7765626408752
5334 	 -994.9669943700595
5534 	 -1320.8340664813616
5742 	 -1543.0258378183478
5951 	 -1445.0463899960896
6160 	 -1340.442215837614
6363 	 -1392.9779538323928
6585 	 -1430.2436505694286
6800 	 -1292.2180494001896
7001 	 -1243.6286137615236
7201 	 -1505.2007945599473
7407 	 -1393.522384897824
7618 	 -1316.374957483535
7821 	 -1227.821696099155
8018 	 -1256.30

In [33]:
# critic_avg = []
# actor_avg = []
# for i in range(len(agent1.actor.model.get_weights())):
#     critic_avg.append(agent1.critic.model.get_weights()[i] + agent2.critic.model.get_weights()[i])
#     actor_avg.append(agent1.actor.model.get_weights()[i] + agent2.actor.model.get_weights()[i])
    
#     agent1.critic.model.set_weights = critic_avg[i]
#     agent2.critic.model.set_weights = critic_avg[i]
    
#     agent1.actor.model.set_weights = actor_avg[i]
#     agent2.actor.model.set_weights = actor_avg[i]
    
# agent1.train(1)
# agent2.train(1)

EP0 EpisodeReward=-1652.69530928303
EP0 EpisodeReward=-1455.6688225531002


In [7]:
# def main():
#     wandb.init(name='DDPG', project="deep-rl-tf2")
#     env_name = 'Pendulum-v0'
#     env = gym.make(env_name)
#     agent = Agent(env)
#     agent.train()


# if __name__ == "__main__":
#     main()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mvictor-qin[0m (use `wandb login --relogin` to force relogin)


NoSuchDisplayException: Cannot connect to "None"