# Setup

Turn on autoreload. All modules will be reloaded before executing each cell. Even objects that already exist will receive the new method definitions.

In [1]:
# Load IPython's autoreload extension and select mode 2, which reloads all
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

Import the standard-library modules and the Cirrus components used in this notebook.

In [2]:
import logging
import sys
import os
import atexit

# Prepend the parent of the current working directory to the module search
# path so that the `cirrus` import below is resolved from there first.
# NOTE(review): this relies on the kernel's working directory being the
# directory that sits directly beneath the one containing `cirrus` — confirm
# before running the notebook from elsewhere.
sys.path.insert(0, os.path.join(os.getcwd(), ".."))
from cirrus import automate, GridSearch, LogisticRegression, graph

Configure logging so that debug-level log messages are printed.

In [3]:
# Name given to the stdout handler this notebook installs, so that it can be
# recognised again if this cell is executed more than once.
NOTEBOOK_LOG_HANDLER_NAME = "cirrus-notebook-stdout"


def enable_debug_logging(logger_name="cirrus"):
    """Print debug-level messages of ``logger_name`` to stdout, exactly once.

    The logger's level is set to DEBUG and a ``StreamHandler`` writing to
    ``sys.stdout`` is attached. The handler is tagged with
    ``NOTEBOOK_LOG_HANDLER_NAME``; if a handler carrying that name is already
    attached, no further handler is added. This keeps the cell idempotent:
    re-running it no longer causes every log message to be printed several
    times.

    Args:
        logger_name: Name of the logger to configure.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    already_attached = any(
        getattr(handler, "name", None) == NOTEBOOK_LOG_HANDLER_NAME
        for handler in logger.handlers
    )
    if not already_attached:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.set_name(NOTEBOOK_LOG_HANDLER_NAME)
        logger.addHandler(stdout_handler)
    return logger


log = enable_debug_logging()

# Done on release

Create the server image (and an instance for that purpose).

In [None]:
# Temporary instance that exists only so the server image can be built on it.
instance_for_server_image = automate.Instance(
    "cirrus_instance_for_server_image",
    ami_id="ami-04403d81a81d1a5da",
    disk_size=32,
    typ="m5.2xlarge",
    username="ec2-user"
)
instance_for_server_image.start()
try:
    # Build the image named "cirrus_server_image" on the instance, passing the
    # S3 location of the executables.
    automate.make_server_image(
        "cirrus_server_image",
        "s3://cirrus-public/from_shared_vm/executables",
        instance_for_server_image
    )
finally:
    # Clean up the started instance even when the image build raises, so a
    # failed build does not leave the temporary instance behind.
    instance_for_server_image.cleanup()

Make the lambda package.

In [8]:
# First argument: S3 location of the Lambda package; second argument: S3
# location of the executables. (The recorded output of this cell shows an
# executable being downloaded and the finished package being uploaded.)
lambda_package_location = "s3://cirrus-public-us-west-2/1/lambda-package"
executables_location = "s3://cirrus-public/from_shared_vm/executables"
automate.make_lambda_package(lambda_package_location, executables_location)

make_lambda_package: Initializing ZIP file.
make_lambda_package: Writing handler.
make_lambda_package: Initializing S3.
make_lambda_package: Downloading executable.
make_lambda_package: Writing executable.
make_lambda_package: Uploading package.
make_lambda_package: Waiting for changes to take effect.
make_lambda_package: Done.


# Done by user

Make the lambda.

In [None]:
# Make the Lambda named "cirrus_worker" from the published Lambda package.
worker_lambda_name = "cirrus_worker"
worker_package_location = "s3://cirrus-public-us-west-2/1/lambda-package"
# NOTE(review): the meaning of the third argument (100) is not visible here —
# presumably a concurrency limit; confirm against automate.make_lambda.
automate.make_lambda(worker_lambda_name, worker_package_location, 100)

Make the instance for the parameter servers.

In [4]:
# Keyword settings for the instance that will host the parameter servers. It
# is created from the AMI named "cirrus_server_image".
ps_instance_options = {
    "ami_name": "cirrus_server_image",
    "disk_size": 32,
    "typ": "m5.2xlarge",
    "username": "ec2-user",
}
instance_for_ps = automate.Instance("cirrus_instance_for_ps", **ps_instance_options)
instance_for_ps.start()

__init__: Initializing EC2.
__init__: Resolving AMI name to AMI ID.
__init__: Done.
start: Calling _make_instance_profile.
_make_instance_profile: Initializing IAM.
_make_instance_profile: Creating role.
_make_instance_profile: Attaching policy to role.
_make_instance_profile: Creating instance profile.
_make_instance_profile: Adding role to instance profile.
_make_instance_profile: Waiting for changes to take effect.
_make_instance_profile: Done.
start: Calling _make_key_pair.
_make_key_pair: Creating new key pair.
_make_key_pair: Saving private key.
_make_key_pair: Fetching key metadata.
_make_key_pair: Done.
start: Calling _make_security_group.
_make_security_group: Creating new security group.
_make_security_group: Configuring security group.
_make_security_group: Done.
start: Calling _start_and_wait.
_start_and_wait: Starting a new instance.
_start_and_wait: Waiting for instance to enter running state.
_start_and_wait: Fetching instance metadata.
_start_and_wait: Done.
start: Done

Define the logistic regression configuration.

In [5]:
# Base configuration shared by every experiment of the grid search; the grid
# search is given this dict as its `param_base`.
basic_params = dict(
    # Workers, parameter servers and Lambda settings.
    n_workers=16,
    n_ps=1,
    lambda_size=128,
    timeout=60,
    progress_callback=None,
    # Dataset selection.
    dataset="criteo-kaggle-19b",
    train_set=(0, 6),
    test_set=(7, 8),
    # Model and optimizer settings.
    opt_method='adagrad',
    learning_rate=0.001,
    epsilon=0.0001,
    minibatch_size=200,
    model_bits=19,
)

Start a grid search.

In [6]:
# Delete the CloudWatch log group of the worker Lambda via the AWS CLI before
# starting; the command's exit status is not checked.
os.system("aws logs delete-log-group --log-group-name /aws/lambda/cirrus_worker")

# One experiment per learning rate; everything else comes from basic_params.
learning_rates = [0.001, 0.01, 0.1, 1]
grid_search_settings = dict(
    task=LogisticRegression,
    param_base=basic_params,
    hyper_vars=["learning_rate"],
    hyper_params=[learning_rates],
    instances=[instance_for_ps],
)
gs = GridSearch(**grid_search_settings)

gs.set_threads(1)
# NOTE(review): UI=False presumably runs the search without its built-in UI —
# confirm against GridSearch.run.
gs.run(UI=False)

start: Uploading configuration.
start: Starting parameter server.
start: Retreiving parameter server PID.
start: Starting error task.
start: Retreiving error task PID.
launch_worker: Launching Task 0.
launch_worker: Launching Task 10000.
launch_worker: Launching Task 20000.
launch_worker: Launching Task 30000.
launch_worker: Launching Task 40000.
launch_worker: Launching Task 60000.
launch_worker: Launching Task 50000.
launch_worker: Launching Task 70000.
launch_worker: Launching Task 80000.
launch_worker: Launching Task 90000.
launch_worker: Launching Task 100000.
launch_worker: Launching Task 110000.
launch_worker: Launching Task 120000.
launch_worker: Launching Task 130000.
launch_worker: Launching Task 140000.
start: Uploading configuration.
ClientManager: Initializing Lambda client.
launch_worker: Launching Task 150000.
ClientManager: Initializing Lambda client.
ClientManager: Initializing Lambda client.
ClientManager: Initializing Lambda client.
ClientManager: Initializing Lambda

Sleeping for 119.801254988launch_worker: Task 30040000 completed with status code 200.

launch_worker: Task 30000000 completed with status code 200.
Sleeping for 119.805773973
launch_worker: Task 30010000 completed with status code 200.
Sleeping for 119.794450998
launch_worker: Task 30050000 completed with status code 200.
Sleeping for 119.793909073
launch_worker: Task 30030000 completed with status code 200.
Sleeping for 119.800063133
Sleeping for 119.7954638
launch_worker: Task 30080000 completed with status code 200.
Sleeping for 119.79794693
launch_worker: Task 30060000 completed with status code 200.
Sleeping for 119.791181087launch_worker: Task 30140000 completed with status code 200.

launch_worker: Task 30120000 completed with status code 200.
Sleeping for 119.788357973
launch_worker: Task 30130000 completed with status code 200.
Sleeping for 119.781265974
launch_worker: Task 30100000 completed with status code 200.
Sleeping for 119.778739929
launch_worker: Task 30150000 comple

View the progress of the experiment.

In [None]:
# Used in this notebook to view the progress of the running experiment.
# NOTE(review): presumably renders a dashboard — confirm against cirrus.graph.
graph.display_dash()

# Utilities

Write the SSH key for the instance to a file.

In [None]:
# Write the instance's private SSH key to key.pem with owner-only permissions.
# A plain open() would create the file with the default umask permissions,
# which usually leaves the key readable by other users and makes `ssh` refuse
# it as an unprotected private key file.
key_path = "key.pem"
key_fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(key_fd, "w") as f:
    # The mode given to os.open only applies when the file is newly created,
    # so tighten a pre-existing key.pem as well before writing the key.
    os.chmod(key_path, 0o600)
    f.write(instance_for_ps.private_key())

View the instance's public IP address.

In [None]:
# Bare expression: the notebook displays the instance's public IP address as
# the result of this cell.
instance_for_ps.public_ip()

View the output of the parameter server of the first experiment.

In [None]:
# Display `error_output()` of the parameter server (`ps`) that belongs to the
# first entry of `gs.cirrus_objs`.
# NOTE(review): presumably the output of the "error task" that the grid-search
# log shows being started next to the parameter server — confirm.
gs.cirrus_objs[0].ps.error_output()

In [None]:
# Display `ps_output()` of the parameter server (`ps`) that belongs to the
# first entry of `gs.cirrus_objs`.
# NOTE(review): presumably the parameter server process's own output — confirm.
gs.cirrus_objs[0].ps.ps_output()

In [7]:
# Stop what the grid search started. The recorded output of this cell shows
# `kill -9` being run against the PID files of each experiment's error task
# and parameter server.
gs.kill_all()

concurrency_limit: Querying the Lambda's concurrency limit.
run_command: Running `kill -9 $(cat error_1337.pid)`.
run_command: Waiting for completion.
run_command: Exit code was 0.
run_command: Fetching stdout and stderr.
run_command: stdout had length 0.
run_command: stderr had length 0.
run_command: Done.
run_command: Running `kill -9 $(cat ps_1337.pid)`.
run_command: Waiting for completion.
run_command: Exit code was 0.
run_command: Fetching stdout and stderr.
run_command: stdout had length 0.
run_command: stderr had length 0.
run_command: Done.
run_command: Running `kill -9 $(cat error_1339.pid)`.
run_command: Waiting for completion.
run_command: Exit code was 0.
run_command: Fetching stdout and stderr.
run_command: stdout had length 0.
run_command: stderr had length 0.
run_command: Done.
run_command: Running `kill -9 $(cat ps_1339.pid)`.
run_command: Waiting for completion.
run_command: Exit code was 0.
run_command: Fetching stdout and stderr.
run_command: stdout had length 0.
run

Exception in thread Worker 0:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1087, in maintain_one
    launch_worker(lambda_name, task_id, config, n, ps)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1036, in launch_worker
    Payload=json.dumps(payload)
  File "/usr/local/lib/python2.7/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/usr/local/lib/python2.7/site-packages/botocore/client.

launch_worker: Launching Task 10000001.
launch_worker: Launching Task 10010001.
launch_worker: Launching Task 10020001.
launch_worker: Launching Task 10030001.
launch_worker: Launching Task 10040001.
launch_worker: Launching Task 10050001.
launch_worker: Launching Task 10060001.
launch_worker: Launching Task 10080001.
launch_worker: Launching Task 10070001.
launch_worker: Launching Task 10140001.
launch_worker: Launching Task 10150001.
launch_worker: Launching Task 10120001.
launch_worker: Launching Task 10130001.
launch_worker: Launching Task 10110001.
launch_worker: Launching Task 10100001.
launch_worker: Launching Task 10090001.


Exception in thread Worker 1000:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1087, in maintain_one
    launch_worker(lambda_name, task_id, config, n, ps)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1036, in launch_worker
    Payload=json.dumps(payload)
  File "/usr/local/lib/python2.7/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/usr/local/lib/python2.7/site-packages/botocore/clie

launch_worker: Launching Task 20010001.
launch_worker: Launching Task 20000001.
launch_worker: Launching Task 20030001.
launch_worker: Launching Task 20020001.
launch_worker: Launching Task 20040001.
launch_worker: Launching Task 20050001.
launch_worker: Launching Task 20070001.
launch_worker: Launching Task 20080001.
launch_worker: Launching Task 20060001.
launch_worker: Launching Task 20110001.
launch_worker: Launching Task 20120001.
launch_worker: Launching Task 20100001.
launch_worker: Launching Task 20090001.
launch_worker: Launching Task 20130001.
launch_worker: Launching Task 20150001.
launch_worker: Launching Task 20140001.


Exception in thread Worker 2001:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1087, in maintain_one
    launch_worker(lambda_name, task_id, config, n, ps)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1036, in launch_worker
    Payload=json.dumps(payload)
  File "/usr/local/lib/python2.7/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/usr/local/lib/python2.7/site-packages/botocore/clie

launch_worker: Launching Task 30000001.
launch_worker: Launching Task 30010001.
launch_worker: Launching Task 30020001.
launch_worker: Launching Task 30030001.
launch_worker: Launching Task 30040001.
launch_worker: Launching Task 30050001.
launch_worker: Launching Task 30070001.
launch_worker: Launching Task 30080001.
launch_worker: Launching Task 30110001.
launch_worker: Launching Task 30090001.
launch_worker: Launching Task 30100001.
launch_worker: Launching Task 30060001.
launch_worker: Launching Task 30120001.
launch_worker: Launching Task 30130001.
launch_worker: Launching Task 30140001.
launch_worker: Launching Task 30150001.


Exception in thread Worker 3000:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1087, in maintain_one
    launch_worker(lambda_name, task_id, config, n, ps)
  File "/Users/shea/Google Drive/-/Work/Cirrus/Repositories/Fixes/python/frontend/cirrus/examples/../cirrus/automate.py", line 1036, in launch_worker
    Payload=json.dumps(payload)
  File "/usr/local/lib/python2.7/site-packages/botocore/client.py", line 320, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/usr/local/lib/python2.7/site-packages/botocore/clie

Create a new SSH connection to the instance. Useful when the connection times out while you're away from the computer.

In [None]:
# Open a fresh SSH connection to the instance, e.g. after the previous one
# timed out.
# NOTE(review): `_connect_ssh` is underscore-prefixed, i.e. private by Python
# convention — consider exposing a public way to reconnect.
instance_for_ps._connect_ssh()

Do cleanup. Normally this is called on exit, but Jupyter does not reliably allow this to complete.

In [None]:
# Run the handlers registered with `atexit` right now instead of waiting for
# the interpreter to exit; the recorded output shows the instance cleanup
# (closing the SSH client, terminating the instance) running as a result.
# NOTE(review): `_run_exitfuncs` is a private helper of the `atexit` module and
# runs every registered handler, not only Cirrus's — confirm that is intended.
atexit._run_exitfuncs()

cleanup: Closing SSH client.
cleanup: Terminating instance.
cleanup: Waiting for instance to terminate.
