Skip to content
Permalink
Browse files

Upgrade the version of mxnet to 1.4.* for mxnetcontainerdockerfile. (

…#675)

* Upgrade the version of mxnet to 1.4.* for mxnetcontainerdockerfile.

* Added state logging to mxnet test

* Fixed an error

* Created a new test util function to check broken model logs

* fixed a wrong import bug

* Added a special print to see why it is broken when parsing

* more logs to fix bugs

* Fixed devDockerfiles to use mxnet==1.4.*
  • Loading branch information...
rkooo567 authored and simon-mo committed May 14, 2019
1 parent 8051a4e commit 7d1d769b61ad3edc1cd41c545f7b781a4c026dd7
@@ -16,7 +16,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -qq && apt-get install -y -qq
&& rm -rf /var/lib/apt/lists/*

RUN pip install -q cloudpickle==0.5.* pyzmq==17.0.* requests==2.18.* subprocess32==3.2.* scikit-learn==0.19.* \
numpy==1.14.* pyyaml==3.12.* docker==3.1.* kubernetes==6.0.* tensorflow==1.6.* mxnet==1.1.* pyspark==2.3.* \
numpy==1.14.* pyyaml==3.12.* docker==3.1.* kubernetes==6.0.* tensorflow==1.6.* mxnet==1.4.* pyspark==2.3.* \
xgboost==0.7.* jsonschema==2.6.* psutil==5.4.* prometheus_client

# Install PyTorch
@@ -24,6 +24,7 @@ RUN pip3 install cloudpickle==0.5.* pyzmq==17.0.* requests==2.18.* scikit-learn=
numpy==1.14.* pyyaml==3.12.* docker==3.1.* kubernetes==5.0.* tensorflow==1.6.* mxnet==1.1.* pyspark==2.3.* \
xgboost==0.7.* urllib3==1.24.* # CI is broken when urllib3's version is 1.25.1. Delete urllib3==1.24.* later once version compatibility is stabilized
# Install PyTorch
RUN pip3 install torch==1.0.* torchvision==0.2.*
@@ -5,7 +5,7 @@ FROM ${REGISTRY}/${RPC_VERSION}-rpc:${CODE_VERSION}

LABEL maintainer="Dan Crankshaw <dscrankshaw@gmail.com>"

RUN pip install -q mxnet==1.1.*
RUN pip install -q mxnet==1.4.*

COPY containers/python/mxnet_container.py containers/python/container_entry.sh /container/

@@ -41,6 +41,7 @@ def get_metrics_config():
def get_matched_query(metric_addr, metric_name):
    """Fetch the match query for a metric and return its decoded JSON body.

    The query string is produced by ``gen_match_query(metric_addr,
    metric_name)``, logged, echoed to stdout via ``print`` (debug aid), and
    then requested with ``requests.get``. The JSON-decoded response is
    logged and returned to the caller.
    """
    match_url = gen_match_query(metric_addr, metric_name)
    logger.info("Querying: {}".format(match_url))
    # Debug output: repr() exposes stray whitespace/escape characters in the
    # query string that the plain log line above would hide.
    print('test why docker query is broken: {}'.format(repr(match_url)))
    response = requests.get(match_url)
    payload = response.json()
    logger.info(payload)
    return payload
@@ -12,7 +12,8 @@

import mxnet as mx

from test_utils import create_docker_connection, BenchmarkException, headers
from test_utils import (create_docker_connection, BenchmarkException, headers,
log_clipper_state, log_docker, log_cluster_model)

cur_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.abspath("%s/../clipper_admin" % cur_dir))
@@ -158,6 +159,9 @@ def get_test_point():

except BenchmarkException:
logger.exception("BenchmarkException")
log_docker(clipper_conn)
log_cluster_model(clipper_conn, cluster_name)
log_clipper_state(clipper_conn)
clipper_conn = create_docker_connection(
cleanup=True, start_clipper=False, cleanup_name=cluster_name)
sys.exit(1)
@@ -166,6 +170,9 @@ def get_test_point():
cleanup=True, start_clipper=False, cleanup_name=cluster_name)
except Exception:
logger.exception("Exception")
log_docker(clipper_conn)
log_cluster_model(clipper_conn, cluster_name)
log_clipper_state(clipper_conn)
clipper_conn = create_docker_connection(
cleanup=True, start_clipper=False, cleanup_name=cluster_name)

@@ -232,6 +232,35 @@ def log_docker(clipper_conn):
for cont in container_runing:
logger.info('Name {}, Image {}, Status {}, Label {}'.format(
cont.name, cont.image, cont.status, cont.labels))
logger.info(cont.logs())


def log_cluster_model(clipper_conn, cluster_name):
if clipper_conn is None:
return

"""Retrieve status and log for last ten containers"""
container_runing = clipper_conn.cm.docker_client.containers.list(limit=100, all=True)
logger.info('================Cluster model logs====================')
logger.info('It includes broken models')
logger.info('----------------------')
logger.info('Model container status (including broken one)')
for cont in container_runing:
if cluster_name in cont.name:
logger.info('Name {}, Image {}, Status {}, Label {}'.format(
cont.name, cont.image, cont.status, cont.labels))

logger.info('----------------------')
logger.info('Printing out model logs')

for cont in container_runing:
print('cluster_name: {}'.format(cluster_name))
print('container_name: {}'.format(cont.name))
if cluster_name in cont.name:
logger.info('Name {}, Image {}, Status {}, Label {}'.format(
cont.name, cont.image, cont.status, cont.labels))
logger.info(cont.logs())

try:
logger.info(cont.logs())
except docker.errors.APIError as e:

0 comments on commit 7d1d769

Please sign in to comment.
You can’t perform that action at this time.