# Publish Model to Data Catalog

In [1]:
# Verify that the sparktk library is installed by printing where it was imported from.
import sparktk

# The call form of print runs on both Python 2 and Python 3, and the one-element
# tuple keeps the %-format correct whatever type sparktk.__path__ has.
print("sparktk installation path = %s" % (sparktk.__path__,))

sparktk installation path = ['/opt/anaconda2/lib/python2.7/site-packages/sparktk']


In [2]:
# Create the sparktk context (tc) that the following cells use to build frames and train models.
# TkContext() is called with no arguments here, so no credentials-file path is entered.
# NOTE(review): the earlier wording about a credentials file and ATK looks like a leftover
# from an ATK notebook - confirm whether any credentials setup is still required.
from sparktk import TkContext
tc = TkContext()

In [3]:
# Column layout of the frame: two float measurements followed by an integer class column.
schema = [
    ('Sepal_Length', float),
    ('Petal_Length', float),
    ('Class', int),
]

# Eleven sample rows, listed in schema order and grouped by their Class value.
data = [
    # Class 0
    [4.9, 1.4, 0],
    [4.7, 1.3, 0],
    [4.6, 1.5, 0],
    # Class 1
    [6.3, 4.9, 1],
    [6.1, 4.7, 1],
    [6.4, 4.3, 1],
    [6.6, 4.4, 1],
    # Class 2
    [7.2, 6.0, 2],
    [7.2, 5.8, 2],
    [7.4, 6.1, 2],
    [7.9, 6.4, 2],
]

# Upload the rows to create a new frame.
frame = tc.frame.create(data, schema)

In [4]:
# Display the frame's contents: three columns (Sepal_Length, Petal_Length, Class).
# The output below lists rows 0-9 although 11 rows were uploaded;
# presumably inspect() shows only the first 10 rows by default - TODO confirm.
frame.inspect()

[#]  Sepal_Length  Petal_Length  Class
[0]           4.9           1.4      0
[1]           4.7           1.3      0
[2]           4.6           1.5      0
[3]           6.3           4.9      1
[4]           6.1           4.7      1
[5]           6.4           4.3      1
[6]           6.6           4.4      1
[7]           7.2           6.0      2
[8]           7.2           5.8      2
[9]           7.4           6.1      2

In [5]:
# Train a naive Bayes classification model on the frame, passing the 'Class'
# column name and the two measurement column names.
model = tc.models.classification.naive_bayes.train(
    frame,
    'Class',
    ['Sepal_Length', 'Petal_Length'],
)

In [6]:
# Export the trained model to a MAR file at the given HDFS URI.
# The call's result is the cell output shown below: the same URI as a unicode string.
# The later cells refer to this file as "/user/vcap/example_naive_bayes_model.mar"
# (no hdfs://nameservice1 prefix), so keep the paths in sync if you change this one.
model.export_to_mar("hdfs://nameservice1/user/vcap/example_naive_bayes_model.mar")

u'hdfs://nameservice1/user/vcap/example_naive_bayes_model.mar'

In [7]:
# Import the DataCatalog client class from the tap_catalog module
from tap_catalog import DataCatalog

In [8]:
# Create a Data Catalog client.
# Called with no arguments, DataCatalog() prompts for the TAP domain URI, user name
# and password (see the prompts in the output below).
# For scripting, the three values can be passed to the constructor instead:
##   data_catalog = DataCatalog('TAP_DOMAIN_URI', 'TAP_USERNAME', 'TAP_PASSWORD')
data_catalog = DataCatalog()

Please input tap domain uri:atk-qa-nokrb.gotapaas.eu
Please input user name:sparktk
Please input password:········


In [9]:
# Add an entry for the exported model file to the Data Catalog.
# The path is given without the hdfs://nameservice1 prefix used for the export.
# NOTE(review): presumably add() resolves this path against the same HDFS cluster - confirm.
data_catalog.add("/user/vcap/example_naive_bayes_model.mar")

In [10]:
# Inspect HDFS directly using hdfsclient

import hdfsclient
from hdfsclient import ls, mkdir, rm, mv

In [11]:
# List the exported model file on HDFS to confirm that it exists.
ls("/user/vcap/example_naive_bayes_model.mar")

permissions      block_replication  owner    group    size      last_modification    path
-------------  -------------------  -------  -------  --------  -------------------  ----------------------------------------
-rwxrwx---                       3  vcap     vcap     28765246  2016-10-19 21:19     /user/vcap/example_naive_bayes_model.mar

In [12]:
# Clean up: remove the exported model file from HDFS.
## This does not delete the entry from the Data Catalog; remember to delete it
## in the Data Catalog UI as well.
rm("/user/vcap/example_naive_bayes_model.mar")

[{'path': '/user/vcap/example_naive_bayes_model.mar', 'result': True}]