## Demo of the Myria Middleware 


### Setup steps:
* Install Raco:
    * Follow the steps listed on https://github.com/uwescience/raco/tree/SPJA_federation#setup to set up RACO.
* Install SPARK:
    * Download the prebuilt HADOOP version from http://spark.apache.org/downloads.html and follow the setup steps on SPARK website to verify if it works. (Make sure you have all the dependencies listed on the page installed)
    * Set the SPARK_HOME environment variable to the downloaded SPARK folder and add it to PATH
        * On mac, this can be done by adding the following lines to bash_profile or bashrc file:
`
export SPARK_HOME=</path/to/spark>
export PATH=$SPARK_HOME/bin:$PATH
`

* Install jupyter:
    * on mac, this can be done via: `pip install jupyter`
    
* Run `ipython notebook` command. And browse to this notebook. 

* Update the path to the input dataset in the code below^: 
`
matA = scan('/path/to/datafile');
`

^ (As a sample, you can download and use https://github.com/uwescience/raco/blob/SPJA_federation/demo/sample.dat)


In [1]:
import findspark
findspark.init()
findspark.add_packages('com.databricks:spark-csv_2.10:1.4.0')
from raco.backends.cpp import CCAlgebra
from raco.backends.logical import OptLogicalAlgebra
from raco.backends.spark.connection import SparkConnection
from raco.backends.spark.catalog import SparkCatalog
from raco.backends.spark.algebra import SparkAlgebra
from raco.backends.myria.connection import MyriaConnection
from raco.backends.myria.catalog import MyriaCatalog
from raco.backends.myria import MyriaLeftDeepTreeAlgebra
from raco.backends.federated.connection import FederatedConnection
from raco.backends.federated.catalog import FederatedCatalog
from raco.backends.federated import FederatedAlgebra
from raco.backends.federated.algebra import FederatedExec
from raco.catalog import FromFileCatalog
from raco.compile import optimize
from raco.compile import compile
import raco.myrial.interpreter as interpreter
import raco.myrial.parser as myrialparser
from optparse import OptionParser

import raco.viz
import time
import os

In [2]:
os.environ.get('MYRIAX_REST_HOST', 'localhost')

'ec2-52-42-66-221.us-west-2.compute.amazonaws.com'

In [3]:
masterHostname = os.environ.get('sparkurl', 'localhost')
def get_myria_connection():
    execution_url = os.environ.get('MYRIAX_REST_HOST', 'localhost')
    connection = MyriaConnection(hostname=execution_url, port=8753)
    return connection

def get_spark_connection():
    if masterHostname == 'localhost':
        return SparkConnection(masterHostname)
    return SparkConnection("spark://{masterHostname}:7077".format(masterHostname=masterHostname))


In [4]:
program_fquery_simple ="""
NF = scan(netflow);
NFSUB = select SrcAddr as src_ip, DstAddr as dst_ip, 1.0 as value from NF where TotBytes > 5120;
DNS = scan('/Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt');
graph = select d1.dns as row, d2.dns as col, n.value from NFSUB n, DNS d1, DNS d2
    where n.src_ip = d1.ip and n.dst_ip = d2.ip;
store(graph, ipGraph);
"""

program_fquery="""
NF = scan(netflow);
NFSUB = select SrcAddr as src_ip, SrcAddr as dst_ip, 1.0 as value from NF where TotBytes > 5120;
DNS = scan('/Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt');
graph = select d1.dns as row, d2.dns as col, n.value from NFSUB n, DNS d1, DNS d2
    where n.src_ip = d1.ip and n.dst_ip = d2.ip;
gammas = select a.row as u, b.row as v, count(b.value) as gamma from graph a, graph b where a.col == b.col;
out_d = select row, count(value) as od from graph;
J = select a.u as src_name, a.v as dst_name, a.gamma/(b.od + c.od - a.gamma) as jaccard_coeff from gammas a, out_d b, out_d c where a.u = b.row and a.v = c.row;

store(J, nameJaccard);
"""

In [5]:
myriaconn = get_myria_connection()
sparkconn = get_spark_connection()

In [6]:
myriacatalog = MyriaCatalog(myriaconn)
catalog_path = os.path.join(os.path.abspath('./examples/'), 'catalog.py')
sparkcatalog = SparkCatalog.load_from_file(catalog_path)
catalog = FederatedCatalog([myriacatalog, sparkcatalog])

In [7]:
parser = myrialparser.Parser()
processor = interpreter.StatementProcessor(catalog, True)
statement_list = parser.parse(program_fquery)
processor.evaluate(statement_list)
logical_plan = processor.get_logical_plan()
dot_logical = raco.viz.operator_to_dot_object(logical_plan)

In [8]:
dot_logical.view()

'Digraph.gv.pdf'

In [9]:
algebras = [OptLogicalAlgebra(), MyriaLeftDeepTreeAlgebra(), SparkAlgebra()]
falg = FederatedAlgebra(algebras, catalog)



In [11]:
federated_plan = processor.get_physical_plan(target_alg=falg)

Relation not present try other catalogs


LookupError: Relation public:adhoc:V6093633579 not found in any catalogs

In [14]:
raco.viz.operator_to_dot_object(federated_plan.args[0].plan).view()

'Digraph.gv.pdf'

In [15]:
physical_plan_spark = optimize(federated_plan, SparkAlgebra())
dot_physical_spark = raco.viz.operator_to_dot_object(physical_plan_spark)

In [16]:
dot_physical_spark.view()

'Digraph.gv.pdf'

In [17]:
sparkconn.execute_query(physical_plan_spark)

localhost V6093633579
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost V6123388638
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost V8595577825
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost V9025082040
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
localhost /Users/shrainik/Dropbox/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt
+--------------------+--------------------+-------------+
|            src_name|            dst_name|jaccard_coeff|
+--------------------+--------------------+-------------+
|adsl-69-235-227-1

(16, 'nameJaccard')