In [1]:
import time
import os
import raco.viz

import findspark
findspark.init()
findspark.add_packages('com.databricks:spark-csv_2.10:1.4.0')

from raco.backends.spark.connection import SparkConnection
from raco.backends.spark.catalog import SparkCatalog
from raco.backends.spark.algebra import SparkAlgebra

from raco.backends.logical import OptLogicalAlgebra
from raco.catalog import FromFileCatalog
import raco.myrial.interpreter as interpreter
import raco.myrial.parser as myrialparser

from raco.backends.myria import MyriaLeftDeepTreeAlgebra
from raco.backends.myria.connection import MyriaConnection
from raco.backends.myria.catalog import MyriaCatalog

import accumulo
from raco.backends.accumulo.connection import AccumuloConnection
from raco.backends.accumulo.catalog import AccumuloCatalog

from raco.backends.federated.connection import FederatedConnection
from raco.backends.federated.catalog import FederatedCatalog
from raco.backends.federated.algebra import FederatedAlgebra

In [2]:
## Spark master hostname, read from the 'sparkurl' environment variable
## (falls back to 'localhost' when the variable is not set).
## NOTE(review): get_spark_connection() below re-reads the same variable into
## a local of the same name, and no other visible cell uses this global.
masterHostname = os.environ.get('sparkurl', 'localhost')
def get_accumulo_connection():
    """Open a connection to the Accumulo instance used by this demo.

    The connection settings are read from the environment, the same way
    get_spark_connection() reads 'sparkurl', so that credentials do not have
    to be edited into (and committed with) the notebook. Every setting falls
    back to the value that was previously hard-coded here:

        ACCUMULO_HOST      default 'localhost'
        ACCUMULO_PORT      default 42424
        ACCUMULO_USER      default 'root'
        ACCUMULO_PASSWORD  default 'secret'

    Returns:
        The AccumuloConnection built from (host, port, user, password).

    Raises:
        ValueError: if ACCUMULO_PORT is set to something that is not an integer.
    """
    host = os.environ.get('ACCUMULO_HOST', 'localhost')
    # Environment values are strings; the original passed the port as an int.
    port = int(os.environ.get('ACCUMULO_PORT', 42424))
    user = os.environ.get('ACCUMULO_USER', 'root')
    pw = os.environ.get('ACCUMULO_PASSWORD', 'secret')
    return AccumuloConnection(host, port, user, pw)

def get_spark_connection():
    """Build the SparkConnection for this demo.

    The master hostname comes from the 'sparkurl' environment variable and
    defaults to 'localhost'. For 'localhost' the hostname itself is passed to
    SparkConnection; for any other host a spark:// URL on port 7077 is passed.
    """
    hostname = os.environ.get('sparkurl', 'localhost')
    if hostname != 'localhost':
        target = "spark://{masterHostname}:7077".format(masterHostname=hostname)
    else:
        target = hostname
    return SparkConnection(target)


In [3]:
## Initialize the Accumulo backend: open the connection, then build the
## Accumulo catalog on top of that connection.
accumuloconn = get_accumulo_connection()
accumulocatalog = AccumuloCatalog(accumuloconn)

## Initialize the Spark backend. The Spark catalog is loaded from
## examples/catalog.py, resolved against the kernel's current working
## directory, so that directory must contain examples/catalog.py.
sparkconn = get_spark_connection()
sparkcatalog = SparkCatalog.load_from_file(os.path.join(os.path.abspath('./examples/'), 'catalog.py'))

## Initialize the federated backend: one federated connection and one
## federated catalog, each given the Accumulo and Spark objects in that order.
fed_conn = FederatedConnection([accumuloconn, sparkconn])
fed_catalog = FederatedCatalog([accumulocatalog, sparkcatalog])

In [5]:
## The Jaccard computation expressed as a MyriaL query.
## The whole program is kept in one string; it is parsed in a later cell and
## reused unchanged for the C-backend run further down.
## NOTE(review): the DNS scan inside the query uses an absolute path under
## /home/dhutchis/, so the query only runs as written where that file exists.
program_fquery="""
-- Scan netflow data (this resides in Accumulo)
NF = scan(netflow);
-- Filter the dataset (runs on Accumulo)
NFSUB = select SrcAddr as src_ip, DstAddr as dst_ip, 1.0 as value from NF where TotBytes > 100;
-- Scan DNS dataset (this is accessible to Spark)
DNS = scan('/home/dhutchis/gits/raco/examples/fed_accumulo_spark_c/dnssample_parsed.txt');
-- Perform a Join (Automatically move the datasets as necessary)
graph = select d1.dns as row, d2.dns as col, n.value
		from NFSUB n, DNS d1, DNS d2
	    where n.src_ip = d1.ip and n.dst_ip = d2.ip;

--Begin Jaccard
-- Calculate common neighbours
gammas = select a.row as u, b.row as v, count(b.value) as gamma
		 from graph a, graph b
         where a.col == b.col;

-- Calculate the out_degree of each vertex
out_d = select row, count(value) as od from graph;

-- Calculate the Jaccard Coefficients for all pairs (u,v);
J = select a.u as src_name, a.v as dst_name, a.gamma/(b.od + c.od - a.gamma) as jaccard_coeff
	from gammas a, out_d b, out_d c
    where a.u = b.row and a.v = c.row;

-- Store the result
store(J, nameJaccard);
"""

In [6]:
## Parse the MyriaL program text into a statement list and evaluate those
## statements with a StatementProcessor bound to the federated catalog.
## NOTE(review): the meaning of the second positional argument (True) is not
## visible in this notebook -- confirm against raco.myrial.interpreter.
parser = myrialparser.Parser()
processor = interpreter.StatementProcessor(fed_catalog, True)
statement_list = parser.parse(program_fquery)
processor.evaluate(statement_list)

Relation not present try other catalogs
Relation not present try other catalogs


In [None]:
## Let's visualize the query: take the logical plan from the processor,
## convert it to a dot object with raco.viz, and call view() on that object.
## This cell has no execution count (In [None]) in the saved notebook.
logical_plan = processor.get_logical_plan()
dot_logical = raco.viz.operator_to_dot_object(logical_plan)
dot_logical.view()

In [7]:
## Let's optimize the query and let the federation figure out the right backends.
## The candidate algebras are handed to FederatedAlgebra together with the
## federated catalog; the physical plan is then requested from the processor
## with that federated algebra as the target.
algebras = [OptLogicalAlgebra(), MyriaLeftDeepTreeAlgebra(), SparkAlgebra()]
falg = FederatedAlgebra(algebras, fed_catalog)
federated_plan = processor.get_physical_plan(target_alg=falg)

Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs


In [None]:
## Visualize the physical (federated) plan: convert it to a dot object and
## call view() on it. This cell has no execution count (In [None]) in the
## saved notebook.
raco.viz.operator_to_dot_object(federated_plan).view()

In [8]:
## Submit the plan for federated execution through the federated connection.
## The recorded output below shows a src_name / dst_name / jaccard_coeff table
## followed by (16, 'nameJaccard').
fed_conn.execute_query(federated_plan)

+--------------------+--------------------+------------------+
|            src_name|            dst_name|     jaccard_coeff|
+--------------------+--------------------+------------------+
|       hn.kd.ny.adsl|       hn.kd.ny.adsl|               1.0|
|178.111.dsl.mel.i...|       hn.kd.ny.adsl|              0.25|
|97-127-34-181.mpl...|       hn.kd.ny.adsl|               0.5|
|smtp179.emergency...|       hn.kd.ny.adsl|0.3333333333333333|
|189.114.211.178.d...|189.114.211.178.d...|               1.0|
|       hn.kd.ny.adsl|178.111.dsl.mel.i...|              0.25|
|178.111.dsl.mel.i...|178.111.dsl.mel.i...|               1.0|
|smtp179.emergency...|178.111.dsl.mel.i...|              0.25|
|       hn.kd.ny.adsl|97-127-34-181.mpl...|               0.5|
|97-127-34-181.mpl...|97-127-34-181.mpl...|               1.0|
|smtp179.emergency...|97-127-34-181.mpl...|               0.5|
|ip65-47-243-178.z...|ip65-47-243-178.z...|               1.0|
|       hn.kd.ny.adsl|smtp179.emergency...|0.3333333333

(16, 'nameJaccard')

In [12]:
## Second run of the same query, this time with the C backend in place of
## Spark. NOTE: this cell rebinds fed_conn, fed_catalog, parser, processor,
## algebras, falg and federated_plan, so the cells that follow operate on the
## C-backend objects rather than the Spark ones.
from raco.backends.cpp.cpp import CCAlgebra

## Initialize C Backend Catalog.
## The catalog file is located the same way as for the Spark catalog above
## (examples/catalog.py under the kernel's working directory) instead of
## through an absolute path into one user's home directory.
c_catalog = FromFileCatalog.load_from_file(
    os.path.join(os.path.abspath('./examples/'), 'catalog.py'))

## Initialize Federated Backend -- With the C catalog
fed_conn = FederatedConnection([accumuloconn])
fed_catalog = FederatedCatalog([accumulocatalog, c_catalog])

## Parse the query -- SAME AS BEFORE
parser = myrialparser.Parser()
processor = interpreter.StatementProcessor(fed_catalog, True)
statement_list = parser.parse(program_fquery)
processor.evaluate(statement_list)

## Change to C Algebra
algebras = [OptLogicalAlgebra(), MyriaLeftDeepTreeAlgebra(), CCAlgebra()]
falg = FederatedAlgebra(algebras, fed_catalog, crossproducts=False)
federated_plan = processor.get_physical_plan(target_alg=falg)

## Visualizing the physical plan (kept as the last expression so that the
## value returned by view() is shown as the cell output)
raco.viz.operator_to_dot_object(federated_plan).view()

Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs
Relation not present try other catalogs


'Digraph.gv.pdf'

In [13]:
## Submit the plan for execution again. fed_conn and federated_plan were
## rebound by the preceding C-backend cell, so this runs the C-backend plan;
## the recorded output below is C++ source text.
fed_conn.execute_query(federated_plan)

// Precount_select: Use buckets to track the number of matches
// Use buckets to copy into the result array
#include <cstdio>
#include <cstdlib>     // for exit()
#include <fcntl.h>      // for open()
#include <unistd.h>     // for close()
#include <sys/stat.h>   // for fstat()
#include <ctype.h>      // for isdigit()
#include <cstring>
#include <errno.h>
#include <algorithm>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>

#ifdef __MTA__
#include <machine/runtime.h>
#include <luc/luc_common.h>
#include <snapshot/client.h>
#include <sys/mta_task.h>


typedef int int64;
typedef unsigned uint64;
#else
#include <sys/time.h>

#include <iomanip>
#include <cstdint>
#include <iostream>
#include <fstream>
typedef int64_t int64;
typedef uint64_t uint64;

#include <unordered_map>
#include <vector>
#include <limits>
#endif

#include "io_util.h"
#include "hash.h"
#include "radish_utils.h"
#include "strings.h"
#include "timing.h"

// ----------------------------------------------

In [4]:
## Set two custom properties on the Accumulo table 'public_adhoc_netflow':
##   'table.custom.scheme'         -- a list of (column name, type name) pairs
##   'table.custom.howPartitioned' -- the column names "TotBytes" and "StartTime"
## NOTE(review): this cell carries execution count In [4], i.e. it was run
## before the query cells (In [5] onward), yet it sits at the end of the
## notebook. On a fresh Restart & Run All it would execute last; presumably it
## belongs right after the backend initialisation cell -- confirm and move.
accumuloconn.setTableProperty('public_adhoc_netflow','table.custom.scheme','[("TotBytes","LONG_TYPE"),("StartTime","STRING_TYPE"),("SrcAddr","STRING_TYPE"),("DstAddr","STRING_TYPE"),("RATE","DOUBLE_TYPE"),("Dur","DOUBLE_TYPE"),("Dir","STRING_TYPE"),("Proto","STRING_TYPE"),("Sport","STRING_TYPE"),("Dport","STRING_TYPE"),("State","STRING_TYPE"),("sTos","LONG_TYPE"),("dTos","LONG_TYPE"),("TotPkts","LONG_TYPE"),("SrcBytes","LONG_TYPE"),("Label","STRING_TYPE")]')
accumuloconn.setTableProperty('public_adhoc_netflow','table.custom.howPartitioned','["TotBytes", "StartTime"]')