# Setup notes: create a Dataproc cluster against the sandbox endpoint, build
# the client jar, and submit an interactive spark-shell job to that cluster.
PROJECT=datalab-spark
# Cluster name is derived from the invoking user; defined once so the create
# and submit commands cannot drift apart.
CLUSTER="${USER}-interact"
export CLOUDSDK_API_ENDPOINT_OVERRIDES_DATAPROC=https://test-dataproc.sandbox.googleapis.com/

gcloud beta dataproc --project "${PROJECT}" clusters create "${CLUSTER}" \
  --zone us-central1-f \
  --image-version 0.2

blaze build java/com/google/cloud/hadoop/services/opensource/client:client_deploy.jar

# The -Xdebug/-Xrunjdwp flags open a JDWP debug socket on port 5005 without
# suspending the JVM at startup (server=y,suspend=n).
java -Xdebug \
  -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005 \
  -Xbootclasspath/p:alpn-boot-8.1.3.v20150130.jar \
  -jar ~/client_deploy.jar \
  --service_root_url https://test-dataproc.sandbox.googleapis.com \
  --project "${PROJECT}" \
  submit --cluster "${CLUSTER}" spark-shell


In [20]:
import gcp
import gcp._context
import gcp._util

import sys
import time
import grpc.framework.face.exceptions
from grpc.beta import implementations
from grpc.beta.interfaces import StatusCode
from gcp.spark.bytestream_pb2 import beta_create_ByteStream_stub
from gcp.spark.bytestream_pb2 import ReadRequest
from gcp.spark.bytestream_pb2 import ReadResponse
from gcp.spark.bytestream_pb2 import WriteRequest
from gcp.spark.bytestream_pb2 import WriteResponse
from gcp.spark.bytestream_pb2 import QueryWriteStatusRequest
from gcp.spark.bytestream_pb2 import QueryWriteStatusResponse
from oauth2client.client import GoogleCredentials
from grpc.beta.implementations import ClientCredentials

class ByteStreamClient(object):
  """Client for the stdin/stdout byte streams of one Dataproc job, over gRPC.

  The target host, project, cluster and job are fixed by the class constants
  below. ``read``/``write``/``executeScala`` keep track of stream offsets in
  ``read_index`` and ``write_index``; the ``*_bytestream`` and
  ``query_write_status`` methods issue the individual RPCs.
  """

  _DEFAULT_HOST = "test-dataproc.sandbox.googleapis.com"
  _DEFAULT_PORT = 443

  _PROJECT_ID = 'datalab-spark'
  _JOB_ID = 'spark-shell-eaa8578e-8271-4ce8-9a4b-905d962af150'
  _CLUSTER_ID = '2225e423-ce11-4cb1-851c-eb5f3a5de1cd'
  _TIMEOUT_SECONDS = 10
  _SSL_CERT_FILE = '/etc/ssl/certs/ca-certificates.crt'

  def __init__(self):
    # Offset of the next unread byte of the job's stdout stream.
    self.read_index = 0
    # Offset at which the next write to the job's stdin stream is made;
    # 0 means "not yet known" and triggers a QueryWriteStatus call in write().
    self.write_index = 0

  def read(self):
    """Returns stdout data that appeared since the previous call.

    Returns:
      The new data, or an empty string when nothing new is available or the
      underlying RPC failed (the failure is reported by read_bytestream).
    """
    # If we read the next chunk starting from read_index and there is no extra
    # data, the API throws an exception. Therefore, after the first read, start
    # one byte before read_index and drop that overlapping byte from the result.
    overlap = 1 if self.read_index > 0 else 0
    data = self.read_bytestream(0, self.read_index - overlap)
    if not data:
      # Failed RPC (None) or empty chunk: leave read_index untouched so the
      # next call retries from the same position.
      return "" if data is None else data
    self.read_index += len(data) - overlap
    return data[overlap:]

  def write(self, data):
    """Sends one line to the job's stdin, appending a newline if missing.

    The write offset is looked up with query_write_status() while it is still
    0. If either RPC fails, write_index is left unchanged so the call can be
    repeated.

    Args:
      data: text to send.

    Returns:
      None.
    """
    if not data.endswith("\n"):
      data = data + "\n"
    if self.write_index == 0:
      status = self.query_write_status()
      if status is None:
        # query_write_status already reported the error; without a known
        # offset the write cannot be positioned.
        return
      self.write_index = status.committed_size
    committed_size = self.write_bytestream(data, self.write_index)
    if committed_size is not None:
      self.write_index = committed_size

  def executeScala(self, data, max_iterations=10, poll_interval=0.2):
    """Writes `data` to the shell and polls stdout until a prompt shows up.

    Args:
      data: Scala source to send to the spark-shell job.
      max_iterations: maximum number of read() polls.
      poll_interval: seconds to sleep between polls.

    Returns:
      Everything read while polling. Polling stops as soon as the accumulated
      output contains "scala>", or after max_iterations reads.
    """
    self.write(data)
    response = ""
    for _ in range(max_iterations):
      result = self.read()
      # Each chunk is echoed as it arrives, in addition to being returned.
      print(result)
      response += result
      # Check the accumulated text so a prompt split across two chunks is
      # still detected.
      if "scala>" in response:
        return response
      time.sleep(poll_interval)
    return response

  class MetadataTransformer(object):
    """Callable that supplies auth and user-agent metadata for gRPC requests.

    :param client: object exposing a ``credentials`` attribute whose
                   ``get_access_token()`` result has an ``access_token``;
                   this class is constructed with ``gcp.Context.default()``.
    """

    def __init__(self, client):
      self._credentials = client.credentials
      self._user_agent = 'GoogleCloudDataLab/1.0'

    def __call__(self, ignored_val):
      """Returns the Authorization and User-agent metadata pairs."""
      access_token = self._credentials.get_access_token().access_token
      return [
          ('Authorization', 'Bearer ' + access_token),
          ('User-agent', self._user_agent),
      ]

  def get_certs(self):
    """Reads the root certificates file.

    :rtype: str
    :returns: The raw contents of ``_SSL_CERT_FILE``.
    """
    with open(ByteStreamClient._SSL_CERT_FILE, mode='rb') as file_obj:
      return file_obj.read()

  @classmethod
  def _resource_uri(cls, stream_name):
    """Builds the resource name of one of the job's byte streams."""
    return u'//%s/projects/%s/clusters/%s/jobs/%s/bytestreams/%s' % (
        cls._DEFAULT_HOST, cls._PROJECT_ID, cls._CLUSTER_ID, cls._JOB_ID,
        stream_name)

  def _create_stub(self):
    """Opens a secure channel and returns a ByteStream stub bound to it."""
    transformer = ByteStreamClient.MetadataTransformer(gcp.Context.default())
    client_credentials = implementations.ssl_client_credentials(
        self.get_certs(), private_key=None, certificate_chain=None)
    channel = implementations.secure_channel(
        ByteStreamClient._DEFAULT_HOST, ByteStreamClient._DEFAULT_PORT,
        client_credentials)
    return beta_create_ByteStream_stub(channel, metadata_transformer=transformer)

  def _invoke(self, rpc, do_retry):
    """Runs `rpc(stub)`, retrying once after a credential refresh.

    Returns the value of `rpc`, or None when a network error was printed.
    """
    try:
      return rpc(self._create_stub())
    except grpc.framework.interfaces.face.face.NetworkError as ex:
      if ex.code == StatusCode.UNAUTHENTICATED and do_retry:
        # Refresh the credentials and drop the cached context so the retry
        # builds its metadata from a fresh one.
        gcp.Context.default().credentials.refresh(None)
        gcp.Context._global_context = None
        return self._invoke(rpc, do_retry=False)
      print('network error: ex.code:%s, ex.details:%s' % (ex.code, ex.details))
    return None

  def read_bytestream(self, read_limit, read_offset, do_retry = True):
    """Reads from the job's stdout stream.

    Args:
      read_limit: forwarded as ReadRequest.read_limit (read() passes 0;
        presumably that means "no limit" -- confirm against the API).
      read_offset: forwarded as ReadRequest.read_offset.
      do_retry: retry once after refreshing credentials on UNAUTHENTICATED.

    Returns:
      The `data` field of the first response message, or None if a network
      error was printed.
    """
    request = ReadRequest(resource_name=self._resource_uri('stdout'),
                          read_limit=read_limit, read_offset=read_offset)

    def rpc(stub):
      responses = stub.Read(request, ByteStreamClient._TIMEOUT_SECONDS)
      return next(responses).data

    return self._invoke(rpc, do_retry)

  def query_write_status(self, do_retry = True):
    """Queries the write status of the job's stdin stream.

    Args:
      do_retry: retry once after refreshing credentials on UNAUTHENTICATED.

    Returns:
      The QueryWriteStatus response (write() reads its `committed_size`), or
      None if a network error was printed.
    """
    request = QueryWriteStatusRequest(resource_name=self._resource_uri('stdin'))

    def rpc(stub):
      return stub.QueryWriteStatus(request, ByteStreamClient._TIMEOUT_SECONDS)

    return self._invoke(rpc, do_retry)

  def write_bytestream(self, string_data, write_offset, do_retry = True):
    """Writes `string_data` to the job's stdin stream at `write_offset`.

    Args:
      string_data: text to send; encoded with str.encode before sending.
      write_offset: forwarded as WriteRequest.write_offset.
      do_retry: retry once after refreshing credentials on UNAUTHENTICATED.

    Returns:
      The `committed_size` reported by the service, or None if a network
      error was printed.
    """
    request = WriteRequest(resource_name=self._resource_uri('stdin'),
                           data = str.encode(string_data),
                           write_offset = write_offset, finish_write = False)

    def rpc(stub):
      response = stub.Write([request], ByteStreamClient._TIMEOUT_SECONDS)
      return response.committed_size

    return self._invoke(rpc, do_retry)

# Smoke test: build the client, then request stdout data with read_limit=10
# starting at read_offset=10 and print whatever comes back.
byteStreamClient = ByteStreamClient()
print(byteStreamClient.read_bytestream(read_limit=10, read_offset=10))
# Further calls that can be enabled from this cell:
#print(byteStreamClient.read())
#print(byteStreamClient.write("sqlContext\n"))
#print(byteStreamClient.executeScala("sqlContext"))





2:29:41 IN


In [21]:
# Print the stdout data available from the client's current read_index onward.
print(byteStreamClient.read())

16/02/23 22:29:41 INFO org.spark-project.jetty.server.Server: jetty-8.y.z-SNAPSHOT
16/02/23 22:29:41 INFO org.spark-project.jetty.server.AbstractConnector: Started SocketConnector@0.0.0.0:45303
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.6.0
      /_/

Using Scala version 2.10.5 (OpenJDK 64-Bit Server VM, Java 1.8.0_72-internal)
Type in expressions to have them evaluated.
Type :help for more information.
16/02/23 22:29:49 INFO akka.event.slf4j.Slf4jLogger: Slf4jLogger started
16/02/23 22:29:49 INFO Remoting: Starting remoting
16/02/23 22:29:49 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@10.240.0.7:51132]
16/02/23 22:29:50 INFO org.spark-project.jetty.server.Server: jetty-8.y.z-SNAPSHOT
16/02/23 22:29:50 INFO org.spark-project.jetty.server.AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
16/02/23 22:29:50 INFO org.apache.hadoop.yarn.cl

In [22]:
# write() has no return value, so this prints None; the effect is the line
# being sent to the job's stdin stream.
print(byteStreamClient.write("sqlContext\n"))

None


In [23]:
# executeScala() prints every chunk it reads and also returns the accumulated
# text, so wrapping it in print() shows the shell output twice.
print(byteStreamClient.executeScala("sqlContext"))

sqlContext
res8: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@5ebd3708

scala> sqlContext
res9: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@5ebd3708

scala> 
sqlContext
res8: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@5ebd3708

scala> sqlContext
res9: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@5ebd3708

scala> 


In [None]:
# Send a Scala statement that assigns the result of reading
# gs://alekseyv-test/people.json to `df` in the remote shell.
print(byteStreamClient.executeScala('val df = sqlContext.read.json("gs://alekseyv-test/people.json")\n'))


In [None]:
# Send a Scala statement that registers `df` as the temp table "people".
print(byteStreamClient.executeScala('df.registerTempTable("people")\n'))

In [26]:
# Send a SQL query over the "people" table to the remote shell and print the
# shell's output (shown twice: once by executeScala, once by this print).
print(byteStreamClient.executeScala('sqlContext.sql("select * from people order by age desc").show()\n'))

sqlContext.sql("select * from people order by age desc").show()
16/02/23 22:41:58 INFO hive.ql.parse.ParseDriver: Parsing command: select * from people order by age desc
16/02/23 22:41:58 INFO hive.ql.parse.ParseDriver: Parse Completed
16/02/23 22:41:58 INFO org.apache.hadoop.mapred.FileInputFormat: Total input paths to process : 1

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+


scala> 
sqlContext.sql("select * from people order by age desc").show()
16/02/23 22:41:58 INFO hive.ql.parse.ParseDriver: Parsing command: select * from people order by age desc
16/02/23 22:41:58 INFO hive.ql.parse.ParseDriver: Parse Completed
16/02/23 22:41:58 INFO org.apache.hadoop.mapred.FileInputFormat: Total input paths to process : 1
+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+


scala> 
