# Preprocessing using Beam/Dataflow
## Setup Working Directory

In [1]:
import sys
import os
import pathlib

# Locate the project root: the closest directory, starting at the current one
# and walking upwards, that contains an entry named 'notebook'.
MAX_PARENT_LEVELS = 6  # number of ancestors inspected besides the start directory

start_dir = pathlib.Path(os.getcwd())
# Path.parents is finite, so slicing it can never run past the filesystem root
# (indexing parents[0] on the root itself raises IndexError).
candidate_dirs = [start_dir, *start_dir.parents][:MAX_PARENT_LEVELS + 1]
workingdir = next(
    (str(candidate) for candidate in candidate_dirs
     if 'notebook' in os.listdir(str(candidate))),
    None,
)

if workingdir is None:
    # Nothing found: stay where we are instead of silently switching to an
    # unrelated ancestor directory.
    workingdir = str(start_dir)
    print("Warning: no directory containing 'notebook' found within {} levels above {}; "
          "working directory left unchanged.".format(MAX_PARENT_LEVELS, start_dir))
else:
    # Guard against duplicate sys.path entries when the cell is re-run.
    if workingdir not in sys.path:
        sys.path.insert(0, workingdir)
    os.chdir(workingdir)

## Import libraries

In [10]:
# Import the client library
import os
from google.cloud import bigquery
import apache_beam as beam
import datetime
import subprocess, requests

## Define GCP environment variables

In [11]:
# Ask the gcloud CLI for the configured project and export it as PROJECT_ID.
gcloud_result = subprocess.run(
    'gcloud config list project --format "value(core.project)"',
    shell=True,
    check=True,
    stdout=subprocess.PIPE,
)
project_id = gcloud_result.stdout.decode()
# Drop every line-break character from the CLI output.
for line_break in ('\n', '\r'):
    project_id = project_id.replace(line_break, '')
os.environ['PROJECT_ID'] = project_id
print(os.environ['PROJECT_ID'])

axa-ch-machine-learning-dev


In [12]:
# Storage bucket name, exported through the BUCKET_NAME environment variable.
GCS_BUCKET_NAME = 'axa-ch-machine-learning-poc-dev'
os.environ['BUCKET_NAME'] = GCS_BUCKET_NAME

In [13]:
def find_missing_env_vars(names, environ=None):
    """Return the entries of ``names`` that are not defined in ``environ``.

    Arguments:
        -names: iterable of environment variable names to look up.
        -environ: mapping to check; defaults to ``os.environ``.
    Returns:
        List of the missing names, in the order they were given.
    """
    if environ is None:
        environ = os.environ
    return [name for name in names if name not in environ]


# Environment variables this notebook expects to be defined.
REQUIRED_ENV_VARS = (
    'PROJECT_ID',
    'BUCKET_NAME',
    'GOOGLE_APPLICATION_CREDENTIALS',
    'REQUESTS_CA_BUNDLE',
    'AXA_CH_CA_BUNDLE',
)

# Only report; a missing variable does not stop the notebook here.
for missing_name in find_missing_env_vars(REQUIRED_ENV_VARS):
    print('Env variable {} not defined!'.format(missing_name))

## Testing network connectivity through the proxy

In [14]:
# Proxy endpoints are read from the HTTPS_PROXY / HTTP_PROXY environment variables.
use_proxy = 'Y'
proxies = {
    scheme: os.environ[scheme.upper() + '_PROXY']
    for scheme in ('https', 'http')
}

In [15]:
# Upper bound (seconds) for each request, so a blocked proxy or unreachable
# host fails fast instead of hanging the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

# URLs used to check that both plain HTTP and HTTPS traffic get through.
list_url = ['https://www.google.com',
            'http://www.google.com',
            'https://www.example.com',
            'http://www.example.com',
            'https://github.com/j0hannes/cutter-ng']

for url in list_url:
    print('')
    print('trying to access:'+url)
    try:
        if use_proxy == 'N':
            r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        else:
            # Certificate verification stays enabled (verify=True); the
            # verify=False variant was only used to rule out SSL problems.
            r = requests.get(url, proxies=proxies, verify=True,
                             timeout=REQUEST_TIMEOUT_SECONDS)

        if r.status_code == requests.codes.ok:
            print('=>OK', r.headers['content-type'])
        else:
            # e.g. 407 Proxy Authentication Required
            print('=> ??', r.status_code)
    except Exception as inst:
        # Best-effort diagnostics: report the failure and move on to the next URL.
        print('=>FAILED')
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print(inst)


trying to access:https://www.google.com
=>OK text/html; charset=ISO-8859-1

trying to access:http://www.google.com
=>OK text/html; charset=ISO-8859-1

trying to access:https://www.example.com
=>OK text/html; charset=UTF-8

trying to access:http://www.example.com
=>OK text/html; charset=UTF-8

trying to access:https://github.com/j0hannes/cutter-ng
=>OK text/html; charset=utf-8


## Extracting some data from BigQuery

In [16]:
# Fetch every row of the schema_stackoverflow table into a pandas DataFrame.
client = bigquery.Client()

query = """
SELECT
*
FROM
`axa-ch-machine-learning-dev.test.schema_stackoverflow`
"""
query_job = client.query(query)
df = query_job.to_dataframe()

# Row count first, then a preview of at most 20 rows as the cell output.
print(len(df))
df.head(20)

20


Unnamed: 0,table_catalog,table_schema,table_name,column_name,field_path,data_type,description
0,bigquery-public-data,stackoverflow,stackoverflow_posts,comment_count,comment_count,INT64,
1,bigquery-public-data,stackoverflow,stackoverflow_posts,post_type_id,post_type_id,INT64,
2,bigquery-public-data,stackoverflow,stackoverflow_posts,parent_id,parent_id,INT64,
3,bigquery-public-data,stackoverflow,stackoverflow_posts,view_count,view_count,INT64,
4,bigquery-public-data,stackoverflow,stackoverflow_posts,answer_count,answer_count,INT64,
5,bigquery-public-data,stackoverflow,stackoverflow_posts,owner_user_id,owner_user_id,INT64,
6,bigquery-public-data,stackoverflow,stackoverflow_posts,id,id,INT64,
7,bigquery-public-data,stackoverflow,stackoverflow_posts,score,score,INT64,
8,bigquery-public-data,stackoverflow,stackoverflow_posts,accepted_answer_id,accepted_answer_id,INT64,
9,bigquery-public-data,stackoverflow,stackoverflow_posts,favorite_count,favorite_count,INT64,


## Preprocessing using Beam/Dataflow

### Define the region

In [17]:
# Region exported through the REGION environment variable.
GCP_REGION = 'europe-west1'
os.environ['REGION'] = GCP_REGION

In [18]:
!gcloud config set compute/region $REGION

Updated property [compute/region].


In [19]:
# Query used as the pipeline's BigQuery source.
def create_query():
    """Return the SQL statement selecting every column of the schema_stackoverflow table."""
    return """SELECT * FROM `axa-ch-machine-learning-dev.test.schema_stackoverflow` """

In [20]:
# Show the table's metadata, including its schema, as pretty-printed JSON.
! bq show --format=prettyjson axa-ch-machine-learning-dev:test.schema_stackoverflow  


Welcome to BigQuery! This script will walk you through the 
process of initializing your .bigqueryrc configuration file.

First, we need to set up your credentials if they do not 
already exist.

Credential creation complete. Now we will select a default project.

List of projects:
[
  {
    "#": 1, 
    "friendlyName": "machine-learning-poc-dev", 
    "projectId": "axa-ch-machine-learning-dev"
  }
]
Found only one project, setting axa-ch-machine-learning-dev as the default.

BigQuery configuration complete! Type "bq" to get started.

{
  "creationTime": "1570200638317", 
  "etag": "/Hv4/YKe/f3sCWLopiHN2g==", 
  "id": "axa-ch-machine-learning-dev:test.schema_stackoverflow", 
  "kind": "bigquery#table", 
  "lastModifiedTime": "1570200638317", 
  "location": "US", 
  "numBytes": "1895", 
  "numLongTermBytes": "0", 
  "numRows": "20", 
  "schema": {
    "fields": [
      {
        "mode": "NULLABLE", 
        "name": "table_catalog", 
        "type": "STRING"
      }, 
      {
        "m

In [21]:
# Every column of the table is a NULLABLE STRING, so the schema is generated
# from the ordered list of column names.
_SCHEMA_COLUMN_NAMES = (
    "table_catalog",
    "table_schema",
    "table_name",
    "column_name",
    "field_path",
    "data_type",
    "description",
)

table_schema = {
    "fields": [
        {"mode": "NULLABLE", "name": column_name, "type": "STRING"}
        for column_name in _SCHEMA_COLUMN_NAMES
    ]
}

In [22]:
def preprocess(RUNNER):
    """
    Build and run a Beam pipeline that reads the result of create_query()
    from BigQuery.

    Arguments:
        -RUNNER: "DirectRunner" or "DataflowRunner". Specify to run the pipeline
        locally or on Google Cloud respectively.
    Side-effects:
        -Creates and executes the pipeline (it is run when the ``with`` block
        exits).
        See https://beam.apache.org/documentation/programming-guide/#creating-a-pipeline
        -Prints the job name before launching and "Done" afterwards.
    Raises:
        -KeyError if BUCKET_NAME, PROJECT_ID or REGION is missing from os.environ.
    """
    # Cloud Storage URIs always use '/' as separator. os.path.join would insert
    # '\\' on Windows and produce invalid gs:// locations, so join with posixpath.
    import posixpath

    # NOTE(review): the "preprocess-mnist" prefix looks like a leftover from
    # another project; kept as-is so existing job names do not change.
    job_name = "preprocess-mnist" + "-" + datetime.datetime.now().strftime("%y%m%d-%H%M%S")
    print("Launching Dataflow job {} ... hang on".format(job_name))
    OUTPUT_DIR = "gs://{0}/project/stackoverflow/".format(os.environ['BUCKET_NAME'])

    # dictionary of pipeline options
    options = {
        "staging_location": posixpath.join(OUTPUT_DIR, "tmp", "staging"),
        "temp_location": posixpath.join(OUTPUT_DIR, "tmp"),
        "job_name": job_name,
        "project": os.environ['PROJECT_ID'],
        "runner": RUNNER,
        "region": os.environ['REGION']
    }

    # instantiate PipelineOptions object using options dictionary
    opts = beam.pipeline.PipelineOptions(flags = [], **options)

    # instantiate Pipeline object using PipelineOptions
    with beam.Pipeline(options=opts) as p:
        (
            p | "Read from BigQuery" >> beam.io.Read(beam.io.BigQuerySource(
                # query
                query = create_query(),
                # use standard SQL for the above query
                use_standard_sql = True)
                                                    )
              #| 'Write to BigQuery' >>  beam.io.WriteToBigQuery(
              #    # The table name is a required argument for the BigQuery
              #    table='schema_stackoverflow_beam',
              #    dataset='test',
              #    project=os.environ['PROJECT_ID'],
              #    # Here we use the JSON schema read in from a JSON file.
              #    # Specifying the schema allows the API to create the table correctly if it does not yet exist.
              #    schema=table_schema,
              #    # Creates the table in BigQuery if it does not yet exist.
              #    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
              #    # Deletes all data in the BigQuery table before writing.
              #    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
        )
        #p.run().wait_until_finish()
    print("Done")

In [None]:
# Run the preprocessing pipeline locally ("DirectRunner").
preprocess("DirectRunner")

Launching Dataflow job preprocess-mnist-191011-152650 ... hang on


 Traceback for above exception (most recent call last):
  File "C:\Users\c311723\.conda\envs\env_data_loss_prev\lib\site-packages\apache_beam\utils\retry.py", line 206, in wrapper
    return fun(*args, **kwargs)
  File "C:\Users\c311723\.conda\envs\env_data_loss_prev\lib\site-packages\apache_beam\io\gcp\bigquery_tools.py", line 261, in get_query_location
    response = self.client.jobs.Insert(request)
  File "C:\Users\c311723\.conda\envs\env_data_loss_prev\lib\site-packages\apache_beam\io\gcp\internal\clients\bigquery\bigquery_v2_client.py", line 342, in Insert
    upload=upload, upload_config=upload_config)
  File "C:\Users\c311723\.conda\envs\env_data_loss_prev\lib\site-packages\apitools\base\py\base_api.py", line 729, in _RunMethod
    http, http_request, **opts)
  File "C:\Users\c311723\.conda\envs\env_data_loss_prev\lib\site-packages\apitools\base\py\http_wrapper.py", line 356, in MakeRequest
    max_retry_wait, total_wait_sec))
  File "C:\Users\c311723\.conda\envs\env_data_loss_p