# NLP Preprocessing using Beam/Dataflow
## Setup Working Directory

In [1]:
import sys
import os
import pathlib

workingdir=os.getcwd()
#print(workingdir)
d=[d for d in os.listdir(workingdir)]
n=0
while not set(['notebook']).issubset(set(d)):
    workingdir=str(pathlib.Path(workingdir).parents[0])
    #print(workingdir)
    
    
    
    
    d=[d for d in os.listdir(str(workingdir))]
    n+=1
    if n>5:
        break
sys.path.insert(0, workingdir)
os.chdir(workingdir)

## Import libraries

In [41]:
import os
import logging
import subprocess
import datetime
import subprocess, requests
import apache_beam as beam
from google.cloud import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
import en_core_web_sm
import bs4
import string

## Defined GCP env variables

In [3]:
# get all variables here
os.environ['PROJECT_ID'] =  subprocess.run('gcloud config list project --format "value(core.project)"',
                                             shell=True, check=True,
                                             stdout=subprocess.PIPE).stdout.decode().replace('\n', '').replace('\r', '')

os.environ['REGION'] = subprocess.run('gcloud config get-value compute/region  2> /dev/null',
                                      shell=True, check=True,
                                      stdout=subprocess.PIPE).stdout.decode().replace('\n', '').replace('\r', '')

In [4]:
try:
    tmp=os.environ['PROJECT_ID']
except:
    print('Env variable PROJECT not defined!') 

try:
    tmp=os.environ['BUCKET_NAME']
except:
    print('Env variable BUCKET_NAME not defined!') 

try:
    tmp=os.environ['REGION']
except:
    print('Env variable REGION not defined!') 

try:    
    tmp=os.environ['GOOGLE_APPLICATION_CREDENTIALS']
except:
    print('Env variable GOOGLE_APPLICATION_CREDENTIALS not defined!') 

try:
    tmp=os.environ['REQUESTS_CA_BUNDLE']
except:
    print('Env variable REQUESTS_CA_BUNDLE not defined!') 

try:
    tmp=os.environ['AXA_CH_CA_BUNDLE']
except:
    print('Env variable AXA_CA_CA_BUNDLE not defined!') 

Env variable GOOGLE_APPLICATION_CREDENTIALS not defined!
Env variable REQUESTS_CA_BUNDLE not defined!
Env variable AXA_CA_CA_BUNDLE not defined!


## Creating a DoFn Object

In [5]:
class Split(beam.DoFn):
    def process(self, element):
        self.id, self.title, self.body, self.tags = element.split(",")

        return [{
            'id': self.id,
            'title': self.title,
            'body': self.body,
            'tags': self.tags
        }]

In [36]:
class CleanText(beam.DoFn):
    def __init__(self):
        self.spacy = en_core_web_sm.load()
        
    def __decode_html(self, input_str: str) -> str:
        self.soup = bs4.BeautifulSoup(input_str, 'html.parser')
        self.output = self.soup.text
        return self.output

    def __nlp(self, input_str: str) -> list:
        self.doc = self.spacy(input_str)
        self.stopwords = list(string.punctuation + string.digits) + ['-pron-']
        self.output = [token.lemma_.lower() for token in self.doc if not token.is_stop 
                  and token.lemma_.lower() not in self.stopwords]
        return self.output

    def __split_tags(self, tags: str) -> list:
        return tags.split('|')

    def process(self, element):
        self.title_array = self.__nlp(element['title'])
        self.body_decoded = self.__decode_html(element['tags'])
        self.body_array = self.__nlp(self.body_decoded)
        self.tag_array = self.__split_tags(element['tags'])
        
        return [{'id': int(element['id']), 
                 'title': self.title_array, 
                 'body': self.body_array, 
                 'tags': self.tag_array}]

In [37]:
# define query table
def create_query():
    query = """
    SELECT
      id,
      title,
      body,
      tags
    FROM
      `bigquery-public-data.stackoverflow.stackoverflow_posts`
    LIMIT 100
    """

    return query

In [38]:
table_schema = {'fields': [
    {'name': 'id', 'type': 'NUMERIC', 'mode': 'REQUIRED'},
    {'name': 'title', 'type': 'ARRAY', 'mode': 'NULLABLE'},
    {'name': 'body', 'type': 'ARRAY', 'mode': 'NULLABLE'},
    {'name': 'tags', 'type': 'ARRAY', 'mode': 'NULLABLE'},
]}
new_table = 'nlp_text_classification.stackoverflow_posts_preprocessed'

## Preprocessing using Beam/Dataflow

In [52]:
def preprocess():
    """
    Arguments:
        -RUNNER: "DirectRunner" or "DataflowRunner". Specfy to run the pipeline locally or on Google Cloud respectively.
    Side-effects:
        -Creates and executes dataflow pipeline.
        See https://beam.apache.org/documentation/programming-guide/#creating-a-pipeline
    """
    job_name = 'test-stackoverflow' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    project = os.environ['PROJECT_ID']
    region = os.environ['REGION']
    output_dir = "gs://{0}/stackoverflow/".format(os.environ['BUCKET_NAME'])
    local_file = 'data/beam_test.csv'

    # options    
    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project =  project
    google_cloud_options.job_name =  job_name
    google_cloud_options.region = region
    google_cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    google_cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    # done by command line
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # instantantiate Pipeline object using PipelineOptions
    print('Launching Dataflow job {} ... hang on'.format(job_name))

    p = beam.Pipeline(options=options)    
    table = p | 'Read from BigQuery' >> beam.io.Read(beam.io.BigQuerySource(
        # query
        query=create_query(),
        # use standard SQL for the above query
        use_standard_sql=True))
    clean_text = table | 'Clean Text' >> beam.ParDo(CleanText())
    #clean_text | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
    #    # The table name is a required argument for the BigQuery
    #    table='test_stackoverflow_pp',
    #    dataset='test',
    #    project=project,
    #    # Here we use the JSON schema read in from a JSON file.
    #    # Specifying the schema allows the API to create the table correctly if it does not yet exist.
    #    schema=table_schema,
    #    # Creates the table in BigQuery if it does not yet exist.
    #    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    #    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    
    #output = p | 'Read from BigQuery' >> beam.io.Read(beam.io.BigQuerySource(
    #    # query
    #    query=create_query(),
    #    # use standard SQL for the above query
    #    use_standard_sql=True)
    #    )
    #output | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
    #    # The table name is a required argument for the BigQuery
    #    table='test_stackoverflow_beam',
    #    dataset='test',
    #    project=project,
    #    # Here we use the JSON schema read in from a JSON file.
    #    # Specifying the schema allows the API to create the table correctly if it does not yet exist.
    #    schema=table_schema,
    #    # Creates the table in BigQuery if it does not yet exist.
    #    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    #    # Deletes all data in the BigQuery table before writing.
    #    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    #    # not needed, from with clause

    if options.view_as(StandardOptions).runner == 'DataflowRunner':
        print('DataflowRunner')
        p.run()
    else:
        print('Default: DirectRunner')
        result = p.run()
        result.wait_until_finish()
    print('Done')

In [54]:
if __name__ == '__main__':
    #logging.getLogger().setLevel(logging.DEBUG)
    logging.getLogger().setLevel(logging.NOTSET)

    print('Starting main process ...')
    preprocess()

Starting main process ...
Launching Dataflow job test-stackoverflow-191104-121455 ... hang on
DataflowRunner


INFO:root:Starting GCS upload to gs://nlp-text-classification/stackoverflow/tmp/staging/test-stackoverflow-191104-121455.1572869729.334275/pipeline.pb...
INFO:root:Completed GCS upload to gs://nlp-text-classification/stackoverflow/tmp/staging/test-stackoverflow-191104-121455.1572869729.334275/pipeline.pb in 2 seconds.
INFO:root:Downloading source distribution of the SDK from PyPi
INFO:root:Executing command: ['/home/.conda-env/env_nlp_text_class/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmp70msl3z6', 'apache-beam==2.16.0', '--no-deps', '--no-binary', ':all:']
INFO:root:Staging SDK sources from PyPI to gs://nlp-text-classification/stackoverflow/tmp/staging/test-stackoverflow-191104-121455.1572869729.334275/dataflow_python_sdk.tar
INFO:root:Starting GCS upload to gs://nlp-text-classification/stackoverflow/tmp/staging/test-stackoverflow-191104-121455.1572869729.334275/dataflow_python_sdk.tar...
INFO:root:Completed GCS upload to gs://nlp-text-classification/stackoverflow/tmp/st

HttpError: HttpError accessing <https://dataflow.googleapis.com/v1b3/projects/nlp-text-classification/locations//jobs?alt=json>: response: <{'content-type': 'text/html; charset=UTF-8', 'referrer-policy': 'no-referrer', 'content-length': '2393', 'date': 'Mon, 04 Nov 2019 12:16:14 GMT', 'connection': 'close', 'status': '413'}>, content <<!DOCTYPE html>
<html lang=en>
  <meta charset=utf-8>
  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">
  <title>Error 413 (Request Entity Too Large)!!1</title>
  <style>
    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}
  </style>
  <a href=//www.google.com/><span id=logo aria-label=Google></span></a>
  <p><b>413.</b> <ins>That���s an error.</ins>
  <p>Your client issued a request that was too large.
 <script>
  (function() { var c=function(a,d,b){a=a+"=deleted; path="+d;null!=b&&(a+="; domain="+b);document.cookie=a+"; expires=Thu, 01 Jan 1970 00:00:00 GMT"};var g=function(a){var d=e,b=location.hostname;c(d,a,null);c(d,a,b);for(var f=0;;){f=b.indexOf(".",f+1);if(0>f)break;c(d,a,b.substring(f+1))}};var h;if(4E3<unescape(encodeURI(document.cookie)).length){for(var k=document.cookie.split(";"),l=[],m=0;m<k.length;m++){var n=k[m].match(/^\s*([^=]+)/);n&&l.push(n[1])}for(var p=0;p<l.length;p++){var e=l[p];g("/");for(var q=location.pathname,r=0;;){r=q.indexOf("/",r+1);if(0>r)break;var t=q.substring(0,r);g(t);g(t+"/")}"/"!=q.charAt(q.length-1)&&(g(q),g(q+"/"))}h=!0}else h=!1;
h&&setTimeout(function(){if(history.replaceState){var a=location.href;history.replaceState(null,"","/");location.replace(a)}},1E3); })();

</script>
 <ins>That���s all we know.</ins>
>

## Apache Beam and GCP Settings

In [1]:
from setuptools import find_packages
find_packages()

[]

In [None]:
with beam.Pipeline(argv=sys.argv) as p:
    file = p                  | "ReadLocalFile" >> beam.io.ReadFromText(local_file)
    table = file              | "CreateDictionary"  >> beam.ParDo(Split())
    clean_text = table        | "ProcessFields" >> beam.ParDo(CleanText())
    clean_text                | "WriteLocalFile" >> beam.io.WriteToText('data/beam_output.txt')

## GCP Pipeline

In [None]:
query = '''SELECT
  id,
  title,
  body,
  tags
FROM
  bigquery-public-data:stackoverflow.stackoverflow_posts'''

In [None]:
table_schema = {'fields': [
    {'name': 'id', 'type': 'NUMERIC', 'mode': 'REQUIRED'},
    {'name': 'title', 'type': 'ARRAY', 'mode': 'NULLABLE'},
    {'name': 'body', 'type': 'ARRAY', 'mode': 'NULLABLE'},
    {'name': 'tags', 'type': 'ARRAY', 'mode': 'NULLABLE'},
]}
new_table = 'nlp_text_classification.stackoverflow_posts_preprocessed'

In [None]:
with beam.Pipeline(options=pipeline_options) as p:
    table = p                 | "QueryTable" >> beam.io.Read(beam.io.BigQuerySource(query))
    clean_text = table        | "ProcessFields" >> beam.ParDo(CleanText())
    clean_text                | "WriteTable" >> beam.io.WriteToBigQuery(
                                                    new_table,
                                                    schema=table_schema,
                                                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)