# Environment

## Libraries

In [None]:
import os
import pathlib
import re
import string
import sys

import apache_beam as beam
import bs4
import spacy

## Working Directory

In [None]:
# Locate the project root: start at the current directory and climb towards
# the filesystem root until a directory containing a 'notebook' entry is
# found, giving up after six levels. The directory reached is then put first
# on sys.path and made the working directory.
workingdir = os.getcwd()
d = os.listdir(workingdir)
n = 0
while 'notebook' not in d:
    # Path.parent returns the root itself once the top of the filesystem is
    # reached, whereas parents[0] raises IndexError there.
    workingdir = str(pathlib.Path(workingdir).parent)
    d = os.listdir(workingdir)
    n += 1
    if n > 5:
        # Stop climbing; the directory reached so far is used as-is.
        break
sys.path.insert(0, workingdir)
os.chdir(workingdir)

## Configuring spaCy for NLP Operations

In [None]:
# Shell escape: download spaCy's en_core_web_sm model into the current
# environment. CleanText loads this model by name via spacy.load.
! python -m spacy download en_core_web_sm

## Apache Beam and GCP Settings

In [None]:
# All three names come from apache_beam.options.pipeline_options: fetch the
# module once, keep handles to the two option-view classes, and create the
# options object that the pipeline cells below share.
_options_module = beam.options.pipeline_options
standard_options = _options_module.StandardOptions
gcp_options = _options_module.GoogleCloudOptions
pipeline_options = _options_module.PipelineOptions()

In [None]:
# Fill in the Google Cloud view of the shared pipeline options and select
# the Dataflow runner.
google_cloud_options = pipeline_options.view_as(gcp_options)
google_cloud_options.project = 'axa-ch-machine-learning-dev'
# Dataflow job names may only contain lowercase letters, digits and hyphens
# (starting with a letter), so hyphens replace the underscores used before.
google_cloud_options.job_name = 'nlp-text-classification-preprocessing'
# NOTE(review): both locations point at the bare bucket; Beam's option
# validation may expect an object path such as gs://bucket/staging - confirm.
google_cloud_options.staging_location = 'gs://nlp_text_classification'
google_cloud_options.temp_location = 'gs://nlp_text_classification'
pipeline_options.view_as(standard_options).runner = 'DataflowRunner'

# Creating a DoFn Object

In [None]:
class Split(beam.DoFn):
    """Turn one line of CSV text into a dict of post fields."""

    def process(self, element):
        """Parse a single CSV line into its four columns.

        Args:
            element: one line of text holding the id, title, body and tags
                columns, in that order, separated by commas.

        Returns:
            A one-element list containing a dict with the keys ``id``,
            ``title``, ``body`` and ``tags``; every value is a string.

        Raises:
            ValueError: if the line does not hold exactly four fields.
        """
        # Function-scope import keeps this DoFn self-contained.
        import csv

        # csv.reader honours quoting, so a comma inside a quoted title or
        # body no longer splits the record into too many columns (a plain
        # str.split(",") did).
        post_id, title, body, tags = next(csv.reader([element]))

        return [{
            'id': post_id,
            'title': title,
            'body': body,
            'tags': tags
        }]

In [None]:
class CleanText(beam.DoFn):
    """Convert the text fields of a post into lists of tokens.

    The title and the HTML-stripped body are run through spaCy and reduced
    to lower-cased lemmas; the tags string is split on ``'|'``.
    """

    # Lemmas discarded in addition to spaCy's stop words: every single
    # punctuation character and digit, plus '-pron-' (presumably spaCy's
    # lower-cased pronoun placeholder lemma - confirm for the spaCy version
    # in use). Built once instead of on every call.
    _EXCLUDED_LEMMAS = frozenset(string.punctuation + string.digits) | {'-pron-'}

    def __init__(self):
        super().__init__()
        # The parser and NER components are disabled when loading the model.
        # NOTE(review): the model is loaded at construction time and so
        # travels with the DoFn instance - confirm this is acceptable on
        # remote runners.
        self.spacy = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def __decode_html(self, input_str: str) -> str:
        """Return the text content of ``input_str`` with HTML markup removed."""
        return bs4.BeautifulSoup(input_str, 'html.parser').text

    def __nlp(self, input_str: str) -> list:
        """Return the lower-cased lemmas of ``input_str``.

        Tokens flagged by spaCy as stop words are skipped, as are lemmas
        listed in ``_EXCLUDED_LEMMAS``.
        """
        lemmas = []
        for token in self.spacy(input_str):
            if token.is_stop:
                continue
            lemma = token.lemma_.lower()
            if lemma not in self._EXCLUDED_LEMMAS:
                lemmas.append(lemma)
        return lemmas

    def __split_tags(self, tags: str) -> list:
        """Split a ``'|'``-separated tag string into a list of tags.

        A missing (``None``) or empty value yields an empty list rather
        than raising or producing a single empty tag.
        """
        return tags.split('|') if tags else []

    def process(self, element):
        """Clean one post.

        Args:
            element: mapping with the keys ``id``, ``title``, ``body`` and
                ``tags``. ``title``, ``body`` and ``tags`` may be ``None``.

        Returns:
            A one-element list containing a dict with ``id`` as an int and
            ``title``, ``body`` and ``tags`` as lists of strings.
        """
        # Missing text fields are treated as empty text.
        title_tokens = self.__nlp(element['title'] or '')
        # Strip the markup from the *body* before tokenising it; the tags
        # field was being decoded here instead, so the body output was
        # derived from the tags.
        body_text = self.__decode_html(element['body'] or '')
        body_tokens = self.__nlp(body_text)
        tag_list = self.__split_tags(element['tags'])

        return [{'id': int(element['id']),
                 'title': title_tokens,
                 'body': body_tokens,
                 'tags': tag_list}]

# Pipeline

## Local Pipeline

In [None]:
# Input for the local test run.
local_file = 'data/beam_test.csv'

# WriteToText treats 'data/beam_output.txt' as a file-name prefix and appends
# a shard suffix (e.g. '-00000-of-00001'), so earlier shard files are cleared
# here along with any file carrying the bare name.
_output_dir = pathlib.Path('data')
_stale_outputs = [_output_dir / 'beam_output.txt']
_stale_outputs.extend(_output_dir.glob('beam_output.txt-*-of-*'))
for _stale in _stale_outputs:
    if _stale.is_file():
        _stale.unlink()

In [None]:
# Local run: read the CSV test file line by line, parse every line into a
# dict, clean its text fields and write the results under data/.
with beam.Pipeline(argv=sys.argv) as p:
    # Labelled transforms are built first ...
    read_step = "ReadLocalFile" >> beam.io.ReadFromText(local_file)
    split_step = "CreateDictionary" >> beam.ParDo(Split())
    clean_step = "ProcessFields" >> beam.ParDo(CleanText())
    write_step = "WriteLocalFile" >> beam.io.WriteToText('data/beam_output.txt')

    # ... and then applied in order.
    file = p | read_step
    table = file | split_step
    clean_text = table | clean_step
    clean_text | write_step

## GCP Pipeline

In [None]:
# Source query for the Dataflow run. It is executed as legacy SQL (the
# BigQuerySource default when use_standard_sql is not set), where a table
# reference whose project id contains hyphens must be wrapped in square
# brackets.
query = '''SELECT
  id,
  title,
  body,
  tags
FROM
  [bigquery-public-data:stackoverflow.stackoverflow_posts]'''

In [None]:
# Schema of the output table. CleanText emits title, body and tags as lists
# of strings; BigQuery has no 'ARRAY' field type in a table schema, a list
# column is declared as its element type with mode 'REPEATED'.
table_schema = {'fields': [
    {'name': 'id', 'type': 'NUMERIC', 'mode': 'REQUIRED'},
    {'name': 'title', 'type': 'STRING', 'mode': 'REPEATED'},
    {'name': 'body', 'type': 'STRING', 'mode': 'REPEATED'},
    {'name': 'tags', 'type': 'STRING', 'mode': 'REPEATED'},
]}
# Destination in 'dataset.table' form.
new_table = 'nlp_text_classification.stackoverflow_posts_preprocessed'

In [None]:
# Dataflow run: query the source posts from BigQuery, clean their text
# fields and write the result to the preprocessed table, replacing any
# earlier contents and creating the table if it does not exist yet.
with beam.Pipeline(options=pipeline_options) as p:
    # BigQuerySource's first positional parameter is the table name, so the
    # SQL text has to be passed through the `query` keyword.
    table = p                 | "QueryTable" >> beam.io.Read(beam.io.BigQuerySource(query=query))
    clean_text = table        | "ProcessFields" >> beam.ParDo(CleanText())
    clean_text                | "WriteTable" >> beam.io.WriteToBigQuery(
                                                    new_table,
                                                    schema=table_schema,
                                                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)