# Environment

In [1]:
import data.beam_debug as debug, src.preprocessing.test_dofn as dofn
import time, google.cloud.bigquery as bigquery

# Data

In [2]:
# BigQuery client used for every query in this notebook.
# NOTE(review): no project or credentials are passed here, so this presumably
# relies on the environment's defaults — confirm.
client = bigquery.Client()

In [3]:
# Shared SQL skeleton: both debug queries read the same columns from the same
# table and differ only in the id list substituted for ``{ids}``.
_POSTS_BY_ID_SQL = """
SELECT
  id,
  title,
  body
FROM
  stackoverflow.posts_p1
WHERE
  id IN {ids}"""

# NOTE(review): the id collections are interpolated as text; presumably
# debug.minute_1 / debug.minute_5 render as a parenthesised SQL list — confirm.
query_minute_1 = _POSTS_BY_ID_SQL.format(ids=debug.minute_1)
query_minute_5 = _POSTS_BY_ID_SQL.format(ids=debug.minute_5)

In [4]:
# Run the minute-1 query and load the full result into a pandas DataFrame.
df_minute_1 = client.query(query_minute_1).to_dataframe()

In [5]:
# Run the minute-5 query and load the full result into a pandas DataFrame.
df_minute_5 = client.query(query_minute_5).to_dataframe()

# Instantiating DoFn Object

In [6]:
# Build the NLP preprocessing object from the DoFn test module (imported as
# `dofn`); its `nlp` method is what the timing loop below exercises.
preprocessing_object = dofn.NLP()

# Testing Execution Speed (t >= 300s With DataFlow)

In [7]:
# Preview the first rows of the minute-5 sample.
df_minute_5.head()

Unnamed: 0,id,title,body
0,35438993,,<p>You specify <code>x in range(5)</code> so i...
1,5674247,,<p>You should try to accomplish this (an Accor...
2,24131283,Android ListFragment in Tabs not showing on ba...,<p>I have a (relatively) simple drill down app...
3,11432394,,<p>In MVC you wouldn't normally want to be wor...
4,32584268,,<p>Try this...</p> <pre><code>&lt;?xml version...


In [8]:
# Pull the two text columns out of the minute-5 sample as plain Python lists.
body = df_minute_5['body'].tolist()
title = df_minute_5['title'].tolist()

In [9]:
# Accumulator for per-post processing times, in seconds.
times = []

In [10]:
# Time the NLP preprocessing of each post (body first, then title) and append
# the elapsed seconds for that post to `times`.
# NOTE(review): the preview of df_minute_5 shows posts with an empty title;
# confirm that `preprocessing_object.nlp` accepts missing titles.
for body_element, title_element in zip(body, title):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()

    temp1 = preprocessing_object.nlp(body_element)
    temp2 = preprocessing_object.nlp(title_element)

    end_time = time.perf_counter()
    total_time = end_time - start_time

    times.append(total_time)

ValueError: [E167] Unknown morphological feature: 'Person' (2313063860588076218). This can happen if the tagger was trained with a different set of morphological features. If you're using a pretrained model, make sure that your models are up to date:
python -m spacy validate

In [11]:
# Show the collected timings. The list is empty when the timing loop raised
# before completing its first iteration (see the ValueError above).
times

[]

In [13]:
# Testing Memory Leakage

In [18]:
import random
import spacy
import plac
import psutil
import sys
import objgraph
import gc

# NOTE: DEBUG_SAVEALL makes the collector append every unreachable object it
# finds to gc.garbage instead of freeing it, so memory readings taken while
# this flag is set include that retained garbage.
gc.set_debug(gc.DEBUG_SAVEALL)

def load_data():
    """Return 10,000 synthetic sentences for the memory test.

    Each sentence embeds a distinct number drawn at random (without
    replacement) from ``range(100_000)``.
    """
    doc_numbers = random.sample(range(100_000), 10_000)
    template = "This is a fake test document number {:d}."
    return [template.format(number) for number in doc_numbers]


# def print_memory_usage():
#     print(objgraph.show_growth(limit=5))
#     print("GC count="+str(gc.get_count()))
#     gc.collect()

class ReloadableNlp:
    """Hold a spaCy pipeline and reload it from scratch at a fixed interval.

    Args:
        model: Name (or path) passed to ``spacy.load``.
        reload: Number of ``get_nlp`` calls between reloads. A falsy value
            (``0`` or ``None``) disables reloading.
    """

    def __init__(self, model, reload=1000):
        self.model = model
        self.reload = reload
        # Number of times get_nlp() has been called so far.
        self.count = 0
        self.nlp = spacy.load(model)

    def get_nlp(self):
        """Return the current pipeline, reloading it every ``self.reload`` calls."""
        self.count += 1
        # Use the configured interval. It was previously hard-coded to 1_000,
        # which silently ignored the `reload` argument. The truthiness guard
        # avoids a modulo-by-zero when reloading is disabled.
        if self.reload and self.count % self.reload == 0:
            # Drop the old pipeline and force a collection before loading a
            # fresh one, so the two are not held at the same time.
            del self.nlp
            gc.collect()
            self.nlp = spacy.load(self.model)
        return self.nlp



def parse_texts(reloadable, texts, iterations=1_000):
    """Lazily yield parsed docs for ``texts``, repeated ``iterations`` times.

    At the start of every pass a pipeline is requested from
    ``reloadable.get_nlp()`` and the whole of ``texts`` is streamed through
    its ``pipe`` method with ``cleanup=True``.
    """
    for _ in range(iterations):
        pipeline = reloadable.get_nlp()
        yield from pipeline.pipe(texts, cleanup=True)

@plac.annotations(
    iterations=("Number of iterations", "option", "n", int),
    model=("spaCy model to load", "positional", None, str)
)
def main(model='en_core_web_sm', iterations=1_000):
    """Stream synthetic texts through spaCy and report system memory usage.

    Args:
        model: spaCy model name handed to ``ReloadableNlp``.
        iterations: Number of passes over the generated texts.

    Prints the running document index and ``psutil.virtual_memory().percent``
    once every 10,000 parsed documents.
    """
    texts = load_data()
    reloadable = ReloadableNlp(model)
    parsed_docs = parse_texts(reloadable, texts, iterations=iterations)
    for doc_index, _doc in enumerate(parsed_docs):
        if doc_index % 10_000 != 0:
            continue
        print(doc_index, psutil.virtual_memory().percent)
        # Flush so progress lines appear promptly during long runs.
        sys.stdout.flush()




In [19]:
# Run the memory check with its defaults (1,000 passes over 10,000 texts).
# This is a long run; the recorded output below was interrupted manually.
main()

0 24.2
10000 24.4
20000 24.4
30000 24.4
40000 24.5
50000 24.5
60000 24.4
70000 24.3
80000 24.3
90000 24.3
100000 24.3
110000 24.3
120000 24.3
130000 24.4
140000 24.4
150000 24.3
160000 24.3
170000 24.4
180000 24.6
190000 24.1
200000 24.1
210000 24.1
220000 24.1
230000 24.1
240000 24.2
250000 24.2
260000 24.2
270000 24.1
280000 24.0


KeyboardInterrupt: 

In [16]:
# Check whether the installed spaCy model packages match the installed spaCy.
! python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/.conda-env/env_nlp_text_class/lib/python3.6/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                       
package   en-core-web-sm   en_core_web_sm   [38;5;1m2.1.0[0m   --> 2.2.5

[1m
Use the following commands to update the model packages:
python -m spacy download en_core_web_sm



In [17]:
# Download the current en_core_web_sm package, as suggested by `spacy validate`.
# NOTE(review): the pip output below reports that this project requires
# spacy==2.1.8 while spacy 2.2.3 is installed — confirm which spaCy/model
# versions are intended before relying on this environment.
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0MB)
[K     |████████████████████████████████| 12.0MB 630kB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.5-cp36-none-any.whl size=12011741 sha256=3afda6b7e41e103cdf11232ec9b4b6904075287c975c75f4e8e122077a49aae6
  Stored in directory: /tmp/pip-ephem-wheel-cache-_uftc5ni/wheels/6a/47/fb/6b5a0b8906d8e8779246c67d4658fd8a544d4a03a75520197a
Successfully built en-core-web-sm
[31mERROR: nlp-text-classification-with-gcp 0.1 has requirement spacy==2.1.8, but you'll have spacy 2.2.3 which is incompatible.[0m
Installing collected packages: e