In [1]:
import dask
import dask.bag as db
import dask.dataframe as dd
from dask.distributed import get_worker
from dask.distributed import Client, LocalCluster, progress

import itertools

In [2]:
# cluster.close()

In [3]:
# client = Client(processes=True)
cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='6GB', processes=True)
client = Client(cluster)
# client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:33611  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 24.00 GB


In [4]:
# client.close()
# cluster.close()

In [3]:
import glob
import pickle
import spacy
import pandas as pd

from wb_nlp import dir_manager
from wb_nlp.cleaning import cleaner

In [4]:
SOURCE_DIR = dir_manager.get_data_dir('raw', 'sample_data', 'TXT_SAMPLE')
TARGET_DIR = dir_manager.get_data_dir('preprocessed', 'sample_data', 'TXT_SAMPLE')

# Create TARGET_DIR if not available
if not os.path.isdir(TARGET_DIR):
    os.makedirs(TARGET_DIR)

f = [f for f in glob.glob(os.path.join(SOURCE_DIR, '*.txt'))]
len(f), f[0].replace(SOURCE_DIR, TARGET_DIR)

(107, '/workspace/data/preprocessed/sample_data/TXT_SAMPLE/wb_13720575.txt')

In [7]:
def dask_get_adverbs(f):

    worker = get_worker()

    try:
        nlp = worker.nlp
    except AttributeError:
        nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])

        # Fine to increase max_length since we're not using the NER and the parser.
        nlp.max_length = int(1e9)
        worker.nlp = nlp

    with open(f, "rb") as fl:
        text = fl.read().decode("utf-8", errors="ignore")

    doc = nlp(text.lower())

    return sorted({token.lemma_ for token in doc if token.pos_ == "ADV"})

In [8]:
def dask_get_adverbs_from_files(flist):

    worker = get_worker()

    try:
        nlp = worker.nlp
    except AttributeError:
        nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])

        # Fine to increase max_length since we're not using the NER and the parser.
        nlp.max_length = int(1e9)
        worker.nlp = nlp

    def generate_text(flist):
        for f in flist:
            with open(f, "rb") as fl:
                text = fl.read().decode("utf-8", errors="ignore")

            yield text.lower()  # texts.append(text.lower())

    texts = generate_text(flist)

    # texts = []
    # for f in flist:
    #     with open(f, "rb") as fl:
    #         text = fl.read().decode("utf-8", errors="ignore")

    #     texts.append(text.lower())

    adverbs = set()

    for doc in nlp.pipe(texts):
        adverbs.update({token.lemma_ for token in doc if token.pos_ == "ADV"})

    return adverbs

In [6]:
from joblib import Parallel, delayed


In [5]:
nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])
# Fine to increase max_length since we're not using the NER and the parser.
nlp.max_length = int(1e9)


def joblib_get_adverbs(f):

    with open(f, "rb") as fl:
        text = fl.read().decode("utf-8", errors="ignore")

    doc = nlp(text.lower())

    return sorted({token.lemma_ for token in doc if token.pos_ == "ADV"})

In [8]:
%%time

with Parallel(n_jobs=4, backend='multiprocessing') as parallel:
    res = parallel(delayed(joblib_get_adverbs)(i) for i in f)

res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 301 ms, sys: 150 ms, total: 451 ms
Wall time: 1min 16s


2974

In [12]:
%%time

b = db.from_sequence(f, npartitions=8)
adverb_extractor = b.map(dask_get_adverbs)

res = adverb_extractor.compute()

res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 8.3 s, sys: 4.85 s, total: 13.1 s
Wall time: 1min 41s


2974

In [9]:
%%time

b = db.from_sequence(f, npartitions=8)
adverb_extractor = b.map(dask_get_adverbs)

res = adverb_extractor.compute()

res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 12.9 s, sys: 7 s, total: 19.9 s
Wall time: 2min 11s


2974

In [9]:
%%time
fi = iter(f)
size = 5
file_lists = [l for l in [list(itertools.islice(fi, size)) for i in range((len(f) // size) + 1)] if len(l) > 0]

b = db.from_sequence(file_lists, npartitions=8)
adverb_extractor = b.map(dask_get_adverbs_from_files)

res = adverb_extractor.compute()

res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 10.4 s, sys: 5.26 s, total: 15.6 s
Wall time: 2min 9s


2974

In [11]:
%%time
fi = iter(f)
size = 5
file_lists = [l for l in [list(itertools.islice(fi, size)) for i in range((len(f) // size) + 1)] if len(l) > 0]

b = db.from_sequence(file_lists, npartitions=8)
adverb_extractor = b.map(dask_get_adverbs_from_files)

res = adverb_extractor.compute()

res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 10.2 s, sys: 5.35 s, total: 15.5 s
Wall time: 1min 59s


2974

In [15]:
client.close()

In [15]:
set([i for i in itertools.chain.from_iterable(res)])

{'"l',
 '):',
 '-1974',
 '-the',
 '13that',
 '16-dec-2016',
 'ably',
 'about',
 'above',
 'abreast',
 'abroad',
 'absolutely',
 'ac',
 'accordingly',
 'accurately',
 'aco._ii],a',
 'acquisitionhigher',
 'across',
 'actively',
 'activitiest',
 'actually',
 'acutely',
 'adc',
 'additionally',
 'adequately',
 'admin-',
 'administratively',
 'admittedly',
 'adverse',
 'adversely',
 'after',
 'afterwards',
 'again',
 'aggressively',
 'ago',
 'ahclýil',
 'ahead',
 'aibandh',
 'alike',
 'all',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'alreety',
 'also',
 'alternatively',
 'altogether',
 'always',
 'amendedand',
 'amkodor',
 'amply',
 'anaconda',
 'andaccesspath',
 'andc',
 'andneedspreviously',
 'andstandard',
 'andthereby',
 'andwere',
 'andwill',
 'animously',
 'anit',
 'annually',
 'antonio',
 'anywhere',
 'apart',
 'apparently',
 'apparertly',
 'appreciably',
 'approachdescribedpreviously',
 'appropriately',
 'approximately',
 'arguably',
 'around',
 'artificially',
 'as'

[['/workspace/data/raw/sample_data/TXT_SAMPLE/wb_13720575.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_29697765.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_11620720.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_6719860.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_30662711.txt'],
 ['/workspace/data/raw/sample_data/TXT_SAMPLE/wb_26181916.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_891710.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_740750.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_724956.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_8640074.txt'],
 ['/workspace/data/raw/sample_data/TXT_SAMPLE/wb_12691447.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_11143237.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_12187862.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_724576.txt',
  '/workspace/data/raw/sample_data/TXT_SAMPLE/wb_739916.txt'],
 ['/workspace/data/raw/sample_data/TXT_SAMPLE/wb_