In [1]:
import dask
import dask.bag as db
import dask.dataframe as dd
from dask.distributed import get_worker
from dask.distributed import Client, LocalCluster, progress


In [2]:
# cluster.close()

In [3]:
# client = Client(processes=False)
cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='6GB', processes=True)
client = Client(cluster)
# client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:33489  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 24.00 GB


In [4]:
# client.close()
# cluster.close()

In [5]:
import glob
import pickle
import spacy
import pandas as pd

from wb_nlp import dir_manager, config as conf
from wb_nlp.cleaning import cleaner

In [6]:
SOURCE_DIR = dir_manager.get_data_dir('raw', 'sample_data', 'TXT_SAMPLE')
TARGET_DIR = dir_manager.get_data_dir('preprocessed', 'sample_data', 'TXT_SAMPLE')

# Create TARGET_DIR if not available
if not os.path.isdir(TARGET_DIR):
    os.makedirs(TARGET_DIR)

f = [f for f in glob.glob(os.path.join(SOURCE_DIR, '*.txt'))]
len(f), f[0].replace(SOURCE_DIR, TARGET_DIR)

(107, '/workspace/data/preprocessed/sample_data/TXT_SAMPLE/wb_13720575.txt')

In [7]:
def dask_clean_file(f):

    worker = get_worker()

    try:
        nlp = worker.nlp
        lda_cleaner = worker.lda_cleaner
    except AttributeError:
        nlp = spacy.load("en_core_web_sm", exclude=["parser"])
        lda_cleaner = cleaner.LDACleaner(conf.get_config(conf.default_config))
        worker.nlp = nlp
        worker.lda_cleaner = lda_cleaner

    with open(f, "rb") as fl:
        text = fl.read().decode("utf-8", errors="ignore")
        tokens = lda_cleaner.get_tokens_and_phrases(text)

        f = f.replace(SOURCE_DIR, TARGET_DIR)

        with open(f + ".pickle", "wb") as ff:
            pickle.dump(tokens, ff)

    return True

In [15]:
import importlib
importlib.reload(cleaner)

<module 'wb_nlp.cleaning.cleaner' from '/workspace/src/wb_nlp/cleaning/cleaner.py'>

In [16]:
nlp = spacy.load("en_core_web_sm", disable=["parser"])
# Fine to increase max_length since we're not using the NER and the parser.
# nlp.max_length = int(1e9)
lda_cleaner = cleaner.LDACleaner(conf.get_config(conf.default_config))


def joblib_clean_file(f):

    with open(f, "rb") as fl:
        text = fl.read().decode("utf-8", errors="ignore")
        tokens = lda_cleaner.get_tokens_and_phrases(text)

        f = f.replace(SOURCE_DIR, TARGET_DIR)

        with open(f + ".pickle", "wb") as ff:
            pickle.dump(tokens, ff)

    return True

In [9]:
%%time
with open(os.path.join(SOURCE_DIR, 'wb_725385.txt'), 'rb') as l:
    text = l.read().decode('utf-8', errors='ignore')
# tokens = lda_cleaner.get_tokens_and_phrases(text)


CPU times: user 6.8 ms, sys: 6.59 ms, total: 13.4 ms
Wall time: 32.7 ms


In [19]:
%%time

# doc = nlp(text)
doc = lda_cleaner.text_to_doc(text)

CPU times: user 16.6 s, sys: 14.9 s, total: 31.6 s
Wall time: 30.2 s


In [28]:
for t in doc:
    if t.pos_ == 'SPACE':
        continue
    print(t.lower_, t.lemma_, t.pos_, t.ent_type_)
    if t.i > 300:
        break

report report VERB 
no no INTJ 
. . NOUN 
4366-th 4366-th NUM CARDINAL
f F PROPN 
thailand Thailand PROPN 
naanaging naanage VERB 
public public ADJ 
resources resource NOUN 
for for ADP 
structural Structural PROPN 
adjustment Adjustment PROPN 
( ( PUNCT 
in in ADP 
two two NUM CARDINAL
volumes Volumes PROPN 
) ) PUNCT 
volume volume NOUN 
i -PRON- PRON 
part part NOUN 
i -PRON- PRON 
recent recent ADJ 
economic economic ADJ 
developments Developments PROPN 
and and CCONJ 
m1edium m1edium NUM 
term Term PROPN 
outlook Outlook PROPN 
part Part PROPN 
ii II PROPN 
. . PUNCT 
public Public PROPN 
resource Resource PROPN 
management Management PROPN 
and and CCONJ 
planning Planning PROPN 
part Part PROPN 
ill. Illinois PROPN 
sectoral Sectoral PROPN 
adjustment Adjustment PROPN 
: : PUNCT 
policy Policy PROPN 
issues Issues PROPN 
and and CCONJ 
investment Investment PROPN 
programs Programs PROPN 
august August PROPN DATE
31 31 NUM DATE
, , PUNCT DATE
1983 1983 NUM DATE
country Country 

In [21]:
%%time
tokens = lda_cleaner.get_clean_tokens(text)

CPU times: user 20.8 s, sys: 8.01 s, total: 28.9 s
Wall time: 28.7 s


In [23]:
cleaner.respelling.wordninja.split('maybegood')

['maybe', 'good']

In [22]:
tokens[:100]

['report',
 'volume',
 'part',
 'recent',
 'economic',
 'development',
 'document',
 'restricted',
 'distribution',
 'maybe',
 'recipient',
 'lying',
 'performance',
 'official',
 'duty',
 'content',
 'otherwise',
 'disclose',
 'authorization',
 'currency',
 'baht',
 'day',
 'day',
 'cf',
 'metric',
 'ton',
 'bbl',
 'approx',
 'pert',
 'fuel',
 'oil',
 'equal',
 'cal',
 'toe',
 'oil',
 'equivalent',
 'primary',
 'electricity',
 'thermal',
 'replacement',
 'value',
 'natural',
 'gas',
 'lignite',
 'acre',
 'prepare',
 'mission',
 'visit',
 'consist',
 'industry',
 'consultant',
 'consultant',
 'join',
 'support',
 'mission',
 'lead',
 'identification',
 'mission',
 'overlap',
 'economic',
 'mission',
 'term',
 'timing',
 'staffing',
 'closely',
 'associate',
 'work',
 'visit',
 'assist',
 'writing',
 'mission',
 'furthermore',
 'greatly',
 'assist',
 'particular',
 'official',
 'draft',
 'version',
 'discuss',
 'mission',
 'lead',
 'include',
 'discussion',
 'assistive',
 'document',
 '

In [15]:
tokens[:100]

['report',
 'Thailand',
 'managing',
 'Public',
 'Resources',
 'Structural',
 'Adjustment',
 'Volumes',
 'volume',
 'part',
 'recent',
 'economic',
 'development',
 'um',
 'Term',
 'Outlook',
 'Public',
 'Resource',
 'Management',
 'Planning',
 'pectoral',
 'Adjustment',
 'Policy',
 'Issues',
 'Investment',
 'Programs',
 'Programs',
 'Department',
 'East',
 'Asia',
 'document',
 'restricted',
 'distribution',
 'maybe',
 'recipient',
 'lying',
 'performance',
 'official',
 'duty',
 'content',
 'otherwise',
 'disclose',
 'authorization',
 'currency',
 'EQUIVALENTS',
 'Baht',
 'baht',
 'FISCAL',
 'YEAR',
 'WEIGHTS',
 'MEASURES',
 'day',
 'day',
 'cf',
 'Mt',
 'metric',
 'ton',
 'bbl',
 'approx',
 'pert',
 'fuel',
 'oil',
 'Btu',
 'equal',
 'cal',
 'toe',
 'oil',
 'equivalent',
 'primary',
 'electricity',
 'thermal',
 'replacement',
 'value',
 'natural',
 'gas',
 'lignite',
 'acre',
 'ha',
 'prepare',
 'mission',
 'visit',
 'hie',
 'Mission',
 'consist',
 'Mission',
 'Chief',
 'Foreign',
 

In [10]:
from joblib import Parallel, delayed
import joblib

In [11]:
# %%time

# # with Parallel(n_jobs=4, backend='dask') as parallel:
# #     res = parallel(delayed(joblib_clean_file)(i) for i in f)

# with joblib.parallel_backend('dask'):
#     res = Parallel(verbose=10)(delayed(joblib_clean_file)(i) for i in f)

# res = all(res)  # set([i for i in itertools.chain.from_iterable(res)])
# # len(res)
# res

In [12]:
b = db.from_sequence(f, npartitions=8)
file_reader = b.map(dask_clean_file)

In [13]:
%%time
res = file_reader.all().compute()
# res = file_reader.compute()



KilledWorker: ("('dask_clean_file-all-part-6d6513d7a6e5ef3a9764ef5cf528b2a1', 3)", <Worker 'tcp://127.0.0.1:42039', name: 0, memory: 0, processing: 1>)

In [10]:
%%time
res = file_reader.all().compute()
# res = file_reader.compute()

CPU times: user 35 s, sys: 23 s, total: 58 s
Wall time: 6min 35s


In [11]:
with open(os.path.join(TARGET_DIR, 'wb_725385.txt.pickle'), 'rb') as ff:
    data = pickle.load(ff)

In [12]:
len(data['tokens']), len(data['phrases'])

(45512, 13294)

In [13]:
data['tokens']

['public',
 'resource',
 'structural',
 'adjustment',
 'volume',
 'volume',
 'recent',
 'economic',
 'development',
 'term',
 'outlook',
 'public',
 'resource',
 'management',
 'part',
 'sectoral',
 'adjustment',
 'policy',
 'issue',
 'investment',
 'program',
 'country',
 'official',
 'use',
 'only',
 'document',
 'document',
 'distribution',
 'recipient',
 'performance',
 'official',
 'duty',
 'content',
 'authorization',
 'currency',
 'equivalent',
 'baht',
 'baht',
 'fiscal',
 'year',
 'weight',
 'measure',
 'barrel',
 'day',
 'mmscfd',
 'day',
 'tcf',
 'metric',
 'barrel',
 'approx',
 'ton',
 'fuel',
 'oil',
 'british',
 'thermal',
 'unit',
 'equal',
 'toe',
 'ton',
 'oil',
 'equivalent',
 'kcal',
 'btus',
 'kwh',
 'primary',
 'electricity',
 'thermal',
 'replacement',
 'value',
 'natural',
 'gas',
 'lignite',
 'acre',
 'ofiicial',
 'use',
 'only',
 'report',
 'mission',
 'mission',
 'foreign',
 'public',
 'debt',
 'public',
 'expenditure',
 'energy',
 'transport',
 'energy',
 'ag

# Train gensim phraser

In [9]:
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

In [12]:
def generate_cleaned_data(file_list):

    for f in file_list:
        with open(os.path.join(f.replace(SOURCE_DIR, TARGET_DIR) + '.pickle'), 'rb') as ff:
            data = pickle.load(ff)
            yield(data['tokens'])



In [13]:
%%time

corpus_generator = generate_cleaned_data(f)
bigram = Phrases(corpus_generator, min_count=1)
bigram_phraser = Phraser(bigram)
del(bigram)

CPU times: user 13 s, sys: 651 ms, total: 13.7 s
Wall time: 14.4 s


In [14]:
%%time

corpus_generator = generate_cleaned_data(f)
trigram = Phrases(bigram_phraser[corpus_generator], min_count=1)
trigram_phraser = Phraser(trigram)
del(trigram)

CPU times: user 22.5 s, sys: 1.15 s, total: 23.7 s
Wall time: 24.2 s


# Compare gensim and custom phrasers

In [None]:
phrased_doc = trigram_phraser[data['tokens']]

In [23]:
common_phrases = list(set(data['phrases']).intersection(phrased_doc))
common_phrases[:10]

['financial_viability',
 'foreign_currency',
 'lpg_project',
 'fertility_decline',
 'universal_access',
 'incentive_structure',
 'immediate_impact',
 'educational_material',
 'great_care',
 'further_investigation']

In [15]:
# client.restart()



0,1
Client  Scheduler: tcp://127.0.0.1:34733  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB
