In [1]:
import dask
import dask.bag as db
import dask.dataframe as dd
from dask.distributed import get_worker
from dask.distributed import Client, LocalCluster, progress


In [2]:
# cluster.close()

In [3]:
# client = Client(processes=True)
cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='4GB', processes=True)
client = Client(cluster)
# client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:44403  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.00 GB


In [4]:
# client.close()
# cluster.close()

In [5]:
import glob
import pickle
import spacy
import pandas as pd

from wb_nlp import dir_manager
from wb_nlp.cleaning import cleaner

In [6]:
SOURCE_DIR = dir_manager.get_data_dir('raw', 'sample_data', 'TXT_SAMPLE')
TARGET_DIR = dir_manager.get_data_dir('preprocessed', 'sample_data', 'TXT_SAMPLE')

# Create TARGET_DIR if not available
if not os.path.isdir(TARGET_DIR):
    os.makedirs(TARGET_DIR)

f = [f for f in glob.glob(os.path.join(SOURCE_DIR, '*.txt'))]
len(f), f[0].replace(SOURCE_DIR, TARGET_DIR)

(107, '/workspace/data/preprocessed/sample_data/TXT_SAMPLE/wb_13720575.txt')

In [7]:
def dask_clean_file(f):

    worker = get_worker()

    try:
        nlp = worker.nlp
        lda_cleaner = worker.lda_cleaner
    except AttributeError:
        nlp = spacy.load('en_core_web_sm')
        lda_cleaner = cleaner.LDACleaner()
        worker.nlp = nlp
        worker.lda_cleaner = lda_cleaner

    with open(f, 'rb') as fl:
        text = fl.read().decode('utf-8', errors='ignore')
        tokens = lda_cleaner.get_tokens_and_phrases(text)

        f = f.replace(SOURCE_DIR, TARGET_DIR)

        with open(f + '.pickle', 'wb') as ff:
            pickle.dump(tokens, ff)

    return True

In [8]:
b = db.from_sequence(f, npartitions=10)
file_reader = b.map(dask_clean_file)

In [10]:
%%time
res = file_reader.all().compute()
# res = file_reader.compute()

CPU times: user 35 s, sys: 23 s, total: 58 s
Wall time: 6min 35s


In [11]:
with open(os.path.join(TARGET_DIR, 'wb_725385.txt.pickle'), 'rb') as ff:
    data = pickle.load(ff)

In [12]:
len(data['tokens']), len(data['phrases'])

(45512, 13294)

In [13]:
data['tokens']

['public',
 'resource',
 'structural',
 'adjustment',
 'volume',
 'volume',
 'recent',
 'economic',
 'development',
 'term',
 'outlook',
 'public',
 'resource',
 'management',
 'part',
 'sectoral',
 'adjustment',
 'policy',
 'issue',
 'investment',
 'program',
 'country',
 'official',
 'use',
 'only',
 'document',
 'document',
 'distribution',
 'recipient',
 'performance',
 'official',
 'duty',
 'content',
 'authorization',
 'currency',
 'equivalent',
 'baht',
 'baht',
 'fiscal',
 'year',
 'weight',
 'measure',
 'barrel',
 'day',
 'mmscfd',
 'day',
 'tcf',
 'metric',
 'barrel',
 'approx',
 'ton',
 'fuel',
 'oil',
 'british',
 'thermal',
 'unit',
 'equal',
 'toe',
 'ton',
 'oil',
 'equivalent',
 'kcal',
 'btus',
 'kwh',
 'primary',
 'electricity',
 'thermal',
 'replacement',
 'value',
 'natural',
 'gas',
 'lignite',
 'acre',
 'ofiicial',
 'use',
 'only',
 'report',
 'mission',
 'mission',
 'foreign',
 'public',
 'debt',
 'public',
 'expenditure',
 'energy',
 'transport',
 'energy',
 'ag

# Train gensim phraser

In [9]:
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

In [12]:
def generate_cleaned_data(file_list):

    for f in file_list:
        with open(os.path.join(f.replace(SOURCE_DIR, TARGET_DIR) + '.pickle'), 'rb') as ff:
            data = pickle.load(ff)
            yield(data['tokens'])



In [13]:
%%time

corpus_generator = generate_cleaned_data(f)
bigram = Phrases(corpus_generator, min_count=1)
bigram_phraser = Phraser(bigram)
del(bigram)

CPU times: user 13 s, sys: 651 ms, total: 13.7 s
Wall time: 14.4 s


In [14]:
%%time

corpus_generator = generate_cleaned_data(f)
trigram = Phrases(bigram_phraser[corpus_generator], min_count=1)
trigram_phraser = Phraser(trigram)
del(trigram)

CPU times: user 22.5 s, sys: 1.15 s, total: 23.7 s
Wall time: 24.2 s


# Compare gensim and custom phrasers

In [None]:
phrased_doc = trigram_phraser[data['tokens']]

In [23]:
common_phrases = list(set(data['phrases']).intersection(phrased_doc))
common_phrases[:10]

['financial_viability',
 'foreign_currency',
 'lpg_project',
 'fertility_decline',
 'universal_access',
 'incentive_structure',
 'immediate_impact',
 'educational_material',
 'great_care',
 'further_investigation']

In [15]:
# client.restart()



0,1
Client  Scheduler: tcp://127.0.0.1:34733  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB
