In [1]:
import glob
import os
import pickle

import pandas as pd
import spacy

from wb_nlp import dir_manager
from wb_nlp.cleaning import cleaner

In [2]:
# Raw sample texts are read from SOURCE_DIR; TARGET_DIR is the corresponding
# 'preprocessed' directory resolved by the same helper.
SOURCE_DIR = dir_manager.get_data_dir('raw', 'sample_data', 'TXT_SAMPLE')
TARGET_DIR = dir_manager.get_data_dir('preprocessed', 'sample_data', 'TXT_SAMPLE')

# Create TARGET_DIR if not available. exist_ok=True replaces the separate
# isdir() check, so there is no window between checking and creating.
os.makedirs(TARGET_DIR, exist_ok=True)

# Paths of every .txt file directly inside SOURCE_DIR; later cells read `f`.
f = glob.glob(os.path.join(SOURCE_DIR, '*.txt'))

# Display the number of files found and the first path re-rooted under TARGET_DIR.
len(f), f[0].replace(SOURCE_DIR, TARGET_DIR)

(107, '/workspace/data/preprocessed/sample_data/TXT_SAMPLE/wb_13720575.txt')

In [3]:
from joblib import Parallel, delayed
from collections import Counter
import logging
import time
import json

# Tab-separated record layout: timestamp, source file, elapsed time, message.
# `source_file` and `elapsed_time` must be supplied via `extra=` on each log call.
FORMAT = '%(asctime)-15s\t%(source_file)s\t%(elapsed_time)-8s\t%(message)s'
logging.basicConfig(
    level=logging.INFO,
    filename='./adverbs_log_file.log',
    format=FORMAT,
)

# The NER and the parser are disabled when loading the pipeline, which is why
# it is fine to raise max_length this far.
nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])
nlp.max_length = 10 ** 9


def joblib_get_adverbs(f):
    """Count the adverb lemmas in one text file and write them to the log.

    The file is read as bytes and decoded as UTF-8 (undecodable bytes are
    dropped), lower-cased, and passed through the module-level ``nlp``
    pipeline. Lemmas of tokens whose ``pos_`` is ``"ADV"`` are counted and
    logged as a JSON object through the ``corpus_adverbs`` logger, with
    ``source_file`` and ``elapsed_time`` attached as extra record fields.

    Args:
        f: Path of the text file to process.

    Returns:
        Always ``True``; the counts themselves are only available in the log.

    NOTE(review): when this runs in several worker processes that share one
    log file, records may interleave -- confirm whether that matters here.
    """
    logger = logging.getLogger('corpus_adverbs')
    # NOTSET makes this logger defer to its parent's effective level.
    logger.setLevel(logging.NOTSET)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()

    with open(f, "rb") as fl:
        text = fl.read().decode("utf-8", errors="ignore")

    doc = nlp(text.lower())

    # Feed the counter from a generator; no intermediate list is needed.
    adv = Counter(token.lemma_ for token in doc if token.pos_ == "ADV")
    elapsed = time.perf_counter() - start

    payload = dict(
        # basename handles the platform's path separator(s), unlike split('/').
        source_file=os.path.basename(f),
        elapsed_time=elapsed,
    )

    logger.info(json.dumps(adv), extra=payload)

    return True

In [4]:
%%time

with Parallel(n_jobs=4, backend='multiprocessing') as parallel:
    res = parallel(delayed(joblib_get_adverbs)(i) for i in f[:50])

# res = set([i for i in itertools.chain.from_iterable(res)])
len(res)

CPU times: user 149 ms, sys: 74.2 ms, total: 223 ms
Wall time: 30.1 s


50

# Process log file

In [5]:
# Read the entire adverbs log into memory as one string.
with open('./adverbs_log_file.log') as log_file:
    log = log_file.read()

In [6]:
# The last tab-separated field of each non-empty log line is the JSON-encoded
# adverb counter; decode one dict per line.
adverbs = [
    json.loads(line.rsplit('\t', 1)[-1])
    for line in log.split('\n')
    if line
]

In [7]:
# Inspect the adverb counts decoded from the first non-empty log line.
adverbs[0]

{'entirely': 2,
 'directly': 4,
 'previously': 2,
 'exactly': 1,
 'well': 26,
 'that': 2,
 'is': 2,
 'as': 18,
 'more': 41,
 'easily': 3,
 'effectively': 9,
 'also': 35,
 'how': 7,
 'why': 5,
 'increasingly': 3,
 'clearly': 2,
 'currently': 5,
 'here': 1,
 'just': 1,
 'greatly': 2,
 'particularly': 11,
 'significantly': 3,
 'gratefully': 1,
 'exclusively': 1,
 'implicitly': 1,
 'intimately': 1,
 'fairly': 1,
 'conversely': 1,
 'improperly': 1,
 'most': 10,
 'rarely': 8,
 'recently': 2,
 'furthermore': 4,
 'below': 1,
 'e.g.': 13,
 'equitably': 2,
 'twice': 2,
 'about': 4,
 'dramatically': 4,
 'then': 6,
 'once': 2,
 'again': 2,
 'near': 3,
 'medically': 2,
 'overall': 1,
 'slightly': 2,
 'at': 1,
 'least': 1,
 'nevertheless': 2,
 'however': 24,
 'even': 12,
 'when': 20,
 'only': 13,
 'very': 6,
 'where': 15,
 'moderately': 8,
 'severely': 8,
 'almost': 3,
 'thus': 8,
 'additionally': 1,
 'now': 3,
 'economically': 1,
 'possibly': 1,
 'less': 9,
 'high': 1,
 'aggressively': 2,
 'first':