In [24]:
import pickle
import os
import spacy

import multiprocessing as mp
from functools import partial
from joblib import Parallel, delayed
from tqdm import tqdm
from spacy.util import minibatch
from spacy.attrs import LEMMA
from multiprocessing import Process
import time

# Load the small English pipeline with nothing disabled: the candidate entries
# "tagger", "parser" and "ner" are all left enabled.
nlp = spacy.load("en_core_web_sm", disable=[])

In [2]:
# Notebooks do not define __file__, so the base directory is the kernel's working
# directory (normalized to an absolute path); data paths are built relative to it.
current_path = os.path.abspath(os.curdir)
newsgroups_train_data_loc = f"{current_path}/../data/raw/20_newsgroups/train_data.pkl"
newsgroups_test_data_loc = f"{current_path}/../data/raw/20_newsgroups/test_data.pkl"

In [25]:
# Sanity check: show the noun chunks the pipeline finds in one example sentence.
demo_doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for ele in demo_doc.noun_chunks:
    print(ele)

Autonomous cars
insurance liability
manufacturers


In [4]:
def parallel_apply_list(a_list, a_function, n_jobs=mp.cpu_count(), func_param=None, n_threads=None, **kwargs):
    """
    Maps a_function over the elements of a_list with joblib and returns the results as a list.

    A process pool with n_jobs workers is used when n_jobs is truthy; otherwise a thread
    pool with n_threads workers is used. Each element is passed as the first positional
    argument of a_function, or as the keyword argument named by func_param when that is
    given. Any **kwargs are bound to a_function once and passed unchanged to every call.

    If an element of a_list bundles several inputs (tuple, list, ...), wrap a_function so
    that it accepts the whole bundle as one argument.

    Parameters
    ----------
    a_list : list
        Elements to process; iteration is wrapped in a tqdm progress bar.
    a_function : function object
        Callable applied to every element.
    n_jobs : int
        Number of worker processes (defaults to the CPU count read at definition time).
        A falsy value switches to threading.
    func_param : None or str
        Name of the a_function parameter that receives each element; None means the
        element is passed positionally.
    n_threads : int
        Number of worker threads, only used when n_jobs is falsy.
    kwargs : static keyword arguments given to every call of a_function

    Returns
    -------
    result : list
    """
    if n_jobs:
        pool_options = dict(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    else:
        pool_options = dict(n_jobs=n_threads, backend="threading", prefer="threads")
    runner = Parallel(**pool_options)

    # Bind the static keyword arguments once, then make the call lazy for joblib.
    lazy_call = delayed(partial(a_function, **kwargs))

    if func_param:
        job_stream = (lazy_call(**{func_param: item}) for item in tqdm(a_list))
    else:
        job_stream = (lazy_call(item) for item in tqdm(a_list))

    return runner(job_stream)

def spacy_norm(text):
    """
    Tokenizes text with the module-level nlp pipeline and returns the lower-cased token
    texts, without whitespace and punctuation tokens, joined by single spaces and
    terminated by a newline.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_space or token.is_punct:
            continue
        kept_tokens.append(token.text.lower())
    return ' '.join(kept_tokens) + '\n'
def process_save_norm(texts, data_name, data_type, current_path):
    """
    Applies spacy_norm to every text in parallel and appends the resulting lines to
    {current_path}/../data/processed/{data_name}/{data_type}_norm.txt.

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    norm_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_norm.txt"
    os.makedirs(os.path.dirname(norm_outpath), exist_ok=True)

    # Process first so a failure in the workers does not leave an empty file behind.
    results = parallel_apply_list(texts, spacy_norm)

    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(norm_outpath, 'a') as norm_file:
        norm_file.writelines(results)

def spacy_lemma(text):
    """
    Returns the lower-cased lemmas of text as one space-separated line ending in a
    newline. Whitespace and punctuation tokens are left out.
    """
    lemmas = (
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_space or token.is_punct)
    )
    return ' '.join(lemmas) + '\n'
def process_save_lemma(texts, data_name, data_type, current_path):
    """
    Applies spacy_lemma to every text in parallel and appends the resulting lines to
    {current_path}/../data/processed/{data_name}/{data_type}_lemma.txt.

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    lemma_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_lemma.txt"
    os.makedirs(os.path.dirname(lemma_outpath), exist_ok=True)

    # Process first so a failure in the workers does not leave an empty file behind.
    results = parallel_apply_list(texts, spacy_lemma)

    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(lemma_outpath, 'a') as lemma_file:
        lemma_file.writelines(results)

def spacy_no_stop(text):
    """
    Returns the lower-cased token texts of text as one space-separated line ending in a
    newline, skipping whitespace tokens, punctuation tokens and stop words.
    """
    def is_content(token):
        # Same checks, in the same order: space, then punctuation, then stop word.
        return not (token.is_space or token.is_punct or token.is_stop)

    words = [token.text.lower() for token in nlp(text) if is_content(token)]
    line = ' '.join(words)
    return line + '\n'
def process_save_no_stop(texts, data_name, data_type, current_path):
    """
    Applies spacy_no_stop to every text in parallel and appends the resulting lines to
    {current_path}/../data/processed/{data_name}/{data_type}_no_stop.txt.

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    no_stop_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_no_stop.txt"
    os.makedirs(os.path.dirname(no_stop_outpath), exist_ok=True)

    # Process first so a failure in the workers does not leave an empty file behind.
    results = parallel_apply_list(texts, spacy_no_stop)

    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(no_stop_outpath, 'a') as no_stop_file:
        no_stop_file.writelines(results)
    
def spacy_lemma_no_stop(text):
    """
    Returns the lower-cased lemmas of text as one space-separated line ending in a
    newline, skipping whitespace tokens, punctuation tokens and stop words.
    """
    pieces = []
    for token in nlp(text):
        # A token is dropped as soon as one of the three filters matches.
        if token.is_space or token.is_punct or token.is_stop:
            continue
        pieces.append(token.lemma_.lower())
    return ' '.join(pieces) + '\n'
def process_save_lemma_no_stop(texts, data_name, data_type, current_path):
    """
    Applies spacy_lemma_no_stop to every text in parallel and appends the resulting lines
    to {current_path}/../data/processed/{data_name}/{data_type}_lemma_no_stop.txt.

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    lemma_no_stop_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_lemma_no_stop.txt"
    os.makedirs(os.path.dirname(lemma_no_stop_outpath), exist_ok=True)

    # Process first so a failure in the workers does not leave an empty file behind.
    results = parallel_apply_list(texts, spacy_lemma_no_stop)

    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(lemma_no_stop_outpath, 'a') as lemma_no_stop_file:
        lemma_no_stop_file.writelines(results)

def spacy_np_lemma(text):
    """
    Lemmatizes text while collapsing multi-token noun phrases and multi-token named
    entities into single tokens whose lemma is the member lemmas joined by '_'.

    Newlines in text are replaced by spaces before parsing. Each noun chunk is trimmed
    from the left until it starts with an 'amod' or 'compound' token; what remains is
    merged when it still spans more than one token. Afterwards every entity spanning
    more than one token is merged as well.

    Parameters
    ----------
    text : str

    Returns
    -------
    tokenized_doc : list of str
        Lower-cased lemmas (spaces replaced by '_'), without whitespace and punctuation
        tokens and without empty strings.
    """
    doc = nlp(text.replace('\n', ' '))

    for chunk in doc.noun_chunks:
        # Drop leading tokens until the phrase starts with an adjectival modifier or a
        # compound element.
        while len(chunk) > 1 and chunk[0].dep_ not in ('amod', 'compound'):
            chunk = chunk[1:]
        if len(chunk) > 1:
            merged_lemma = '_'.join([ele.lemma_.lower() for ele in chunk])
            with doc.retokenize() as retokenizer:
                # strings.add registers the new lemma and returns the hash LEMMA expects.
                retokenizer.merge(chunk, attrs={LEMMA: doc.vocab.strings.add(merged_lemma)})

    # Entities are read only after the noun-phrase merges so that their spans match the
    # current tokenization. They are merged as entities; previously this loop sat inside
    # the noun-chunk loop and re-merged the noun chunk, so entities were never merged.
    multi_token_ents = [ent for ent in doc.ents if len(ent) > 1]
    if multi_token_ents:
        # Entity spans do not overlap each other, so one retokenize context is enough.
        with doc.retokenize() as retokenizer:
            for ent in multi_token_ents:
                merged_lemma = '_'.join([ele.lemma_.lower() for ele in ent])
                retokenizer.merge(ent, attrs={LEMMA: doc.vocab.strings.add(merged_lemma)})

    tokenized_doc = [ele.lemma_.lower().replace(' ', '_') for ele in doc if ((not ele.is_space) and (not ele.is_punct))]
    tokenized_doc = [ele for ele in tokenized_doc if ele]
    return tokenized_doc
def extract_noun_phrases_from_tok_list(a_list_of_toks):
    """
    Keeps only the tokens that contain an underscore, preserving their order.

    Parameters
    ----------
    a_list_of_toks : list of str

    Returns
    -------
    list of str
    """
    return list(filter(lambda tok: '_' in tok, a_list_of_toks))
def join_list_of_strings_add_newline(a_list_of_toks):
    """
    Joins the tokens with single spaces and terminates the line with a newline.

    Parameters
    ----------
    a_list_of_toks : list of str

    Returns
    -------
    str
    """
    return ''.join((' '.join(a_list_of_toks), '\n'))
def process_save_np_lemma(texts, data_name, data_type, current_path):
    """
    Applies spacy_np_lemma to every text in parallel and appends two files under
    {current_path}/../data/processed/{data_name}/:

    - {data_type}_np_lemma.txt : every token of each text, one text per line
    - {data_type}_np_lemma_only.txt : only the tokens containing '_', one text per line

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    np_lemma_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_np_lemma.txt"
    os.makedirs(os.path.dirname(np_lemma_outpath), exist_ok=True)
    results = parallel_apply_list(texts, spacy_np_lemma)
    result1 = parallel_apply_list(results, join_list_of_strings_add_newline)
    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(np_lemma_outpath, 'a') as np_lemma_file:
        np_lemma_file.writelines(result1)

    np_lemma_outpath_only = f"{current_path}/../data/processed/{data_name}/{data_type}_np_lemma_only.txt"
    os.makedirs(os.path.dirname(np_lemma_outpath_only), exist_ok=True)
    result2 = parallel_apply_list(results, extract_noun_phrases_from_tok_list)
    result2 = [' '.join(ele) + '\n' for ele in result2]
    with open(np_lemma_outpath_only, 'a') as np_lemma_only_file:
        np_lemma_only_file.writelines(result2)
    
def spacy_np_no_stop(text):
    """
    Tokenizes text while collapsing multi-token noun phrases and multi-token named
    entities into single tokens, then drops stop words.

    Newlines in text are replaced by spaces before parsing. Each noun chunk is trimmed
    from the left until it starts with an 'amod' or 'compound' token; what remains is
    merged when it still spans more than one token. Afterwards every entity spanning
    more than one token is merged as well. The merged token's lemma is set to the
    lower-cased member texts joined by '_'.

    Parameters
    ----------
    text : str

    Returns
    -------
    tokenized_doc : list of str
        Lower-cased, stripped token texts (inner spaces replaced by '_'), without stop
        words, whitespace tokens, punctuation tokens and empty strings.
    """
    doc = nlp(text.replace('\n', ' '))

    for chunk in doc.noun_chunks:
        # Drop leading tokens until the phrase starts with an adjectival modifier or a
        # compound element.
        while len(chunk) > 1 and chunk[0].dep_ not in ('amod', 'compound'):
            chunk = chunk[1:]
        if len(chunk) > 1:
            merged_text = '_'.join([ele.text.lower() for ele in chunk])
            with doc.retokenize() as retokenizer:
                # strings.add registers the new string and returns the hash LEMMA expects.
                retokenizer.merge(chunk, attrs={LEMMA: doc.vocab.strings.add(merged_text)})

    # Entities are read only after the noun-phrase merges so that their spans match the
    # current tokenization. They are merged as entities; previously this loop sat inside
    # the noun-chunk loop and re-merged the noun chunk, so entities were never merged.
    multi_token_ents = [ent for ent in doc.ents if len(ent) > 1]
    if multi_token_ents:
        # Entity spans do not overlap each other, so one retokenize context is enough.
        with doc.retokenize() as retokenizer:
            for ent in multi_token_ents:
                merged_text = '_'.join([ele.text.lower() for ele in ent])
                retokenizer.merge(ent, attrs={LEMMA: doc.vocab.strings.add(merged_text)})

    tokenized_doc = [ele.text.lower().strip().replace(' ', '_') for ele in doc if ((not ele.is_stop) and (not ele.is_space) and (not ele.is_punct))]
    tokenized_doc = [ele for ele in tokenized_doc if ele]
    return tokenized_doc
def process_save_np_no_stop(texts, data_name, data_type, current_path):
    """
    Applies spacy_np_no_stop to every text in parallel and appends two files under
    {current_path}/../data/processed/{data_name}/:

    - {data_type}_np_no_stop.txt : every token of each text, one text per line
    - {data_type}_np_no_stop_only.txt : only the tokens containing '_', one text per line

    The elapsed time of the tokenization, joining and first write steps is printed.
    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    np_no_stop_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_np_no_stop.txt"
    os.makedirs(os.path.dirname(np_no_stop_outpath), exist_ok=True)

    start_time = time.time()
    results = parallel_apply_list(texts, spacy_np_no_stop)
    print(f"        Finished spacy_np_no_stop, taking {int(time.time() - start_time)} seconds...")

    start_time = time.time()
    result1 = parallel_apply_list(results, join_list_of_strings_add_newline)
    print(f"        Finished join_list_of_strings_add_newline, taking {int(time.time() - start_time)} seconds...")

    start_time = time.time()
    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(np_no_stop_outpath, 'a') as np_no_stop_file:
        np_no_stop_file.writelines(result1)
    print(f"        Finished writing to file, taking {int(time.time() - start_time)} seconds...")

    np_no_stop_outpath_only = f"{current_path}/../data/processed/{data_name}/{data_type}_np_no_stop_only.txt"
    os.makedirs(os.path.dirname(np_no_stop_outpath_only), exist_ok=True)
    result2 = parallel_apply_list(results, extract_noun_phrases_from_tok_list)
    result2 = [' '.join(ele) + '\n' for ele in result2]
    with open(np_no_stop_outpath_only, 'a') as np_no_stop_only_file:
        np_no_stop_only_file.writelines(result2)
     
def spacy_np_lemma_no_stop(text):
    """
    Lemmatizes text while collapsing multi-token noun phrases and multi-token named
    entities into single tokens, drops stop words, and returns one output line.

    Newlines in text are replaced by spaces before parsing. Each noun chunk is trimmed
    from the left until it starts with an 'amod' or 'compound' token; what remains is
    merged when it still spans more than one token. Afterwards every entity spanning
    more than one token is merged as well. The merged token's lemma is the member
    lemmas, lower-cased and joined by '_'.

    Parameters
    ----------
    text : str

    Returns
    -------
    tokenized_doc : str
        Lower-cased lemmas (spaces replaced by '_') without stop words, whitespace
        tokens, punctuation tokens and empty strings, joined by single spaces and
        terminated by a newline.
    """
    doc = nlp(text.replace('\n', ' '))

    for chunk in doc.noun_chunks:
        # Drop leading tokens until the phrase starts with an adjectival modifier or a
        # compound element.
        while len(chunk) > 1 and chunk[0].dep_ not in ('amod', 'compound'):
            chunk = chunk[1:]
        if len(chunk) > 1:
            merged_lemma = '_'.join([ele.lemma_.lower() for ele in chunk])
            with doc.retokenize() as retokenizer:
                # strings.add registers the new lemma and returns the hash LEMMA expects.
                retokenizer.merge(chunk, attrs={LEMMA: doc.vocab.strings.add(merged_lemma)})

    # Entities are read only after the noun-phrase merges so that their spans match the
    # current tokenization. They are merged as entities; previously this loop sat inside
    # the noun-chunk loop and re-merged the noun chunk, so entities were never merged.
    multi_token_ents = [ent for ent in doc.ents if len(ent) > 1]
    if multi_token_ents:
        # Entity spans do not overlap each other, so one retokenize context is enough.
        with doc.retokenize() as retokenizer:
            for ent in multi_token_ents:
                merged_lemma = '_'.join([ele.lemma_.lower() for ele in ent])
                retokenizer.merge(ent, attrs={LEMMA: doc.vocab.strings.add(merged_lemma)})

    tokenized_doc = [ele.lemma_.lower().replace(' ', '_') for ele in doc if ((not ele.is_stop) and (not ele.is_space) and (not ele.is_punct))]
    tokenized_doc = [ele for ele in tokenized_doc if ele]
    tokenized_doc = ' '.join(tokenized_doc) + '\n'
    return tokenized_doc
def process_save_np_lemma_no_stop(texts, data_name, data_type, current_path):
    """
    Applies spacy_np_lemma_no_stop to every text in parallel and appends the resulting
    lines to {current_path}/../data/processed/{data_name}/{data_type}_np_lemma_no_stop.txt.

    The output directory is created when missing. Lines are always appended, so calling
    this again with the same arguments adds the lines a second time.

    Parameters
    ----------
    texts : list of str
    data_name : str (sub-directory of data/processed)
    data_type : str (file name prefix, e.g. 'train' or 'test')
    current_path : str (base directory the relative output path starts from)
    """
    np_lemma_no_stop_outpath = f"{current_path}/../data/processed/{data_name}/{data_type}_np_lemma_no_stop.txt"
    os.makedirs(os.path.dirname(np_lemma_no_stop_outpath), exist_ok=True)

    # Process first so a failure in the workers does not leave an empty file behind.
    results = parallel_apply_list(texts, spacy_np_lemma_no_stop)

    # 'a' creates the file when it does not exist yet and appends otherwise; the
    # context manager closes the handle even if writing fails.
    with open(np_lemma_no_stop_outpath, 'a') as np_lemma_no_stop_file:
        np_lemma_no_stop_file.writelines(results)
    
    
def fully_process_partitions(partition, data_name='20_newgroups', data_type='train', current_path=current_path):
    """
    Runs all seven preprocessing variants over one partition of texts, in a fixed order,
    printing a progress message before each variant.

    Parameters
    ----------
    partition : list of str
        Texts handed to every process_save_* function as `texts`.
    data_name : str
        Forwarded unchanged to every process_save_* function.
    data_type : str
        Forwarded unchanged to every process_save_* function.
    current_path : str
        Forwarded unchanged; defaults to the module-level current_path as it was when
        this function was defined.
    """
    # (progress message, saver) pairs, listed in execution order.
    stages = (
        ("    Processing norm...", process_save_norm),
        ("    Processing lemma...", process_save_lemma),
        ("    Processing no_stop...", process_save_no_stop),
        ("    Processing lemma_no_stop...", process_save_lemma_no_stop),
        ("    Processing np_lemma...", process_save_np_lemma),
        ("    Processing np_no_stop...", process_save_np_no_stop),
        ("    Processing np_lemma_no_stop...", process_save_np_lemma_no_stop),
    )
    for message, save_stage in stages:
        print(message)
        save_stage(texts=partition,
                   data_name=data_name,
                   data_type=data_type,
                   current_path=current_path)

In [3]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only point
# these paths at files from a trusted source.
with open(newsgroups_train_data_loc, "rb") as train_file:
    newgroups_train_data = pickle.load(train_file)
with open(newsgroups_test_data_loc, "rb") as test_file:
    newgroups_test_data = pickle.load(test_file)

# Cap every text at 999998 characters (presumably to stay below spaCy's default
# nlp.max_length of 1,000,000 -- confirm). Slicing leaves shorter texts unchanged.
# The previous condition was inverted and let over-long texts through untouched.
newgroups_train_data = [ele[:999998] for ele in newgroups_train_data]
newgroups_test_data = [ele[:999998] for ele in newgroups_test_data]

In [5]:
# Work through the training texts in batches of 5000 so that each preprocessing
# variant is computed and written for one batch at a time.
partitions = minibatch(newgroups_train_data, size=5000)

for i, partition in enumerate(partitions):
    print(f"Starting partition {i}")
    fully_process_partitions(
        partition=partition,
        data_name='20_newgroups',
        data_type='train',
    )

  0%|          | 0/5000 [00:00<?, ?it/s]

Starting partition 0
    Processing norm...


100%|██████████| 5000/5000 [00:32<00:00, 153.90it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma...


100%|██████████| 5000/5000 [00:34<00:00, 143.32it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing no_stop...


100%|██████████| 5000/5000 [00:37<00:00, 133.59it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma_no_stop...


100%|██████████| 5000/5000 [00:34<00:00, 143.95it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma...


100%|██████████| 5000/5000 [01:10<00:00, 70.61it/s] 
100%|██████████| 5000/5000 [00:00<00:00, 15535.58it/s]
100%|██████████| 5000/5000 [00:00<00:00, 15530.80it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_no_stop...


100%|██████████| 5000/5000 [01:02<00:00, 80.60it/s] 
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished spacy_np_no_stop, taking 102 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 21740.28it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished join_list_of_strings_add_newline, taking 0 seconds...
        Finished writing to file, taking 0 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 21880.78it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma_no_stop...


100%|██████████| 5000/5000 [01:02<00:00, 79.89it/s] 
  0%|          | 0/5000 [00:00<?, ?it/s]

Starting partition 1
    Processing norm...


100%|██████████| 5000/5000 [00:36<00:00, 137.12it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma...


100%|██████████| 5000/5000 [00:42<00:00, 117.18it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing no_stop...


100%|██████████| 5000/5000 [00:43<00:00, 114.80it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma_no_stop...


100%|██████████| 5000/5000 [00:41<00:00, 120.90it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma...


100%|██████████| 5000/5000 [01:19<00:00, 62.55it/s] 
100%|██████████| 5000/5000 [00:00<00:00, 17527.43it/s]
100%|██████████| 5000/5000 [00:00<00:00, 18944.50it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_no_stop...


100%|██████████| 5000/5000 [01:15<00:00, 65.99it/s] 
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished spacy_np_no_stop, taking 117 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 22615.73it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished join_list_of_strings_add_newline, taking 0 seconds...
        Finished writing to file, taking 0 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 23769.12it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma_no_stop...


100%|██████████| 5000/5000 [01:16<00:00, 65.70it/s] 
  0%|          | 0/1314 [00:00<?, ?it/s]

Starting partition 2
    Processing norm...


100%|██████████| 1314/1314 [00:08<00:00, 147.81it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing lemma...


100%|██████████| 1314/1314 [00:09<00:00, 131.54it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing no_stop...


100%|██████████| 1314/1314 [00:10<00:00, 123.98it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing lemma_no_stop...


100%|██████████| 1314/1314 [00:10<00:00, 121.71it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing np_lemma...


100%|██████████| 1314/1314 [00:10<00:00, 125.28it/s]
100%|██████████| 1314/1314 [00:00<00:00, 5881.07it/s]
100%|██████████| 1314/1314 [00:00<00:00, 6498.45it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing np_no_stop...


100%|██████████| 1314/1314 [00:10<00:00, 121.99it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

        Finished spacy_np_no_stop, taking 12 seconds...


100%|██████████| 1314/1314 [00:00<00:00, 7164.66it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

        Finished join_list_of_strings_add_newline, taking 0 seconds...
        Finished writing to file, taking 0 seconds...


100%|██████████| 1314/1314 [00:00<00:00, 7638.07it/s]
  0%|          | 0/1314 [00:00<?, ?it/s]

    Processing np_lemma_no_stop...


100%|██████████| 1314/1314 [00:10<00:00, 126.29it/s]


In [6]:
# Work through the test texts in batches of 5000 so that each preprocessing
# variant is computed and written for one batch at a time.
partitions = minibatch(newgroups_test_data, size=5000)

for i, partition in enumerate(partitions):
    print(f"Starting partition {i}")
    fully_process_partitions(
        partition=partition,
        data_name='20_newgroups',
        data_type='test',
    )

  0%|          | 0/5000 [00:00<?, ?it/s]

Starting partition 0
    Processing norm...


100%|██████████| 5000/5000 [00:38<00:00, 129.76it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma...


100%|██████████| 5000/5000 [00:40<00:00, 124.91it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing no_stop...


100%|██████████| 5000/5000 [00:39<00:00, 127.01it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing lemma_no_stop...


100%|██████████| 5000/5000 [00:40<00:00, 123.45it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma...


100%|██████████| 5000/5000 [01:14<00:00, 67.55it/s] 
100%|██████████| 5000/5000 [00:00<00:00, 17233.36it/s]
100%|██████████| 5000/5000 [00:00<00:00, 19073.98it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_no_stop...


100%|██████████| 5000/5000 [01:10<00:00, 70.82it/s] 
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished spacy_np_no_stop, taking 90 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 23343.55it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

        Finished join_list_of_strings_add_newline, taking 0 seconds...
        Finished writing to file, taking 0 seconds...


100%|██████████| 5000/5000 [00:00<00:00, 23624.13it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

    Processing np_lemma_no_stop...


100%|██████████| 5000/5000 [01:08<00:00, 72.60it/s] 
  0%|          | 0/2532 [00:00<?, ?it/s]

Starting partition 1
    Processing norm...


100%|██████████| 2532/2532 [00:17<00:00, 148.38it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing lemma...


100%|██████████| 2532/2532 [00:20<00:00, 122.11it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing no_stop...


100%|██████████| 2532/2532 [00:20<00:00, 122.17it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing lemma_no_stop...


100%|██████████| 2532/2532 [00:21<00:00, 116.97it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing np_lemma...


100%|██████████| 2532/2532 [00:24<00:00, 101.84it/s]
100%|██████████| 2532/2532 [00:00<00:00, 12612.46it/s]
100%|██████████| 2532/2532 [00:00<00:00, 12728.92it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing np_no_stop...


100%|██████████| 2532/2532 [00:19<00:00, 129.93it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

        Finished spacy_np_no_stop, taking 40 seconds...


100%|██████████| 2532/2532 [00:00<00:00, 13625.51it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

        Finished join_list_of_strings_add_newline, taking 0 seconds...
        Finished writing to file, taking 0 seconds...


100%|██████████| 2532/2532 [00:00<00:00, 14923.10it/s]
  0%|          | 0/2532 [00:00<?, ?it/s]

    Processing np_lemma_no_stop...


100%|██████████| 2532/2532 [00:19<00:00, 133.07it/s]
