In [1]:
%%capture

%run ../path_manager.ipynb

In [2]:
import glob
import os
import pickle
import traceback
from collections import Counter
from datetime import datetime
from multiprocessing import Pool

import numpy as np
import pandas as pd
from gensim.models.phrases import Phrases, Phraser

In [3]:
def notify(fname, message):
    """Append a timestamped line to a log file and echo the same line to stdout.

    Args:
        fname: Path of the log file; it is opened in append mode, so it is
            created when missing and never truncated.
        message: Any object that can be formatted into a string (plain text,
            an exception instance, ...).
    """
    stamped = '{}: {}'.format(datetime.now(), message)

    # print() appends the trailing newline for us when writing to the file.
    with open(fname, 'a+') as log:
        print(stamped, file=log)

    print(stamped)

In [4]:
# Module-level cache: file path -> whitespace-split tokens read from that file.
# It lets repeated passes over the same files skip the disk read.
file_map = {}


def file_generator(cleaned_files):
    """Lazily yield the token list of every file in `cleaned_files`, in order.

    Each file is read once and split on whitespace; the resulting list is
    stored in the module-level `file_map` and the cached list object itself is
    yielded on every later request for the same path.

    A progress line is written through `notify` after every 1000 documents.
    """
    for position, path in enumerate(cleaned_files):
        # Heartbeat at 1000, 2000, ... (never for the very first document).
        if position > 0 and position % 1000 == 0:
            notify('run_log.txt', f'Processed {position} documents...')

        tokens = file_map.get(path)
        if tokens is None:
            with open(path) as handle:
                tokens = handle.read().split()
            file_map[path] = tokens

        yield tokens


def phraser_generator(cleaned_files, phrasers=None):
    """Yield each document's tokens after passing them through `phrasers`.

    Args:
        cleaned_files: Iterable of file paths, forwarded to `file_generator`.
        phrasers: Optional list of objects applied by indexing
            (`phraser[tokens]`), in list order, each one receiving the output
            of the previous one. `None` means "apply nothing".

    Yields:
        One transformed token sequence per input file.
    """
    transforms = [] if phrasers is None else phrasers

    for tokens in file_generator(cleaned_files=cleaned_files):
        for transform in transforms:
            tokens = transform[tokens]
        yield tokens


def train_phrasers(cleaned_files, threshold=0.5, scoring='npmi', max_ngram=3):
    '''
    Train a chain of gensim phrase models over `cleaned_files`.

    One `Phrases` model is trained per pass. Every pass reads the documents
    through `phraser_generator` with all phrasers trained so far, so each new
    model learns to join tokens that earlier passes have already merged.

    Empirical test shows that npmi with threshold of 0.5 works better for batches of 10k-15k.

    Args:
        cleaned_files: Iterable of paths to the cleaned text files.
        threshold: Passed to `Phrases(threshold=...)` on every pass.
        scoring: Passed to `Phrases(scoring=...)` on every pass.
        max_ngram: Highest pass level to train. The default of 3 runs the
            "bigrams" and "trigrams" passes; 4 or 5 add further passes with
            the same `threshold`/`scoring`. Values below 2 train nothing.

    Returns:
        List of `Phraser` objects in training order (empty if `max_ngram < 2`).
    '''
    # Labels keep the log messages of the first two passes unchanged.
    level_names = {2: 'bigrams', 3: 'trigrams'}
    phrasers = []

    for level in range(2, max_ngram + 1):
        label = level_names.get(level, f'{level}-grams')
        notify('run_log.txt', f'Start generating {label}...')

        # The generator is fully consumed by Phrases() before the new phraser
        # is appended, so this pass only sees phrasers from earlier passes.
        sentences = phraser_generator(cleaned_files=cleaned_files, phrasers=phrasers)
        phrases = Phrases(sentences, threshold=threshold, scoring=scoring)  # train model
        phrasers.append(Phraser(phrases))

    return phrasers


def collect_ngrams(cleaned_files, phrasers, filename=None):
    """Count, per n-gram, the number of documents in which it appears.

    A token is treated as an n-gram when it contains an underscore.
    NOTE(review): this presumably relies on the phrasers joining words with
    '_' and on the cleaned text not containing underscores itself - confirm.

    Args:
        cleaned_files: Iterable of paths to the cleaned text files.
        phrasers: Phrasers applied to every document via `phraser_generator`.
        filename: Optional CSV path; when given, the table is also written
            there without the index.

    Returns:
        DataFrame with columns `ngram` and `occ`, most frequent first.
    """
    doc_counts = Counter()

    documents = phraser_generator(cleaned_files=cleaned_files, phrasers=phrasers)
    for tokens in documents:
        # De-duplicate inside a document so it adds at most 1 per n-gram.
        doc_counts.update(set(tok for tok in tokens if '_' in tok))

    ngram_freq = pd.DataFrame(doc_counts.most_common(), columns=['ngram', 'occ'])

    if filename is None:
        return ngram_freq

    ngram_freq.to_csv(filename, index=False)
    return ngram_freq

In [None]:
# CORPUS_ID = 'WB'
# NUM_SAMPLES = 10000
# all_cleaned_files = glob.glob(os.path.join(get_txt_clean_path(CORPUS_ID), '*.txt'))
# phrasers_map = {}

# all_ngram_freqs = []

# for ix in range(2):  # 30):
#     print(f"Starting with sample {ix + 1}...")
    
#     np.random.seed(ix + 1)
#     cleaned_files = np.random.choice(all_cleaned_files, NUM_SAMPLES, replace=False)
#     _phrasers = train_phrasers(cleaned_files)
#     phrasers_map[ix] = _phrasers
    
#     ngram_freq = collect_ngrams(
#         cleaned_files, _phrasers,
#         filename=os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_{ix}.csv')
#     )
    
#     all_ngram_freqs.append(ngram_freq.set_index('ngram'))
    
# all_ngram_freqs_df = pd.concat(all_ngram_freqs, axis=1).sum(axis=1)

# all_ngram_freqs_df.index.name = 'ngram'
# all_ngram_freqs_df.name = 'occ'
# all_ngram_freqs_df = all_ngram_freqs_df.reset_index()

# all_ngram_freqs_df = all_ngram_freqs_df.sort_values('occ', ascending=False).reset_index(drop='index')
# # all_ngram_freqs_df.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_all.csv'), index=False)

In [None]:
# Pipeline cell: for every corpus, split its cleaned files into random
# partitions, train phrasers per partition, count n-gram document frequencies,
# sum them over partitions, drop rare n-grams and finally merge all corpora
# into ../whitelists/truncated_ngrams.csv.
THRESHOLD = 0.2
SCORING = 'npmi'
RANDOM_SEED = 1  # Fixed so the shuffle (and therefore the partitions) is reproducible.

PHRASER_ID = f'{SCORING}{THRESHOLD}'
try:
    # Corpus id -> target number of documents per partition.
    CORPUS_ID_SAMPLES = {
        'IMF': 20000,
        'WB': 20000,
    }

    all_truncated_ngrams = []

    np.random.seed(RANDOM_SEED)

    for CORPUS_ID, NUM_SAMPLES in CORPUS_ID_SAMPLES.items():
        notify('run_log.txt', f'Generating ngrams for {CORPUS_ID}...')
        # glob order depends on the filesystem; sort first so that the seeded
        # shuffle below yields the same partitions on every run.
        all_cleaned_files = sorted(glob.glob(os.path.join(get_txt_clean_path(CORPUS_ID), '*.txt')))
        freq_cutoff = min(500, 0.01 * len(all_cleaned_files))  # Cutoff set to minimize noise from ngrams of rare occurrence.
        print(f'freq_cutoff: {freq_cutoff}')

        np.random.shuffle(all_cleaned_files)
        cleaned_files_partition = np.array_split(all_cleaned_files, max(len(all_cleaned_files) // NUM_SAMPLES, 1)) # if len(all_cleaned_files) is less than twice NUM_SAMPLES, it will return a single list containing all the files.

        phrasers_map = {}
        all_ngram_freqs = []

        for ix, cleaned_files in enumerate(cleaned_files_partition):
            notify('run_log.txt', f"Starting with sample {ix + 1}...")

            _phrasers = train_phrasers(cleaned_files, threshold=THRESHOLD, scoring=SCORING)
            phrasers_map[ix] = _phrasers

            ngram_freq = collect_ngrams(
                cleaned_files, _phrasers,
                filename=os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_{ix}.csv')
            )

            all_ngram_freqs.append(ngram_freq.set_index('ngram'))

            # Partitions are disjoint, so the cached tokens of this partition
            # are never read again; release them instead of keeping the whole
            # corpus in memory.
            file_map.clear()

        notify('run_log.txt', f'Concatenating {len(all_ngram_freqs)} partitions...')
        # Align the per-partition 'occ' columns on the n-gram index and add
        # them up; n-grams missing from a partition contribute nothing.
        all_ngram_freqs_df = pd.concat(all_ngram_freqs, axis=1).sum(axis=1)

        all_ngram_freqs_df.index.name = 'ngram'
        all_ngram_freqs_df.name = 'occ'
        all_ngram_freqs_df = all_ngram_freqs_df.reset_index()

        all_ngram_freqs_df = all_ngram_freqs_df.sort_values('occ', ascending=False).reset_index(drop=True)

        notify('run_log.txt', f'Saving all {all_ngram_freqs_df.shape[0]} ngrams...')
        all_ngram_freqs_df.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_all-{PHRASER_ID}-{NUM_SAMPLES}.csv'), index=False)

        notify('run_log.txt', f'Truncating ngrams to minimum of {freq_cutoff} occurrences...')
        truncated_ngram_freqs_df = all_ngram_freqs_df[all_ngram_freqs_df['occ'] >= freq_cutoff].copy()

        notify('run_log.txt', f'Saving all {truncated_ngram_freqs_df.shape[0]} truncated ngrams...')
        truncated_ngram_freqs_df.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_truncated-{PHRASER_ID}-{NUM_SAMPLES}.csv'), index=False)

        all_truncated_ngrams.append(truncated_ngram_freqs_df)

    notify('run_log.txt', f'Combining all {len(all_truncated_ngrams)} corpus ngrams...')
    all_truncated_ngrams_df = pd.concat(all_truncated_ngrams, axis=0)
    # The same n-gram can survive the cutoff in several corpora: add them up.
    all_truncated_ngrams_df = all_truncated_ngrams_df.groupby('ngram')['occ'].sum().reset_index().sort_values('occ', ascending=False)

    notify('run_log.txt', f'Saving combined truncated ngrams...')
    all_truncated_ngrams_df.to_csv('../whitelists/truncated_ngrams.csv', index=False)
    notify('run_success.txt', 'completed successfully!')

except Exception as e:
    # Keep the failure marker file, but record the full traceback and re-raise
    # so that later cells do not silently continue with stale output files.
    notify('run_failed.txt', f'{e}\n{traceback.format_exc()}')
    raise

2019-11-01 23:15:04.879418: Generating ngrams for IMF...
freq_cutoff: 131.68
2019-11-01 23:15:04.913399: Starting with sample 1...
2019-11-01 23:15:04.913669: Start generating bigrams...
2019-11-01 23:15:17.988119: Processed 1000 documents...
2019-11-01 23:15:32.398013: Processed 2000 documents...
2019-11-01 23:15:46.023481: Processed 3000 documents...
2019-11-01 23:15:59.407703: Processed 4000 documents...
2019-11-01 23:16:12.655955: Processed 5000 documents...
2019-11-01 23:16:26.805160: Processed 6000 documents...
2019-11-01 23:16:40.996419: Processed 7000 documents...
2019-11-01 23:16:56.845262: Processed 8000 documents...
2019-11-01 23:17:10.408503: Processed 9000 documents...
2019-11-01 23:17:24.174271: Processed 10000 documents...
2019-11-01 23:17:38.544196: Processed 11000 documents...
2019-11-01 23:17:52.147334: Processed 12000 documents...
2019-11-01 23:18:05.758124: Processed 13000 documents...
2019-11-01 23:19:07.530390: Start generating trigrams...
2019-11-01 23:19:28.8515

In [None]:
all_truncated_ngrams_df[all_truncated_ngrams_df.ngram == 'consumer_price_index']

In [None]:
all_truncated_ngrams_df.head()

In [None]:
# try:
#     CORPUS_ID_SAMPLES = {
#         'WB': 10000,
#         'IMF': 10000,
#     }

#     all_truncated_ngrams = []

#     for CORPUS_ID, NUM_SAMPLES in CORPUS_ID_SAMPLES.items():
#         notify('run_log.txt', f'Generating ngrams for {CORPUS_ID}...')
#         all_cleaned_files = glob.glob(os.path.join(get_txt_clean_path(CORPUS_ID), '*.txt'))
#         freq_cutoff = min(500, 0.01 * len(all_cleaned_files))  # Cutoff set to minimize noise from ngrams of rare occurrence.

#         np.random.shuffle(all_cleaned_files)
#         cleaned_files_partition = np.array_split(all_cleaned_files, len(all_cleaned_files) // NUM_SAMPLES) # if len(all_cleaned_files) is less than twice NUM_SAMPLES, it will return a single list containing all the files.

#         phrasers_map = {}
#         all_ngram_freqs = []

#         for ix in range(len(cleaned_files_partition)):
#             notify('run_log.txt', f"Starting with sample {ix + 1}...")

#             # np.random.seed(ix + 1)
#             # cleaned_files = np.random.choice(all_cleaned_files, NUM_SAMPLES, replace=False)

#             cleaned_files = cleaned_files_partition[ix]
#             _phrasers = train_phrasers(cleaned_files)
#             phrasers_map[ix] = _phrasers

#             ngram_freq = collect_ngrams(
#                 cleaned_files, _phrasers,
#                 filename=os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_{ix}.csv')
#             )

#             all_ngram_freqs.append(ngram_freq.set_index('ngram'))

#         notify('run_log.txt', f'Concatenating {len(all_ngram_freqs)} partitions...')
#         all_ngram_freqs_df = pd.concat(all_ngram_freqs, axis=1).sum(axis=1)

#         all_ngram_freqs_df.index.name = 'ngram'
#         all_ngram_freqs_df.name = 'occ'
#         all_ngram_freqs_df = all_ngram_freqs_df.reset_index()

#         all_ngram_freqs_df = all_ngram_freqs_df.sort_values('occ', ascending=False).reset_index(drop='index')
        
#         notify('run_log.txt', f'Saving all {all_ngram_freqs_df.shape[0]} ngrams...')
#         all_ngram_freqs_df.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_all.csv'), index=False)

#         notify('run_log.txt', f'Truncating ngrams to minimum of {freq_cutoff} occurrences...')
#         truncated_ngram_freqs_df = all_ngram_freqs_df[all_ngram_freqs_df['occ'] >= freq_cutoff].copy()
        
#         notify('run_log.txt', f'Saving all {truncated_ngram_freqs_df.shape[0]} truncated ngrams...')
#         truncated_ngram_freqs_df.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams-random_truncated.csv'), index=False)

#         all_truncated_ngrams.append(truncated_ngram_freqs_df)

#     notify('run_log.txt', f'Combining all {len(all_truncated_ngrams)} corpus ngrams...')
#     all_truncated_ngrams_df = pd.concat(all_truncated_ngrams, axis=0)
#     all_truncated_ngrams_df = all_truncated_ngrams_df.groupby('ngram')['occ'].sum().reset_index().sort_values('occ', ascending=False)
    
#     notify('run_log.txt', f'Saving combined truncated ngrams...')
#     all_truncated_ngrams_df.to_csv('../whitelists/truncated_ngrams.csv', index=False)
#     notify('run_success.txt', 'completed successfully!')
    
# except Exception as e:
#     notify('run_failed.txt', e)

# Process `ngrams` to create a cleaned version of each n-gram

In [None]:
%%capture

# Cleaner
%run ../DataCleanerModule.ipynb

In [None]:
# Build the single `cleaner` instance that the n-gram cleaning cells below use.
notify('run_log.txt', f'Setting up cleaner...')

# NOTE(review): `Cleaner` is presumably defined by DataCleanerModule.ipynb
# (loaded via %run); the meaning of each option lives there - confirm.
cleaner = Cleaner(
    use_spellchecker=True, use_respeller=True, use_lemmatizer=True, use_spacy=True,
    replacements_plurals_to_singular_file='../whitelists/whitelist_replacements_plurals_to_singular.csv',
    acronyms_file='../whitelists/whitelist_acronyms.csv',
    num_workers=22,
    ignore_length=0,
#     supported_lang=('en', 'it', 'ro', 'fr'),
    check_language=False,
)

In [None]:
# Reload the combined n-gram table from disk. The CSV is written with a header
# row (`ngram`, `occ`), so the first line is parsed as column names.
notify('run_log.txt', f'Loading truncated_ngrams.csv ...')
# ngrams_df = pd.read_csv('../whitelists/truncated_ngrams.csv', header=None, index_col=False)
ngrams_df = pd.read_csv('../whitelists/truncated_ngrams.csv', index_col=False)

In [None]:
ngrams_df.shape

In [None]:
ngrams_df.head()

In [None]:
%%time
cleaner.clean_text('world_bank')

In [None]:
def clean_ngram(ngram):
    """Run one n-gram through the global `cleaner`.

    Returns:
        A single-entry dict mapping the original n-gram to the `'text'` field
        of `cleaner.clean_text(ngram)`.
    """
    cleaned = cleaner.clean_text(ngram)
    return {ngram: cleaned['text']}

In [None]:
# Number of worker processes: leave 4 cores free but always use at least one.
# os.cpu_count() may return None when the count cannot be determined; fall
# back to 1 in that case instead of failing on `None - 4`.
WORKERS = max(1, (os.cpu_count() or 1) - 4)

In [None]:
# Clean every n-gram in parallel and build an {original: cleaned} lookup.
notify('run_log.txt', f'Running cleaner on ngrams...')

if __name__ == '__main__':
    # The context manager shuts the worker processes down once map() has
    # returned, so re-running this cell does not accumulate idle workers.
    with Pool(WORKERS) as p:
        res = p.map(clean_ngram, ngrams_df['ngram'])
    # Each result is a single-entry dict; merge them into one mapping.
    ngram_cleaned_map = {k: v for i in res for k, v in i.items()}

In [None]:
# Attach the cleaned form of every n-gram and drop the frequency column,
# which is not part of the whitelist.
notify('run_log.txt', f'Processing dataframe...')
ngrams_df['cleaned'] = ngrams_df['ngram'].map(ngram_cleaned_map)
# errors='ignore' keeps the cell re-runnable: on a second execution 'occ' is
# already gone and a plain drop would raise KeyError.
ngrams_df = ngrams_df.drop(columns='occ', errors='ignore')

In [None]:
ngrams_df.head()

In [None]:
# Persist the n-gram table (including the `cleaned` column) as the whitelist
# CSV, without the index.
notify('run_log.txt', f'Saving cleaned ngrams to whitelist...')
ngrams_df.to_csv('../whitelists/whitelist_ngrams_truncated_cleaned.csv', index=False)

In [None]:
# Sanity check: rows whose n-gram is the empty string.
# NOTE: `ngram_freq` is whatever the pipeline cell's loop assigned last
# (final partition of the final corpus); this cell needs that cell to have run.
ngram_freq[ngram_freq.ngram=='']

In [None]:
ngram_freq[ngram_freq.ngram=='world_bank']

In [None]:
ngram_freq.head(2)

In [None]:
# ngram_freq.to_csv(os.path.join(get_corpus_path(CORPUS_ID), f'{CORPUS_ID.lower()}_ngrams_sample_wb_44.csv'), index=False)

In [None]:
# Add `num_gram`: the number of '_'-separated parts in each n-gram.
# This mutates the `ngram_freq` frame left over from the pipeline cell.
ngram_freq['num_gram'] = ngram_freq.ngram.str.split('_').map(len)

In [None]:
# Show n-grams with at most 5 parts that occur in more than 10 documents.
# Requires the `num_gram` column added in the previous cell.
ngram_freq[(ngram_freq.num_gram <= 5) & (ngram_freq.occ > 10)]

In [21]:
all_truncated_ngrams_df[all_truncated_ngrams_df.ngram == 'climate_change']

Unnamed: 0,ngram,occ
7062,climate_change,21087.0


In [46]:
all_truncated_ngrams_df[all_truncated_ngrams_df.ngram == 'climate_change']

Unnamed: 0,ngram,occ
674,climate_change,25961.0


In [45]:
all_truncated_ngrams_df.iloc[100:150]

Unnamed: 0,ngram,occ
2974,policy_maker,22090.0
2273,living_standard,21921.0
3093,primary_secondary,21897.0
431,board_director,21732.0
4413,vulnerable_group,21686.0
3811,solid_waste,21309.0
3294,real_estate,21260.0
4174,timely_manner,20946.0
1738,general_condition,20828.0
878,contract_award,20816.0


In [45]:
all_truncated_ngrams_df.iloc[100:150]

Unnamed: 0,ngram,occ
2974,policy_maker,22090.0
2273,living_standard,21921.0
3093,primary_secondary,21897.0
431,board_director,21732.0
4413,vulnerable_group,21686.0
3811,solid_waste,21309.0
3294,real_estate,21260.0
4174,timely_manner,20946.0
1738,general_condition,20828.0
878,contract_award,20816.0
