# Goal: Investigate birth and death among closed classes of words
Load and Pre-Process   
1. Load all the gzipped ngrams data into the notebook  
2. Pre-process Google Ngrams database so that there is only the following [Google Tags](https://books.google.com/ngrams/info)
    - _PRON_	pronoun
    - _DET_	determiner or article
    - _ADP_	an adposition: either a preposition or a postposition
    - _CONJ_	conjunction
    - _PRT_	particle  
    (These tags can only be appended to a word (she_PRON), no stand alones (\_PRON\_))
3. Save as JSON file  

## Load and pre-process 

In [1]:
import os
import gzip
import re
import json
from tqdm import tqdm
#for unigram_tests filtering (words that represent phonemes which are not in the English set are not allowed)
from unidecode import unidecode
#For the Google POS tagging
underscore = re.compile('_{1}')

In [2]:
import string
PUNCTUATION = set(char for char in string.punctuation).union({'“','”'})
DIGITS = set(string.digits)
VOWELS = set("aeiouyAEIOUY")
#Excluding '_' (underscore) from DASHES precludes the tagged 1grams "_NOUN", add it to also include the tagged 1grams
DASHES = {'—','–','—','―','‒','-','_'}
PUNCTUATION.difference_update(DASHES)
STOPS = PUNCTUATION.union(DIGITS)
GOOGLE_TAGS = {'PRON','DET','ADP','CONJ','PRT'}

In [3]:
def open_gzip(directory,file_path):
    with gzip.open(directory+file_path,'r') as f_in:
        #Each row is in form: ngram TAB year, match_count, volume_count NEWLINE
        return [x.decode('utf8').strip() for x in f_in.readlines()]

In [4]:
def csv2tuple(string):
    year,match_count,volume_count = tuple(string.split(','))
    return int(year),int(match_count),int(volume_count)

In [5]:
def save_json(dictionary,directory,file_path):
    output = file_path+'.json'
    if len(dictionary)>0:
        with open(directory+output, 'w') as f_out:
            json.dump(dictionary, f_out)
        print('SAVED: ',output,len(dictionary))
    else:
        print('unigram dict empty',output)

In [6]:
def unigram_tests(unigram):
    #Checks each character in the unigram against the characters in the STOP set. (character level filtering) - no punctuation or digits allowed
    if set(unigram).intersection(STOPS):
        return False
    
    #must have a vowel (presupposes that it must also have a letter of the alphabet inside)
    if not set(unigram).intersection(VOWELS):
        return False #Rewrite the alphabet one, i think this is better
    
    #Words cannot start or end with dashes
    if unigram[0] in DASHES or unigram[-1] in DASHES:
        return False
    
    #Exclude words with more than one underscore, can make this != to only include tagged words
    if len(underscore.findall(unigram))>1:
        return False
    
    #must have 0 non-english letters
    test = unidecode(unigram, errors='replace')
    if test != unigram:
        return False
    
    #Can implement more tests here if you need to do more filtering
    
    else:
        return True

In [7]:
def preprocess_ngrams(directory,file_path):
    
    rows = open_gzip(directory,file_path)
    ngram_dict = dict()

    #This implementation outputs {1gram:{year:match_count ...} ...}
    #Each row is in form: ngram TAB year, match_count, volume_count TAB year, match_count, volume_count ... NEWLINE
    for row in tqdm(rows):
        #Thus we will first create columns in every row by splitting by the tabs
        columns = row.split('\t')
        #unigram is the first entry, the rest of the entries are of the form (year,match_count,volume_count\t) n times, where n is variable each line
        unigram = columns[0]
        if len(underscore.findall(unigram))==1: #One and only one underscore allowed
            word_tag = underscore.split(unigram) # list of [word,tag]
            #checks if tag is Google tag
            if word_tag[1] in GOOGLE_TAGS:
                #Removes the tag before processing unigram string
                unigram = word_tag[0].lower().strip()+'_'+word_tag[1]
                #This is the final check for lexemes
                if unigram_tests(unigram):
                    #Parse the new entry and create a dictionary of records in form {year:match_count}
                    records = dict()
                    #the first entry in columns is word so that is exluded
                    for entry in columns[1:]:
                        year,match_count,volume_count = csv2tuple(str(entry))
                        if year>1800 and volume_count>1:
                            records[year] = match_count

                    #Modify the dictionary if new entry is already there, else just add it as a new unigram:records to the dict
                    if unigram in ngram_dict.keys():
                        #accessing the ngram dictionary and seeing if each year is present, if so add match count, else add a new record entry to the dictionary.
                        for yr, match_ct in records.items(): #each record should be of the form {year, match_count}
                            #If the year in the new record is in the dict for this 1gram, then find where it is.
                            if yr in ngram_dict[unigram].keys():
                                ngram_dict[unigram][yr] += match_ct
                            else:
                                #This just adds the record to the end, will need to sort later
                                ngram_dict[unigram][yr] = match_ct
                    else:
                        ngram_dict[unigram] = records
    return ngram_dict

## Run Everything

In [8]:
%%time
directory = '../Ngrams/unigram_data/'
files = os.listdir(directory)
for file_path in files:
    if '.gz' in file_path:
        ngram_dict = preprocess_ngrams(directory,file_path)
        save_json(ngram_dict,directory,file_path[:-3]+'_CLOSED_CLASSES')
        del ngram_dict #This allows the next loop to start with the original amount of memory, not twice the amount

100%|████████████████████████████████████████████████████████████████████| 2396510/2396510 [00:09<00:00, 243740.11it/s]


unigram dict empty 1-00000-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3345476/3345476 [00:16<00:00, 207763.86it/s]


unigram dict empty 1-00001-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3315859/3315859 [00:14<00:00, 221132.15it/s]


unigram dict empty 1-00002-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3081673/3081673 [00:14<00:00, 215393.55it/s]


unigram dict empty 1-00003-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3377697/3377697 [00:15<00:00, 215747.88it/s]


unigram dict empty 1-00004-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3375570/3375570 [00:15<00:00, 220236.66it/s]


unigram dict empty 1-00005-of-00024_CLOSED_CLASSES.json


100%|████████████████████████████████████████████████████████████████████| 3103866/3103866 [00:17<00:00, 180476.58it/s]


SAVED:  1-00006-of-00024_CLOSED_CLASSES.json 16626


100%|████████████████████████████████████████████████████████████████████| 3384843/3384843 [00:20<00:00, 167463.95it/s]


SAVED:  1-00007-of-00024_CLOSED_CLASSES.json 9092


100%|████████████████████████████████████████████████████████████████████| 3408143/3408143 [00:19<00:00, 172041.88it/s]


SAVED:  1-00008-of-00024_CLOSED_CLASSES.json 2891


100%|████████████████████████████████████████████████████████████████████| 3109631/3109631 [00:17<00:00, 179396.84it/s]


SAVED:  1-00009-of-00024_CLOSED_CLASSES.json 4368


100%|████████████████████████████████████████████████████████████████████| 3365531/3365531 [00:20<00:00, 167540.74it/s]


SAVED:  1-00010-of-00024_CLOSED_CLASSES.json 12317


100%|████████████████████████████████████████████████████████████████████| 3386487/3386487 [00:19<00:00, 172723.24it/s]


SAVED:  1-00011-of-00024_CLOSED_CLASSES.json 4995


100%|████████████████████████████████████████████████████████████████████| 3135145/3135145 [00:19<00:00, 163792.52it/s]


SAVED:  1-00012-of-00024_CLOSED_CLASSES.json 12206


100%|████████████████████████████████████████████████████████████████████| 3402459/3402459 [00:19<00:00, 177097.24it/s]


SAVED:  1-00013-of-00024_CLOSED_CLASSES.json 3739


100%|████████████████████████████████████████████████████████████████████| 3392643/3392643 [00:20<00:00, 164729.53it/s]


SAVED:  1-00014-of-00024_CLOSED_CLASSES.json 15740


100%|████████████████████████████████████████████████████████████████████| 3100658/3100658 [00:19<00:00, 161272.91it/s]


SAVED:  1-00015-of-00024_CLOSED_CLASSES.json 13055


100%|████████████████████████████████████████████████████████████████████| 3375649/3375649 [00:30<00:00, 110347.49it/s]


SAVED:  1-00016-of-00024_CLOSED_CLASSES.json 123093


100%|████████████████████████████████████████████████████████████████████| 3403219/3403219 [00:21<00:00, 154764.93it/s]


SAVED:  1-00017-of-00024_CLOSED_CLASSES.json 20180


100%|████████████████████████████████████████████████████████████████████| 3118263/3118263 [00:19<00:00, 156095.47it/s]


SAVED:  1-00018-of-00024_CLOSED_CLASSES.json 36568


100%|████████████████████████████████████████████████████████████████████| 3384057/3384057 [00:21<00:00, 156585.45it/s]


SAVED:  1-00019-of-00024_CLOSED_CLASSES.json 72093


100%|████████████████████████████████████████████████████████████████████| 3427775/3427775 [00:21<00:00, 159520.42it/s]


SAVED:  1-00020-of-00024_CLOSED_CLASSES.json 77955


100%|████████████████████████████████████████████████████████████████████| 3071926/3071926 [00:17<00:00, 173226.80it/s]


SAVED:  1-00021-of-00024_CLOSED_CLASSES.json 41723


100%|████████████████████████████████████████████████████████████████████| 3467821/3467821 [00:19<00:00, 175427.92it/s]


SAVED:  1-00022-of-00024_CLOSED_CLASSES.json 33454


100%|████████████████████████████████████████████████████████████████████| 4149670/4149670 [00:30<00:00, 137671.85it/s]


SAVED:  1-00023-of-00024_CLOSED_CLASSES.json 122872
Wall time: 17min 52s
