# Goal: Investigate birth and death among closed classes of words
1. Load and Pre-Process  
    a. Load all the gzipped ngrams data into the notebook  
    b. Pre-process Google Ngrams database so that there is only the following [Google Tags](https://books.google.com/ngrams/info)
        - _PRON_	pronoun
        - _DET_	determiner or article
        - _ADP_	an adposition: either a preposition or a postposition
        - _CONJ_	conjunction
        - _PRT_	particle  
        (These tags can only be appended to a word (she_PRON), no stand alones (\_PRON\_))
    c. Save as JSON file  
~  
2. Investigate  
    a. Load in pre-processed JSON Files  
    b. Normalize the dictionary to have standard width  
    c. Smooth the data (rolling average)  
    d. Isolate years of word birth and death {unigram:\[year1, year2, ...\]   
    e. Save Birth and Death dictionaries separately as JSON files  

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

## (1) Load and pre-process 

In [2]:
import os
import gzip
import re
from unidecode import unidecode
#For the Google POS tagging
underscore = re.compile('_{1}')

In [3]:
import string
PUNCTUATION = set(char for char in string.punctuation).union({'“','”'})
DIGITS = set(string.digits)
VOWELS = set("aeiouyAEIOUY")
#Excluding '_' (underscore) from DASHES precludes the tagged 1grams "_NOUN", add it to also include the tagged 1grams
DASHES = {'—','–','—','―','‒','-','_'}
PUNCTUATION.difference_update(DASHES)
STOPS = PUNCTUATION.union(DIGITS)
GOOGLE_TAGS = {'PRON','DET','ADP','CONJ','PRT'}

In [4]:
def open_gzip(directory,file_path):
    with gzip.open(directory+file_path,'r') as f_in:
        rows = [x.decode('utf8').strip() for x in f_in.readlines()]
    return rows

In [5]:
def csv2tuple(string):
    year,match_count,volume_count = tuple(string.split(','))
    return int(year),int(match_count),int(volume_count)

In [6]:
def save_json(dictionary,directory,file_path):
    output = file_path+'.json'
    if len(dictionary)>0:
        with open(directory+output, 'w') as f_out:
            json.dump(dictionary, f_out)
        print('SAVED: ',output,len(dictionary))
    else:
        print('unigram dict empty',output)

In [7]:
def unigram_tests(unigram):
    #Checks each character in the unigram against the characters in the STOP set. (character level filtering) - no punctuation or digits allowed
    if set(unigram).intersection(STOPS):
        return False
    
    #must have a vowel (presupposes that it must also have a letter of the alphabet inside)
    if not set(unigram).intersection(VOWELS):
        return False #Rewrite the alphabet one, i think this is better
    
    #Words cannot start or end with dashes
    if unigram[0] in DASHES or unigram[-1] in DASHES:
        return False
    
    #Exclude words with more than one underscore, can make this != to only include tagged words
    if len(underscore.findall(unigram))>1:
        return False
    
    #must have 0 non-english letters
    test = unidecode(unigram, errors='replace')
    if test != unigram:
        return False
    
    #Can implement more tests here if you need to do more filtering
    
    else:
        return True

In [8]:
def preprocess_ngrams(directory,file_path):
    
    rows = open_gzip(directory,file_path)
    ngram_dict = dict()

    #This implementation uses {1gram:{year:match_count ...} ...}
    for row in tqdm(rows):
        columns = row.split('\t')
        #unigram is the first entry, the rest of the entries are of the form year,match_count,volume_count\t n times, where n is variable each line
        
        unigram = columns[0]
        if len(underscore.findall(unigram))==1: #One and only one underscore allowed
            word_tag = underscore.split(unigram) # list of [word,tag]
            #checks if tag is Google tag
            if word_tag[1] in GOOGLE_TAGS:
                #Removes the tag before processing unigram string
                unigram = word_tag[0].lower().strip()+'_'+word_tag[1]
                if unigram_tests(unigram):
                    #Parse the new entry and create a list of records in form [...[year, match_count]...]
                    records = dict()
                    #the first entry in columns is word so that is exluded
                    for entry in columns[1:]:
                        year,match_count,volume_count = csv2tuple(str(entry))
                        if year>1800 and volume_count>1:
                            records[year] = match_count

                    #Modify the dictionary if new entry is already there, else just add it as a new unigram:records to the dict
                    if unigram in ngram_dict.keys():
                        #accessing the ngram dictionary and seeing if each year is present, if so add match count, else add a new record entry to the dictionary.
                        for yr, match_ct in records.items(): #each record should be of the form {year, match_count}
                            #If the year in the new record is in the dict for this 1gram, then find where it is.
                            if yr in ngram_dict[unigram].keys():
                                ngram_dict[unigram][yr] += match_ct
                            else:
                                #This just adds the record to the end, will need to sort later
                                ngram_dict[unigram][yr] = match_ct
                    else:
                        ngram_dict[unigram] = records
    
    #Save as JSON
    save_json(ngram_dict,directory,file_path[:-3]+'_CLOSED_CLASSES')

## (2) Investigate

In [9]:
def open_json(directory,file_path):
    with open(directory+file_path,'r') as f:
        ngrams = json.load(f)
        f.close()
    return ngrams

In [10]:
def normalize(ngrams):
    words = list(ngrams.keys())
    years = [str(i) for i in range(1800,2020)]
    unigram_dict = dict()
    for word in tqdm(words):
        match_count_by_year = []
        for year in years:
            if year in ngrams[word].keys():
                match_count_by_year.append(ngrams[word][year])
            else:
                #Zeroes are necessary for smoothing
                match_count_by_year.append(0)
        unigram_dict[word] = match_count_by_year
    
    return unigram_dict, years

In [11]:
def smoothing(unigram_dict, years,smoothing = 5):
    df = pd.DataFrame.from_dict(unigram_dict #take in the dictionary
                    ).rolling(smoothing,center=True #create frames of size 5 (smoothing value), and replace value in middle
                    ).mean( #average accross those frames
                    ).rename({i:years[i] for i in range(len(years))}, axis = 'index' #rename the indices to years
                    ).dropna()
    years_map = {i:int(year) for i, year in enumerate(df.index)}
    ngrams = df.to_dict(orient = 'list')
    return ngrams, years_map

In [12]:
%%time
def birth_and_death(ngrams,years_map):
    birth, death = {},{}
    for unigram in tqdm(ngrams):
        l = ngrams[unigram]
        birth_years, death_years = [],[]
        for i in range(len(l)-1):
            #Birth
            if l[i]==0 and l[i+1]!=0:
                birth_years.append(years_map[i+1])
            #Death
            if l[i]!=0 and l[i+1]==0:
                death_years.append(years_map[i])

            #Disregarding death in the final year

        if len(birth_years)>0:
            birth[unigram] = birth_years

        if len(death_years)>0:
            death[unigram] = death_years

    print('Birth:',len(birth),'\nDeath:',len(death))
    return birth, death

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 7.87 µs


In [13]:
def investigate(directory,file_path):
    #birth, death = birth_and_death(smoothing(normalize(open_json(directory,file_path))))
    
    ngrams = open_json(directory,file_path)
    unigram_dict, years = normalize(ngrams)
    del ngrams
    ngrams, years_map = smoothing(unigram_dict, years)
    del unigram_dict
    del years
    birth, death = birth_and_death(ngrams,years_map)
    del ngrams
    del years_map
    save_json(birth,directory,file_path[:-5]+'_BIRTH')
    save_json(death,directory,file_path[:-5]+'_DEATH')

## Run Everything

In [14]:
directory = './Ngrams/'

In [15]:
%%time
files = os.listdir(directory)
for file_path in files:
    if '.gz' in file_path:
        preprocess_ngrams(directory,file_path)

100%|██████████| 2396510/2396510 [00:06<00:00, 350363.87it/s]


unigram dict empty 1-00000-of-00024_CLOSED_CLASSES.json


100%|██████████| 3100658/3100658 [00:11<00:00, 280323.98it/s]


SAVED:  1-00015-of-00024_CLOSED_CLASSES.json 13055


100%|██████████| 3109631/3109631 [00:10<00:00, 305724.03it/s]


SAVED:  1-00009-of-00024_CLOSED_CLASSES.json 4368


100%|██████████| 3365531/3365531 [00:11<00:00, 281013.15it/s]


SAVED:  1-00010-of-00024_CLOSED_CLASSES.json 12317


100%|██████████| 3467821/3467821 [00:13<00:00, 259971.89it/s]


SAVED:  1-00022-of-00024_CLOSED_CLASSES.json 33454


100%|██████████| 3384057/3384057 [00:15<00:00, 221430.04it/s]


SAVED:  1-00019-of-00024_CLOSED_CLASSES.json 72093


100%|██████████| 3375570/3375570 [00:08<00:00, 405703.84it/s]


unigram dict empty 1-00005-of-00024_CLOSED_CLASSES.json


100%|██████████| 3103866/3103866 [00:10<00:00, 307941.08it/s]


SAVED:  1-00006-of-00024_CLOSED_CLASSES.json 16626


100%|██████████| 3071926/3071926 [00:12<00:00, 248594.39it/s]


SAVED:  1-00021-of-00024_CLOSED_CLASSES.json 41723


100%|██████████| 3402459/3402459 [00:11<00:00, 302775.69it/s]


SAVED:  1-00013-of-00024_CLOSED_CLASSES.json 3739


100%|██████████| 3375649/3375649 [00:19<00:00, 170206.31it/s]


SAVED:  1-00016-of-00024_CLOSED_CLASSES.json 123093


100%|██████████| 3081673/3081673 [00:08<00:00, 370738.10it/s]


unigram dict empty 1-00003-of-00024_CLOSED_CLASSES.json


100%|██████████| 3377697/3377697 [00:09<00:00, 373982.75it/s]


unigram dict empty 1-00004-of-00024_CLOSED_CLASSES.json


100%|██████████| 3118263/3118263 [00:14<00:00, 211875.25it/s]


SAVED:  1-00018-of-00024_CLOSED_CLASSES.json 36568


100%|██████████| 4149670/4149670 [00:21<00:00, 191584.95it/s]


SAVED:  1-00023-of-00024_CLOSED_CLASSES.json 122872


100%|██████████| 3386487/3386487 [00:12<00:00, 275977.22it/s]


SAVED:  1-00011-of-00024_CLOSED_CLASSES.json 4995


100%|██████████| 3408143/3408143 [00:10<00:00, 310660.05it/s]


SAVED:  1-00008-of-00024_CLOSED_CLASSES.json 2891


100%|██████████| 3392643/3392643 [00:14<00:00, 230260.94it/s]


SAVED:  1-00014-of-00024_CLOSED_CLASSES.json 15740


100%|██████████| 3345476/3345476 [00:09<00:00, 343463.04it/s]


unigram dict empty 1-00001-of-00024_CLOSED_CLASSES.json


100%|██████████| 3315859/3315859 [00:31<00:00, 104947.86it/s]


unigram dict empty 1-00002-of-00024_CLOSED_CLASSES.json


100%|██████████| 3403219/3403219 [00:31<00:00, 109608.91it/s]


SAVED:  1-00017-of-00024_CLOSED_CLASSES.json 20180


100%|██████████| 3135145/3135145 [00:10<00:00, 298056.17it/s]


SAVED:  1-00012-of-00024_CLOSED_CLASSES.json 12206


100%|██████████| 3427775/3427775 [00:17<00:00, 200213.01it/s]


SAVED:  1-00020-of-00024_CLOSED_CLASSES.json 77955


100%|██████████| 3384843/3384843 [00:12<00:00, 271381.35it/s]


SAVED:  1-00007-of-00024_CLOSED_CLASSES.json 9092
CPU times: user 10min 27s, sys: 1min 28s, total: 11min 56s
Wall time: 12min 25s


In [16]:
%%time
files = os.listdir(directory)
for file_path in files:
    if '_CLOSED_CLASSES.json' in file_path:
        investigate(directory,file_path)

100%|██████████| 33454/33454 [07:25<00:00, 75.13it/s]   
100%|██████████| 33454/33454 [00:01<00:00, 28331.33it/s]


Birth: 32801 
Death: 31637
SAVED:  1-00022-of-00024_CLOSED_CLASSES_BIRTH.json 32801


100%|██████████| 2891/2891 [00:00<00:00, 31859.52it/s]

SAVED:  1-00022-of-00024_CLOSED_CLASSES_DEATH.json 31637



100%|██████████| 2891/2891 [00:00<00:00, 32852.34it/s]
  0%|          | 0/16626 [00:00<?, ?it/s]

Birth: 2848 
Death: 2553
SAVED:  1-00008-of-00024_CLOSED_CLASSES_BIRTH.json 2848
SAVED:  1-00008-of-00024_CLOSED_CLASSES_DEATH.json 2553


100%|██████████| 16626/16626 [00:00<00:00, 29536.96it/s]
100%|██████████| 16626/16626 [00:00<00:00, 31782.50it/s]
  0%|          | 0/3739 [00:00<?, ?it/s]

Birth: 16410 
Death: 15352
SAVED:  1-00006-of-00024_CLOSED_CLASSES_BIRTH.json 16410
SAVED:  1-00006-of-00024_CLOSED_CLASSES_DEATH.json 15352


100%|██████████| 3739/3739 [00:00<00:00, 31047.33it/s]
100%|██████████| 3739/3739 [00:00<00:00, 23591.58it/s]


Birth: 3673 
Death: 3386
SAVED:  1-00013-of-00024_CLOSED_CLASSES_BIRTH.json 3673
SAVED:  1-00013-of-00024_CLOSED_CLASSES_DEATH.json 3386


100%|██████████| 41723/41723 [00:01<00:00, 28121.77it/s]
100%|██████████| 41723/41723 [00:01<00:00, 25363.08it/s]


Birth: 40920 
Death: 39239
SAVED:  1-00021-of-00024_CLOSED_CLASSES_BIRTH.json 40920


  0%|          | 0/12317 [00:00<?, ?it/s]

SAVED:  1-00021-of-00024_CLOSED_CLASSES_DEATH.json 39239


100%|██████████| 12317/12317 [00:00<00:00, 30507.55it/s]
100%|██████████| 12317/12317 [00:00<00:00, 24559.69it/s]


Birth: 12054 
Death: 11014
SAVED:  1-00010-of-00024_CLOSED_CLASSES_BIRTH.json 12054
SAVED:  1-00010-of-00024_CLOSED_CLASSES_DEATH.json 11014


100%|██████████| 36568/36568 [00:01<00:00, 28902.21it/s]
100%|██████████| 36568/36568 [00:01<00:00, 24760.19it/s]


Birth: 35615 
Death: 34482
SAVED:  1-00018-of-00024_CLOSED_CLASSES_BIRTH.json 35615
SAVED:  1-00018-of-00024_CLOSED_CLASSES_DEATH.json 34482


100%|██████████| 123093/123093 [00:04<00:00, 26986.00it/s]
100%|██████████| 123093/123093 [00:04<00:00, 25892.08it/s]


Birth: 120170 
Death: 116331
SAVED:  1-00016-of-00024_CLOSED_CLASSES_BIRTH.json 120170


  0%|          | 0/13055 [00:00<?, ?it/s]

SAVED:  1-00016-of-00024_CLOSED_CLASSES_DEATH.json 116331


100%|██████████| 13055/13055 [00:00<00:00, 30972.35it/s]
100%|██████████| 13055/13055 [00:00<00:00, 27336.75it/s]


Birth: 12783 
Death: 11938
SAVED:  1-00015-of-00024_CLOSED_CLASSES_BIRTH.json 12783
SAVED:  1-00015-of-00024_CLOSED_CLASSES_DEATH.json 11938


100%|██████████| 15740/15740 [00:00<00:00, 30886.30it/s]
100%|██████████| 15740/15740 [00:00<00:00, 31911.05it/s]


Birth: 15418 
Death: 14603
SAVED:  1-00014-of-00024_CLOSED_CLASSES_BIRTH.json 15418
SAVED:  1-00014-of-00024_CLOSED_CLASSES_DEATH.json 14603


100%|██████████| 20180/20180 [00:00<00:00, 29395.45it/s]
100%|██████████| 20180/20180 [00:00<00:00, 27905.07it/s]


Birth: 19793 
Death: 19037
SAVED:  1-00017-of-00024_CLOSED_CLASSES_BIRTH.json 19793
SAVED:  1-00017-of-00024_CLOSED_CLASSES_DEATH.json 19037


100%|██████████| 72093/72093 [00:02<00:00, 28396.24it/s]
100%|██████████| 72093/72093 [00:02<00:00, 24230.70it/s]


Birth: 70117 
Death: 68116


  0%|          | 0/4995 [00:00<?, ?it/s]

SAVED:  1-00019-of-00024_CLOSED_CLASSES_BIRTH.json 70117
SAVED:  1-00019-of-00024_CLOSED_CLASSES_DEATH.json 68116


100%|██████████| 4995/4995 [00:00<00:00, 32030.90it/s]
100%|██████████| 4995/4995 [00:00<00:00, 32678.10it/s]


Birth: 4941 
Death: 4469
SAVED:  1-00011-of-00024_CLOSED_CLASSES_BIRTH.json 4941
SAVED:  1-00011-of-00024_CLOSED_CLASSES_DEATH.json 4469


100%|██████████| 77955/77955 [00:02<00:00, 28650.48it/s]
100%|██████████| 77955/77955 [00:02<00:00, 26408.39it/s]


Birth: 76363 
Death: 73355
SAVED:  1-00020-of-00024_CLOSED_CLASSES_BIRTH.json 76363


  0%|          | 0/12206 [00:00<?, ?it/s]

SAVED:  1-00020-of-00024_CLOSED_CLASSES_DEATH.json 73355


100%|██████████| 12206/12206 [00:00<00:00, 29001.07it/s]
100%|██████████| 12206/12206 [00:00<00:00, 30199.57it/s]
  0%|          | 0/9092 [00:00<?, ?it/s]

Birth: 12048 
Death: 11039
SAVED:  1-00012-of-00024_CLOSED_CLASSES_BIRTH.json 12048
SAVED:  1-00012-of-00024_CLOSED_CLASSES_DEATH.json 11039


100%|██████████| 9092/9092 [00:00<00:00, 29648.27it/s]
100%|██████████| 9092/9092 [00:00<00:00, 31448.68it/s]
  0%|          | 0/4368 [00:00<?, ?it/s]

Birth: 8955 
Death: 8420
SAVED:  1-00007-of-00024_CLOSED_CLASSES_BIRTH.json 8955
SAVED:  1-00007-of-00024_CLOSED_CLASSES_DEATH.json 8420


100%|██████████| 4368/4368 [00:00<00:00, 29291.08it/s]
100%|██████████| 4368/4368 [00:00<00:00, 32340.48it/s]


Birth: 4293 
Death: 3945
SAVED:  1-00009-of-00024_CLOSED_CLASSES_BIRTH.json 4293
SAVED:  1-00009-of-00024_CLOSED_CLASSES_DEATH.json 3945


100%|██████████| 122872/122872 [00:04<00:00, 27815.97it/s]
100%|██████████| 122872/122872 [00:04<00:00, 27492.63it/s]


Birth: 118388 
Death: 115411
SAVED:  1-00023-of-00024_CLOSED_CLASSES_BIRTH.json 118388
SAVED:  1-00023-of-00024_CLOSED_CLASSES_DEATH.json 115411
CPU times: user 3min 10s, sys: 4min 20s, total: 7min 30s
Wall time: 9min 58s
