# Goal: Investigate birth and death among closed classes of words
1. Load the \*\_CLOSED_CLASSES.json files
3. Separate each word from its part-of-speech tag and build a dictionary of the form  
        {unigram: {'POS': pos,
                   'birth_years': [year1, year2, ...],
                   'death_years': [year1, year2, ...]},
            ...}
3. Concatenate the final dictionaries
4. Save as a single JSON

Available parts of speech:
- _PRON_	pronoun
- _DET_	determiner or article
- _ADP_	an adposition: either a preposition or a postposition
- _CONJ_	conjunction
- _PRT_	particle

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

## (1) Load the \*\_CLOSED_CLASSES.json files

In [2]:
import os
import re
# For the Google POS tagging: this pattern matches exactly one underscore and
# is used further down to split a key into its word and its POS tag.
underscore = re.compile('_{1}')

In [3]:
def open_json(directory,file_path):
    """Load a single JSON file and return its parsed content.

    Parameters
    ----------
    directory : str
        Folder that contains the file. A trailing path separator is optional.
    file_path : str
        Name of the file, relative to ``directory``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # os.path.join works whether or not `directory` ends with a separator;
    # plain string concatenation produced a wrong path in the latter case.
    full_path = os.path.join(directory, file_path)
    with open(full_path, 'r') as f:
        # The with-statement closes the file; no explicit close() is required.
        return json.load(f)

In [4]:
def normalize(ngrams, start_year=1800, end_year=2020):
    """Turn sparse per-year counts into dense, equally long count lists.

    Parameters
    ----------
    ngrams : dict
        Maps each unigram to a dict of ``{year_as_string: match_count}``.
    start_year : int, optional
        First year of the output range (inclusive). Defaults to 1800.
    end_year : int, optional
        End of the output range (exclusive). Defaults to 2020.

    Returns
    -------
    unigram_dict : dict
        Maps each unigram to a list with one count per year in the range;
        years missing from the input are filled with 0.
    years : list of str
        The year labels, in the same order as the count lists.
    """
    years = [str(year) for year in range(start_year, end_year)]
    unigram_dict = {}
    for word, counts_by_year in tqdm(ngrams.items()):
        # Zeroes are necessary for smoothing: every series must cover every
        # year so the rolling window always sees a value.
        unigram_dict[word] = [counts_by_year.get(year, 0) for year in years]
    return unigram_dict, years

In [5]:
def smoothing(unigram_dict, years,smoothing = 5):
    """Smooth every count series with a centered rolling mean.

    Parameters
    ----------
    unigram_dict : dict
        Maps each unigram to a list of counts, one per entry of ``years``.
    years : list of str
        Year labels for the rows, in order.
    smoothing : int, optional
        Size of the rolling window. Defaults to 5.

    Returns
    -------
    ngrams : dict
        Maps each unigram to its list of smoothed values. Rows that contain
        NaN after averaging are removed, so the lists are shorter than the
        input.
    years_map : dict
        Maps each position in the smoothed lists to its year as an ``int``.
    """
    # One column per unigram, one row per year (rows are 0..n-1 for now).
    counts = pd.DataFrame.from_dict(unigram_dict)

    # Replace each value by the mean of the window centered on it.
    averaged = counts.rolling(smoothing, center=True).mean()

    # Relabel the positional row index with the year strings.
    position_to_year = dict(enumerate(years))
    labelled = averaged.rename(position_to_year, axis = 'index')

    # Discard rows for which no mean could be computed.
    complete = labelled.dropna()

    years_map = {}
    for position, label in enumerate(complete.index):
        years_map[position] = int(label)

    ngrams = complete.to_dict(orient = 'list')
    return ngrams, years_map

In [6]:
def anaylze_birth_and_death(ngrams,years_map):
    """Find the years in which each tagged unigram appears or disappears.

    A "birth" is a step from a zero value to a non-zero value and is recorded
    as the year of the non-zero value. A "death" is a step from a non-zero
    value to a zero value and is recorded as the year of the last non-zero
    value. A series that is still non-zero at its end has no death recorded
    for the final year.

    Parameters
    ----------
    ngrams : dict
        Maps each key of the form ``word_POS`` to a list of values.
    years_map : dict
        Maps a position in those lists to the corresponding year.

    Returns
    -------
    dict
        ``{word: {'POS': pos, 'birth_years': [...], 'death_years': [...]}}``
        for every key with at least one birth or death. Keys that contain no
        underscore, and therefore no POS tag, are skipped.

    Notes
    -----
    * The result is keyed by the word alone, so two keys that share a word
      but differ in tag overwrite one another; the one iterated last wins.
    * NOTE(review): values are compared to 0 exactly. If the input comes from
      a floating-point rolling mean, tiny non-zero residues would hide a
      birth or death -- confirm this cannot occur for the data at hand.
    """
    ngrams_analyzed = {}

    for unigram, series in tqdm(ngrams.items()):
        birth_years, death_years = [], []
        # Compare every value with its successor.
        for i, (current, following) in enumerate(zip(series, series[1:])):
            if current == 0 and following != 0:
                # Birth: dated to the first non-zero year.
                birth_years.append(years_map[i + 1])
            elif current != 0 and following == 0:
                # Death: dated to the last non-zero year.
                death_years.append(years_map[i])

        if not birth_years and not death_years:
            continue

        # The word is everything before the first underscore, the tag is the
        # text between the first and second underscore (or the end).
        word, separator, tail = unigram.partition('_')
        if not separator:
            # No tag present; indexing the tag used to raise IndexError here.
            continue
        ngrams_analyzed[word] = {'POS': tail.split('_', 1)[0],
                                 'birth_years': birth_years,
                                 'death_years': death_years}
    return ngrams_analyzed

In [7]:
def save_json(dictionary,directory,file_path):
    """Write ``dictionary`` to ``<directory>/<file_path>.json``.

    Nothing is written when the dictionary is empty; a message is printed
    in either case.

    Parameters
    ----------
    dictionary : dict
        Data to serialize with ``json.dump``.
    directory : str
        Target folder. A trailing path separator is optional.
    file_path : str
        Output file name without the ``.json`` extension.
    """
    output = file_path+'.json'
    if len(dictionary) == 0:
        print('unigram dict empty',output)
        return

    # os.path.join keeps the file inside `directory` even when the caller
    # omits the trailing separator; concatenation wrote it next to the
    # directory under a mangled name instead.
    with open(os.path.join(directory, output), 'w') as f_out:
        json.dump(dictionary, f_out)
    print('SAVED: ',output,len(dictionary))

In [8]:
def main(directory):
    """Analyze every ``*_CLOSED_CLASSES.json`` file in ``directory``.

    Each matching file is loaded, normalized to dense yearly counts,
    smoothed, and scanned for birth and death years. The per-file results
    are merged into one dictionary, which is saved in the same directory as
    ``CLOSED_CLASSES_ANALYZED.json``.

    Parameters
    ----------
    directory : str
        Folder that holds the input files and receives the output file. It
        is handed to ``open_json`` and ``save_json`` unchanged.
    """
    final_dict = {}

    # The caller's `directory` is used as given; it was previously replaced
    # by a hard-coded './Ngrams/' here, which made the argument meaningless.
    # The listing is sorted because os.listdir returns names in arbitrary
    # order and dict.update lets later files overwrite earlier ones for the
    # same word -- sorting makes the merged result reproducible.
    for file_path in sorted(os.listdir(directory)):
        # Match on the suffix only, so unrelated names that merely contain
        # the pattern (e.g. backups) are not picked up.
        if not file_path.endswith('_CLOSED_CLASSES.json'):
            continue

        ngrams = open_json(directory,file_path)
        unigram_dict, years = normalize(ngrams)
        del ngrams  # release the raw counts before building the next stage

        smoothed, years_map = smoothing(unigram_dict, years)
        del unigram_dict, years

        final_dict.update(anaylze_birth_and_death(smoothed, years_map))
        del smoothed, years_map

    save_json(final_dict,directory,'CLOSED_CLASSES_ANALYZED')

## Run Everything

In [9]:
%%time
# Run the whole pipeline on the Ngrams folder and report how long it takes.
main(directory = './Ngrams/')

100%|██████████| 33454/33454 [00:01<00:00, 24807.85it/s]
100%|██████████| 33454/33454 [00:01<00:00, 27170.99it/s]
100%|██████████| 2891/2891 [00:00<00:00, 29914.16it/s]
100%|██████████| 2891/2891 [00:00<00:00, 26810.47it/s]
100%|██████████| 16626/16626 [00:00<00:00, 26689.98it/s]
100%|██████████| 16626/16626 [00:00<00:00, 23398.90it/s]
100%|██████████| 3739/3739 [00:00<00:00, 28731.43it/s]
100%|██████████| 3739/3739 [00:00<00:00, 28419.39it/s]
100%|██████████| 41723/41723 [00:01<00:00, 27256.38it/s]
100%|██████████| 41723/41723 [00:01<00:00, 25297.12it/s]
100%|██████████| 12317/12317 [00:00<00:00, 17051.31it/s]
100%|██████████| 12317/12317 [00:00<00:00, 21219.42it/s]
100%|██████████| 36568/36568 [00:01<00:00, 28449.36it/s]
100%|██████████| 36568/36568 [00:01<00:00, 25174.95it/s]
100%|██████████| 123093/123093 [00:04<00:00, 26144.71it/s]
100%|██████████| 123093/123093 [00:05<00:00, 21982.48it/s]
100%|██████████| 13055/13055 [00:00<00:00, 28837.56it/s]
100%|██████████| 13055/13055 [00:00

SAVED:  CLOSED_CLASSES_ANALYZED.json 440403
CPU times: user 2min 47s, sys: 5 s, total: 2min 52s
Wall time: 2min 55s
