# Goal: Investigate birth and death among closed classes of words
1. Load the \*\_CLOSED_CLASSES.json files
2. Separate the word from the part of speech and form JSON of form  
        {unigram: {POS:'pos',
                   max_usage: max_usage,
                   median_all: median_all,
                   median_in_use:median_in_use,
                   mean_all: mean_all,
                   mean_in_use:mean_in_use,
                   birth_years: [year1, year2, ...],
                   death_years: [year1, year2, ...]}
            ...}
    Where `me(di)an_all` is the me(di)an of the usage frequencies over every year in the time interval,
    and `me(di)an_in_use` is the me(di)an of the usage frequencies over only the years in which the word is actually in use (frequency > 0).  
    
3. Concatenate the final dictionaries
4. Save as a single JSON

Available parts of speech:
- _PRON_	pronoun
- _DET_	determiner or article
- _ADP_	an adposition: either a preposition or a postposition
- _CONJ_	conjunction
- _PRT_	particle

## Load the \*\_CLOSED_CLASSES.json files

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
# For the Google POS tagging: pattern matching one underscore character,
# the separator between a word and its part-of-speech tag (e.g. word_DET)
underscore = re.compile('_{1}')
import statistics

In [2]:
def open_json(directory,file_path):
    """Load one JSON file and return its decoded content.

    Parameters:
        directory: folder prefix. It is concatenated with ``file_path``
            as-is, so it must already end with a path separator.
        file_path: name of the file inside ``directory``.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # JSON text is UTF-8, so state the encoding instead of relying on the
    # platform's locale default. The ``with`` block closes the file on exit,
    # which makes an explicit close() call unnecessary.
    with open(directory+file_path,'r',encoding='utf-8') as f:
        return json.load(f)

In [3]:
def normalize(ngrams):
    """Turn sparse per-year counts into dense lists covering 1800-2019.

    Parameters:
        ngrams: mapping of unigram -> {year string: value}.

    Returns:
        A tuple ``(unigram_dict, years)`` where ``years`` is the list of year
        strings '1800' ... '2019' and ``unigram_dict`` maps every unigram to
        a list with one value per entry of ``years``, in the same order.
    """
    years = list(map(str, range(1800,2020)))
    unigram_dict = {}
    for word, yearly_counts in tqdm(ngrams.items()):
        # Years with no record are filled with 0: the zeroes are necessary
        # for the smoothing step, which needs a value at every position.
        unigram_dict[word] = [yearly_counts.get(year, 0) for year in years]
    return unigram_dict, years

In [4]:
def smoothing(unigram_dict, years, smoothing = 5):
    """Smooth every unigram's yearly series with a centred rolling mean.

    Parameters:
        unigram_dict: mapping of unigram -> list of yearly values, all lists
            having one entry per element of ``years``.
        years: list of year labels, positionally aligned with those lists.
        smoothing: size of the rolling window (default 5).

    Returns:
        A tuple ``(ngrams, years_map)``. ``ngrams`` maps each unigram to the
        list of smoothed values for the rows that survive ``dropna`` (rows
        without a full window are NaN and are removed). ``years_map`` maps
        each position in those lists to the corresponding year as an int.
    """
    # One column per unigram, one row per year position.
    frame = pd.DataFrame.from_dict(unigram_dict)

    # Each value is replaced by the mean of the window centred on it.
    averaged = frame.rolling(smoothing, center=True).mean()

    # Relabel the positional index with the year labels, then discard the
    # rows that have no complete window around them.
    position_to_year = dict(enumerate(years))
    averaged = averaged.rename(position_to_year, axis = 'index').dropna()

    years_map = {}
    for position, year in enumerate(averaged.index):
        years_map[position] = int(year)

    return averaged.to_dict(orient = 'list'), years_map

In [5]:
def analyze_birth_and_death(ngrams,years_map):
    """Find the years in which each unigram appears and disappears.

    Parameters:
        ngrams: mapping of tagged unigram (``word_POS``) -> list of smoothed
            yearly frequencies.
        years_map: mapping of list position -> year, aligned with those lists.

    Returns:
        A dict keyed by the bare word (POS tag removed). Only unigrams with at
        least one birth or death are included. Each value holds:
        'POS', 'max_usage', 'median_all', 'median_in_use', 'mean_all',
        'mean_in_use', 'birth_years' and 'death_years'.
        The ``*_all`` statistics use every value of the series, the
        ``*_in_use`` statistics only the values greater than 0.

    A birth is a step from a zero frequency to a non-zero one and is dated to
    the year of the non-zero value. A death is a step from non-zero to zero
    and is dated to the last non-zero year. A series that is still non-zero
    at its last position therefore never records a death there.

    NOTE(review): the result is keyed by the bare word, so two tagged
    unigrams that share a word (different POS) overwrite each other, the
    last one processed winning. Confirm this is intended.
    """
    ngrams_analyzed = {}

    for unigram in tqdm(ngrams.keys()):
        frequency_list = ngrams[unigram]
        frequency_in_use_list = [f for f in frequency_list if f>0]
        if not frequency_in_use_list:
            # No usage at all after smoothing: nothing to analyse.
            continue

        birth_years, death_years = [],[]
        # Walk over consecutive pairs (value at i, value at i+1).
        for i, (current, following) in enumerate(zip(frequency_list, frequency_list[1:])):
            #Birth
            if current==0 and following!=0:
                birth_years.append(years_map[i+1])
            #Death
            if current!=0 and following==0:
                death_years.append(years_map[i])

        if not birth_years and not death_years:
            continue

        # Split on the LAST underscore only, so a word that itself contains
        # underscores keeps its full text and the tag is still the suffix.
        word, separator, pos = unigram.rpartition('_')
        if not separator:
            # Untagged key: keep the word and record an empty POS instead of
            # failing on a missing tag.
            word, pos = unigram, ''

        # The statistics module is pure Python and comparatively slow, so the
        # summary values are computed only for unigrams that are kept.
        ngrams_analyzed[word] = {'POS':pos,
                                 'max_usage':max(frequency_list),
                                 'median_all':statistics.median(frequency_list),
                                 'median_in_use':statistics.median(frequency_in_use_list),
                                 'mean_all':statistics.mean(frequency_list),
                                 'mean_in_use':statistics.mean(frequency_in_use_list),
                                 'birth_years':birth_years,
                                 'death_years':death_years}

    return ngrams_analyzed

In [6]:
def save_json(dictionary,directory,file_path):
    """Write ``dictionary`` to ``<directory><file_path>.json`` unless it is empty.

    Parameters:
        dictionary: the data to serialise with ``json.dump``.
        directory: folder prefix, concatenated with the file name as-is.
        file_path: output name without the '.json' extension.

    Prints a confirmation with the file name and the number of entries when
    the file is written, or a notice when the dictionary is empty (in which
    case no file is created).
    """
    file_name = file_path+'.json'

    # Guard clause: an empty result is reported, not saved.
    if len(dictionary)==0:
        print('unigram dict empty',file_name)
        return

    with open(directory+file_name, 'w') as f_out:
        json.dump(dictionary, f_out)
    print('SAVED: ',file_name,len(dictionary))

In [7]:
def main(directory):
    """Run the whole pipeline over every *_CLOSED_CLASSES.json file.

    Parameters:
        directory: folder containing the input files. It is used as a plain
            string prefix, so it must end with a path separator. The combined
            result is written to the same folder as
            'CLOSED_CLASSES_SORTABLE.json'.

    For each input file the counts are loaded, normalised to dense yearly
    lists, smoothed, and analysed for births and deaths. The per-file results
    are merged into one dictionary, which is saved at the end.
    """
    final_dict = {}

    # The ``directory`` argument is honoured as given (it is no longer
    # replaced by a hard-coded path).
    # Files are visited in sorted order because os.listdir returns names in
    # arbitrary order, and a word seen in several files keeps the entry from
    # the file processed last.
    for file_path in sorted(os.listdir(directory)):
        # Match on the suffix so names that only contain the text somewhere
        # in the middle (backups, temp files) are not parsed as JSON.
        if not file_path.endswith('_CLOSED_CLASSES.json'):
            continue
        ngrams = open_json(directory,file_path)
        unigram_dict, years = normalize(ngrams)
        # Intermediate structures are deleted as soon as the next stage has
        # consumed them, to drop the references before the next large object.
        del ngrams
        ngrams, years_map = smoothing(unigram_dict, years)
        del unigram_dict
        del years
        ngrams_analyzed = analyze_birth_and_death(ngrams,years_map)
        del ngrams
        del years_map
        final_dict.update(ngrams_analyzed)
    save_json(final_dict,directory,'CLOSED_CLASSES_SORTABLE')

## Run Everything

In [8]:
%%time
# Run the full pipeline on the ./Ngrams/ folder; the %%time cell magic above
# reports the CPU and wall time taken by this cell.
main(directory = './Ngrams/')

100%|██████████| 33454/33454 [00:01<00:00, 24039.59it/s]
100%|██████████| 33454/33454 [00:10<00:00, 3129.65it/s]
100%|██████████| 2891/2891 [00:00<00:00, 32063.73it/s]
100%|██████████| 2891/2891 [00:00<00:00, 4131.73it/s]
100%|██████████| 16626/16626 [00:00<00:00, 28058.11it/s]
100%|██████████| 16626/16626 [00:04<00:00, 4135.79it/s]
100%|██████████| 3739/3739 [00:00<00:00, 30881.16it/s]
100%|██████████| 3739/3739 [00:00<00:00, 4594.98it/s]
100%|██████████| 41723/41723 [00:01<00:00, 30132.29it/s]
100%|██████████| 41723/41723 [00:09<00:00, 4363.87it/s]
100%|██████████| 12317/12317 [00:00<00:00, 31664.29it/s]
100%|██████████| 12317/12317 [00:02<00:00, 4204.11it/s]
100%|██████████| 36568/36568 [00:01<00:00, 31254.78it/s]
100%|██████████| 36568/36568 [00:08<00:00, 4105.91it/s]
100%|██████████| 123093/123093 [00:04<00:00, 27879.35it/s]
100%|██████████| 123093/123093 [00:28<00:00, 4295.92it/s]
100%|██████████| 13055/13055 [00:00<00:00, 30922.10it/s]
100%|██████████| 13055/13055 [00:02<00:00, 

SAVED:  CLOSED_CLASSES_SORTABLE.json 440403
CPU times: user 4min 34s, sys: 5.38 s, total: 4min 40s
Wall time: 4min 41s
