# Goal: Investigate birth and death among closed classes of words
1. Load the \*\_CLOSED_CLASSES.json files
2. Separate each word from its part-of-speech tag and build a JSON object of the form  
        {word: {POS: 'pos',
                max_usage: max_usage,
                median_all: median_all,
                median_in_use: median_in_use,
                mean_all: mean_all,
                mean_in_use: mean_in_use,
                birth_years: [year1, year2, ...],
                death_years: [year1, year2, ...]},
            ...}
    Where 
    - `max_usage` is the maximum frequency of usage over the entire time period 
    - `me(di)an_all` is the me(di)an of the frequencies of usage at all points in the time interval.
    - `me(di)an_in_use` is the me(di)an of the frequencies of usage only when actually in use (frequency > 0)  
    
    
3. Concatenate the final dictionaries
4. Save as a single JSON

Available parts of speech:
- _PRON_	pronoun
- _DET_	determiner or article
- _ADP_	an adposition: either a preposition or a postposition
- _CONJ_	conjunction
- _PRT_	particle

## Load the \*\_CLOSED_CLASSES.json files

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
#For the Google POS tagging
underscore = re.compile('_{1}')
import statistics

In [2]:
def open_json(directory,file_path):
    """Load and return the JSON document stored at ``directory + file_path``.

    Args:
        directory: Directory prefix. It is concatenated verbatim with
            ``file_path``, so it must already end with a path separator.
        file_path: Name of the file (including its extension) inside
            ``directory``.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.
    """
    # JSON text is UTF-8. Decoding explicitly avoids mangling non-ASCII
    # unigrams on platforms whose default text encoding is something else.
    # The ``with`` statement closes the file, so no manual close is needed.
    with open(directory+file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def normalize(ngrams, start_year=1800, end_year=2020):
    """Expand sparse per-year values into dense, equally long yearly lists.

    Args:
        ngrams: Mapping of unigram -> mapping of year (as a string) -> value.
            Years missing from the inner mapping are treated as 0.
        start_year: First year of the dense range (inclusive).
        end_year: End of the dense range (exclusive, as in ``range``). The
            defaults reproduce the original fixed span 1800..2019.

    Returns:
        A tuple ``(unigram_dict, years)`` where ``unigram_dict`` maps each
        unigram to a list with one value per entry of ``years``, and
        ``years`` is the list of year strings in ascending order.
    """
    years = [str(year) for year in range(start_year, end_year)]
    unigram_dict = {}
    for word, value_by_year in tqdm(ngrams.items()):
        # Zeroes are necessary for smoothing: every series must cover every
        # year so the rolling window sees gaps as 0 rather than skipping them.
        unigram_dict[word] = [value_by_year.get(year, 0) for year in years]
    return unigram_dict, years

In [4]:
def smoothing(unigram_dict, years, smoothing = 5):
    """Smooth every unigram's yearly series with a centred rolling mean.

    Args:
        unigram_dict: Mapping of unigram -> list of yearly values, all lists
            having one entry per element of ``years``.
        years: Year labels (strings convertible with ``int``), one per row.
        smoothing: Size of the rolling window.

    Returns:
        A tuple ``(ngrams, years_map)``. ``ngrams`` maps each unigram to its
        list of smoothed values; ``years_map`` maps a position in those lists
        to the corresponding year as an ``int``.
    """
    # One column per unigram, one row per position in `years`.
    raw_frame = pd.DataFrame.from_dict(unigram_dict)

    # Each row becomes the mean of the window centred on it.
    averaged = raw_frame.rolling(smoothing, center=True).mean()

    # Swap the positional row labels 0..n-1 for the matching year labels.
    labelled = averaged.rename(index=dict(enumerate(years)))

    # Rows containing NaN are discarded, so only the remaining years survive.
    trimmed = labelled.dropna()

    years_map = {}
    for position, year_label in enumerate(trimmed.index):
        years_map[position] = int(year_label)

    return trimmed.to_dict(orient = 'list'), years_map

In [5]:
def analyze_birth_and_death(ngrams,years_map):
    """Find the years in which each unigram enters and leaves usage.

    A "birth" is a year with non-zero frequency that directly follows a year
    with zero frequency; a "death" is a year with non-zero frequency that is
    directly followed by a year with zero frequency. A unigram that is
    already in use at the first position, or still in use at the last one,
    gets no birth/death recorded for that boundary.

    Args:
        ngrams: Mapping of tagged unigram (``word_POS``) -> list of yearly
            frequencies.
        years_map: Mapping of position in the frequency list -> year.

    Returns:
        Mapping of word -> dict with the keys ``POS``, ``max_usage``,
        ``median_all``, ``median_in_use``, ``mean_all``, ``mean_in_use``,
        ``birth_years`` and ``death_years``. Unigrams that are never in use,
        or that have neither a birth nor a death, are left out.
    """
    ngrams_analyzed = {}

    for unigram in tqdm(ngrams.keys()):
        frequency_list = ngrams[unigram]
        frequency_in_use_list = [f for f in frequency_list if f>0]
        if not frequency_in_use_list:
            # No value greater than 0 anywhere in the series: nothing to report.
            continue

        # NOTE(review): the transitions below rely on exact comparison with 0;
        # this assumes the upstream smoothing yields exact zeros - confirm.
        birth_years, death_years = [],[]
        for i, (current, following) in enumerate(zip(frequency_list, frequency_list[1:])):
            #Birth
            if current==0 and following!=0:
                birth_years.append(years_map[i+1])
            #Death
            if current!=0 and following==0:
                death_years.append(years_map[i])
            #Disregarding death in the final year

        if not birth_years and not death_years:
            continue

        # Split on the LAST underscore only, so a word that itself contains
        # underscores keeps them and the trailing tag is read as the POS.
        # A key without any underscore is kept whole with an empty POS
        # instead of raising.
        word, separator, pos = unigram.rpartition('_')
        if not separator:
            word, pos = unigram, ''

        # Statistics are computed only for entries that are actually kept.
        # NOTE(review): entries are keyed by the bare word, so the same word
        # under two different POS tags overwrites the earlier entry.
        ngrams_analyzed[word] = {'POS':pos,
                                 'max_usage':max(frequency_list),
                                 'median_all':statistics.median(frequency_list),
                                 'median_in_use':statistics.median(frequency_in_use_list),
                                 'mean_all':statistics.mean(frequency_list),
                                 'mean_in_use':statistics.mean(frequency_in_use_list),
                                 'birth_years':birth_years,
                                 'death_years':death_years}

    return ngrams_analyzed

In [6]:
def save_json(dictionary,directory,file_path):
    """Write ``dictionary`` as JSON to ``directory + file_path + '.json'``.

    Nothing is written when ``dictionary`` is empty; a message is printed in
    either case.

    Args:
        dictionary: The JSON-serialisable mapping to store.
        directory: Directory prefix, concatenated verbatim with the file name.
        file_path: Output file name without the ``.json`` extension.
    """
    file_name = file_path+'.json'
    entry_count = len(dictionary)

    # Guard clause: an empty result is reported, never saved.
    if entry_count == 0:
        print('unigram dict empty',file_name)
        return

    with open(directory+file_name, 'w') as handle:
        json.dump(dictionary, handle)
    print('SAVED: ',file_name,entry_count)

## Run Everything

In [7]:
%%time
final_dict = {}

directory = '../Ngrams/unigram_data/'
files = os.listdir(directory)

for file_path in files:
    if '_CLOSED_CLASSES.json' in file_path:
        ngrams = open_json(directory,file_path)
        print('Opened',file_path)
        unigram_dict, years = normalize(ngrams)
        print('Normalized')
        del ngrams
        ngrams, years_map = smoothing(unigram_dict, years)
        print('Smoothed')
        del unigram_dict
        del years
        ngrams_analyzed = analyze_birth_and_death(ngrams,years_map)
        print('Analyzed birth and death')
        del ngrams
        del years_map
        final_dict.update(ngrams_analyzed)
        del ngrams_analyzed
save_json(final_dict,directory,'CLOSED_CLASSES_SORTABLE')

Opened 1-00006-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 16626/16626 [00:00<00:00, 19467.80it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 16626/16626 [00:05<00:00, 3175.09it/s]


Analyzed birth and death
Opened 1-00007-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████████| 9092/9092 [00:00<00:00, 18436.71it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████████| 9092/9092 [00:02<00:00, 3149.15it/s]


Analyzed birth and death
Opened 1-00008-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████████| 2891/2891 [00:00<00:00, 19312.61it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████████| 2891/2891 [00:00<00:00, 3331.46it/s]


Analyzed birth and death
Opened 1-00009-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████████| 4368/4368 [00:00<00:00, 21408.19it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████████| 4368/4368 [00:01<00:00, 3018.17it/s]


Analyzed birth and death
Opened 1-00010-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 12317/12317 [00:00<00:00, 22474.74it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 12317/12317 [00:04<00:00, 3005.80it/s]


Analyzed birth and death
Opened 1-00011-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████████| 4995/4995 [00:00<00:00, 17390.01it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████████| 4995/4995 [00:01<00:00, 3244.28it/s]


Analyzed birth and death
Opened 1-00012-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 12206/12206 [00:00<00:00, 20733.65it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 12206/12206 [00:03<00:00, 3198.34it/s]


Analyzed birth and death
Opened 1-00013-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████████| 3739/3739 [00:00<00:00, 19022.42it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████████| 3739/3739 [00:01<00:00, 3165.65it/s]


Analyzed birth and death
Opened 1-00014-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 15740/15740 [00:00<00:00, 21261.95it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 15740/15740 [00:05<00:00, 2901.57it/s]


Analyzed birth and death
Opened 1-00015-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 13055/13055 [00:00<00:00, 21320.00it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 13055/13055 [00:04<00:00, 3058.74it/s]


Analyzed birth and death
Opened 1-00016-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████| 123093/123093 [00:06<00:00, 19678.59it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████| 123093/123093 [00:40<00:00, 3007.98it/s]


Analyzed birth and death
Opened 1-00017-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 20180/20180 [00:00<00:00, 21364.55it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 20180/20180 [00:06<00:00, 2955.05it/s]


Analyzed birth and death
Opened 1-00018-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 36568/36568 [00:01<00:00, 19040.00it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 36568/36568 [00:12<00:00, 2893.08it/s]


Analyzed birth and death
Opened 1-00019-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 72093/72093 [00:03<00:00, 21245.00it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 72093/72093 [00:25<00:00, 2841.49it/s]


Analyzed birth and death
Opened 1-00020-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 77955/77955 [00:04<00:00, 16587.82it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 77955/77955 [00:26<00:00, 2940.61it/s]


Analyzed birth and death
Opened 1-00021-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 41723/41723 [00:02<00:00, 18436.13it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 41723/41723 [00:14<00:00, 2861.80it/s]


Analyzed birth and death
Opened 1-00022-of-00024_CLOSED_CLASSES.json


100%|█████████████████████████████████████████████████████████████████████████| 33454/33454 [00:01<00:00, 18652.14it/s]


Normalized
Smoothed


100%|██████████████████████████████████████████████████████████████████████████| 33454/33454 [00:11<00:00, 2821.82it/s]


Analyzed birth and death
Opened 1-00023-of-00024_CLOSED_CLASSES.json


100%|███████████████████████████████████████████████████████████████████████| 122872/122872 [00:06<00:00, 19442.87it/s]


Normalized
Smoothed


100%|████████████████████████████████████████████████████████████████████████| 122872/122872 [00:43<00:00, 2822.95it/s]


Analyzed birth and death
SAVED:  CLOSED_CLASSES_SORTABLE.json 440403
Wall time: 6min 23s
