# Goal: Investigate birth and death among closed classes of words
1. Load the \*\_CLOSED_CLASSES.json files
3. Separate each word from its part-of-speech tag and build a JSON object of the form  
        {unigram: {'POS': pos,
                   'birth_years': [year1, year2, ...],
                   'death_years': [year1, year2, ...]},
            ...}
3. Concatenate the final dictionaries
4. Save as a single JSON

Available parts of speech:
- _PRON_	pronoun
- _DET_	determiner or article
- _ADP_	an adposition: either a preposition or a postposition
- _CONJ_	conjunction
- _PRT_	particle

## (1) Load the \*\_CLOSED_CLASSES.json files

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import re
#For the Google POS tagging
underscore = re.compile('_{1}')

In [2]:
def open_json(directory,file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        directory: Directory prefix. It is concatenated directly with
            ``file_path``, so it must already end with a path separator
            (e.g. ``'./Ngrams/'``).
        file_path: File name appended to ``directory``.

    Returns:
        The object decoded by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is
    # needed. The encoding is spelled out so decoding does not depend on the
    # platform's default locale encoding.
    with open(directory + file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def normalize(ngrams, start_year=1800, end_year=2020):
    """Expand sparse per-year counts into dense, equally long lists.

    Args:
        ngrams: Mapping ``word -> {year_string: match_count}``. A year with
            no entry for a word is treated as a count of 0.
        start_year: First year of the grid (inclusive). Defaults to 1800.
        end_year: End of the grid (exclusive). Defaults to 2020.

    Returns:
        A tuple ``(unigram_dict, years)``. ``years`` is the list of year
        strings making up the grid; ``unigram_dict`` maps each word to its
        list of counts, positionally aligned with ``years``.
    """
    years = [str(year) for year in range(start_year, end_year)]
    unigram_dict = {}
    for word, counts_by_year in tqdm(ngrams.items()):
        # Explicit zeroes keep every series the same length, which the
        # rolling-window smoothing step needs.
        unigram_dict[word] = [counts_by_year.get(year, 0) for year in years]
    return unigram_dict, years

In [4]:
def smoothing(unigram_dict, years,smoothing = 5):
    """Smooth every unigram's yearly counts with a centred rolling mean.

    Args:
        unigram_dict: Mapping ``unigram -> list of counts``, one count per
            entry of ``years``.
        years: Year labels, positionally aligned with the count lists.
        smoothing: Width of the rolling window. Defaults to 5.

    Returns:
        A DataFrame with one column per unigram, indexed by the year labels.
        Rows containing NaN (the incomplete windows at both ends of the
        range) are dropped. Callers that need plain lists can use
        ``df.to_dict(orient='list')``.
    """
    counts = pd.DataFrame.from_dict(unigram_dict)
    # Each row becomes the mean of the window centred on it.
    averaged = counts.rolling(smoothing, center=True).mean()
    # Swap the positional index (0, 1, ...) for the matching year label.
    position_to_year = dict(enumerate(years))
    labelled = averaged.rename(position_to_year, axis='index')
    return labelled.dropna()

In [5]:
# Preview cell: run load -> normalize -> smooth on a single input file.
directory = './Ngrams/'
files = os.listdir(directory)
for file_path in files:
    if '_CLOSED_CLASSES.json' in file_path:
        ngrams = open_json(directory,file_path)
        unigram_dict, years = normalize(ngrams)
        # Drop each intermediate once it has been consumed.
        del ngrams
        df = smoothing(unigram_dict, years)
        del unigram_dict
        del years
        # Stop after the first matching file; `df` keeps its smoothed counts.
        break

100%|██████████| 33454/33454 [00:01<00:00, 27541.10it/s]


In [6]:
# Display the smoothed DataFrame built by the preview cell.
df

Unnamed: 0,rk_DET,responseof_ADP,supportby_ADP,teachesthat_ADP,sotrudnichestvo_DET,staphylomycin_ADP,systdme_DET,tasksthat_DET,slouchy_ADP,solitario_ADP,...,studer_ADP,tapu_ADP,silverband_CONJ,squamatus_ADP,tataa_DET,souverain_ADP,sonle_ADP,swastika_ADP,studiesand_CONJ,secondaryand_CONJ
1802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,5.6,3.2,0.4,2.2,3.8,0.0,0.0,1.6,18.2,6.4,...,1.6,0.6,1.4,4.6,0.0,1.4,0.4,32.2,1.0,3.6
2014,4.8,3.2,0.4,1.8,4.2,0.0,0.0,1.4,16.8,7.0,...,1.2,0.6,0.6,4.8,0.0,1.4,0.4,29.8,1.4,2.8
2015,3.8,3.0,0.4,10.6,5.8,0.8,0.0,1.0,22.0,6.8,...,1.2,0.6,0.6,5.2,0.0,1.8,0.4,29.6,0.8,1.8
2016,0.6,3.2,0.0,11.0,3.4,0.8,0.0,1.0,28.0,7.6,...,0.6,0.0,0.0,5.4,0.0,3.6,0.0,25.2,0.8,1.6


In [None]:
def anaylze_birth_and_death(ngrams,years_map):
    """Find the years in which each unigram appears and disappears.

    A "birth" is a step from a zero count to a non-zero count and is recorded
    as the first non-zero year. A "death" is a step from a non-zero count to
    zero and is recorded as the last non-zero year. A series that is already
    non-zero at its first position gets no birth there, and one that is still
    non-zero at its last position gets no death there.

    Args:
        ngrams: Mapping ``'<word>_<POS>' -> list of counts``.
        years_map: Mapping from a position in those lists to its year.

    Returns:
        Dict ``word -> {'POS': ..., 'birth_years': [...], 'death_years': [...]}``
        holding only the unigrams with at least one birth or death.
        NOTE: the result is keyed by the word alone, so when the same word
        occurs under several POS tags only the last one processed is kept.
    """
    ngrams_analyzed = {}

    for unigram, counts in tqdm(ngrams.items()):
        birth_years, death_years = [], []
        # Walk over consecutive (year, next year) pairs of counts.
        for i, (current, following) in enumerate(zip(counts, counts[1:])):
            if current == 0 and following != 0:
                birth_years.append(years_map[i + 1])
            elif current != 0 and following == 0:
                death_years.append(years_map[i])

        if not (birth_years or death_years):
            continue

        # Split on the LAST underscore only: the POS tag is the final suffix
        # and the word itself may contain underscores.
        word, separator, pos = unigram.rpartition('_')
        if not separator:
            # Untagged key: keep the whole string as the word, with no POS.
            word, pos = unigram, ''
        ngrams_analyzed[word] = {'POS': pos,
                                 'birth_years': birth_years,
                                 'death_years': death_years}
    return ngrams_analyzed

In [None]:
def save_json(dictionary,directory,file_path):
    """Write ``dictionary`` as JSON to ``directory + file_path + '.json'``.

    Nothing is written when ``dictionary`` is empty; a notice is printed
    instead. On success the output file name and the number of entries are
    printed.

    Args:
        dictionary: The object to serialise.
        directory: Directory prefix, concatenated directly with the file
            name (so it must end with a path separator).
        file_path: Output file name without the ``.json`` extension.
    """
    filename = file_path + '.json'
    entry_count = len(dictionary)

    # Guard clause: an empty result is reported, not saved.
    if entry_count == 0:
        print('unigram dict empty',filename)
        return

    with open(directory + filename, 'w') as handle:
        json.dump(dictionary, handle)
    print('SAVED: ',filename,entry_count)

In [None]:
def main(directory):
    """Run the whole pipeline over every ``*_CLOSED_CLASSES.json`` file.

    Each matching file in ``directory`` is loaded, normalized, smoothed and
    scanned for birth/death years. The per-file results are merged into one
    dict (a later file overwrites an earlier entry for the same word) and
    saved as ``CLOSED_CLASSES_ANALYZED.json`` inside ``directory``.

    Args:
        directory: Folder holding the input files. It must end with a path
            separator because the helpers build paths by concatenation.
    """
    final_dict = {}

    # Sorted so that the merge order (and therefore which entry wins for a
    # duplicated word) does not depend on the file system's listing order.
    for file_path in sorted(os.listdir(directory)):
        if '_CLOSED_CLASSES.json' not in file_path:
            continue
        ngrams = open_json(directory, file_path)
        unigram_dict, years = normalize(ngrams)
        del ngrams
        # smoothing() returns a single DataFrame (years as index, unigrams as
        # columns), so the position -> year map and the per-unigram lists
        # expected by the analysis step are derived from it here.
        df = smoothing(unigram_dict, years)
        del unigram_dict, years
        years_map = {i: int(year) for i, year in enumerate(df.index)}
        ngrams = df.to_dict(orient='list')
        del df
        final_dict.update(anaylze_birth_and_death(ngrams, years_map))
        del ngrams, years_map
    save_json(final_dict, directory, 'CLOSED_CLASSES_ANALYZED')

## Run Everything

In [None]:
%%time
main(directory = './Ngrams/')