# Defining and calculating lexicon size

### Purpose
The purpose of this investigation is to define the size of the lexicon over any arbitrary time interval. What is language? What is a lexicon?  
<br></br>
The *a priori* assumption is that a language exists in an open set of time(s), and that each of its members (words, morphemes, lexemes, grammatical structures, idioms, etc.) exists at one or more time steps *t* within that set. However, I posit that a lexicon can only be defined in a closed set of time(s).  
<br></br>
I define the lexicon to be the subset of language whose members (lexemes) are present at every time step *t* in T = {t : t<sub>i</sub> ≤ t ≤ t<sub>f</sub>}, where t<sub>i</sub> is the initial time (lower bound of the variable interval), t<sub>f</sub> is the final time (upper bound of the variable interval), and the size of the time step *t* is an arbitrary choice that can be defined by the data, as time is likely a continuous variable.  
<br></br>
Hence the size of the lexicon will be the cardinality of the lexicon set.

## Goals:
1. Load the \*-COMPLETE.json.gz files
2. Build a dictionary of the form  
        {unigram: {frequency: sum_usage/total_usage of all lexemes in time interval,
                   sum_usage: sum total of lexeme counts across time interval,
                   median_usage: median lexeme counts over time interval,
                   mean_usage: average lexeme counts per year over time interval,
                   max_usage: maximum usage of lexeme at single year in time interval,
                   min_usage: minimum usage of lexeme at single year in time interval}
            ...}  
3. Concatenate the dictionaries
4. Save as a single JSON

In [None]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import statistics
from collections import OrderedDict

In [None]:
def open_json(directory,file_path):
    """Load and return the JSON document stored at ``directory + file_path``.

    Args:
        directory: Folder prefix. It is concatenated directly with
            ``file_path``, so it must already end with a path separator.
        file_path: Name of the JSON file inside ``directory``.

    Returns:
        The decoded JSON content, as produced by ``json.load``.
    """
    # The ``with`` statement closes the file on exit, so no explicit
    # close() call is needed.
    with open(directory + file_path, 'r') as f:
        return json.load(f)

In [None]:
def save_json(dictionary,directory,file_name):
    """Write ``dictionary`` as JSON into ``directory`` unless it is empty.

    Args:
        dictionary: JSON-serialisable mapping to save.
        directory: Folder prefix. It is concatenated directly with the output
            name, so it must already end with a path separator.
        file_name: Output name. A trailing ``.gz`` (for example the name of a
            gzipped input file) is removed because the output is written
            uncompressed; any other name is used unchanged.
    """
    # Only strip a real '.gz' suffix. Dropping the last three characters
    # unconditionally mangled names that did not end in '.gz'.
    output = file_name[:-3] if file_name.endswith('.gz') else file_name
    if not dictionary:
        print('unigram dict empty',output)
        return
    with open(directory+output, 'w') as f_out:
        json.dump(dictionary, f_out)
    print('SAVED: ',output,len(dictionary))

In [None]:
def normalize(ngrams, t_start, t_end):
    """Expand each word's per-year counts into a dense list over the interval.

    Args:
        ngrams: Mapping of word -> mapping of year string -> count.
        t_start: First year of the interval (inclusive).
        t_end: Last year of the interval (inclusive).

    Returns:
        A tuple ``(unigram_dict, years)``: ``unigram_dict`` maps every word to
        a list with one count per year, in chronological order, and ``years``
        is the list of year strings those positions correspond to.
    """
    years = [str(year) for year in range(t_start, t_end + 1)]
    unigram_dict = {}
    for word in tqdm(ngrams.keys()):
        counts_by_year = ngrams[word]
        # A year with no entry is recorded as 0; the zeroes are needed by the
        # smoothing step.
        unigram_dict[word] = [counts_by_year.get(year, 0) for year in years]
    return unigram_dict, years

In [None]:
def smoothing(unigram_dict, years, smoothing = 5):
    """Smooth every word's series with a centred rolling mean.

    Args:
        unigram_dict: Mapping of word -> list of values, one per entry of
            ``years``.
        years: Row labels, in the same order as the values.
        smoothing: Width of the rolling window.

    Returns:
        A tuple ``(ngrams, years)``: ``ngrams`` maps each word to its list of
        smoothed values, and ``years`` lists the labels of the rows that
        remain once rows containing NaN have been dropped.
    """
    # One column per word, one row per position in the value lists.
    raw = pd.DataFrame.from_dict(unigram_dict)
    averaged = raw.rolling(smoothing, center=True).mean()
    # Replace the positional row labels 0..n-1 with the matching year labels.
    labelled = averaged.rename(dict(enumerate(years)), axis='index')
    # The centred window is incomplete at both ends of the series, which
    # leaves NaN rows there; keep only the fully averaged rows.
    complete = labelled.dropna()

    remaining_years = list(complete.index.values)
    return complete.to_dict(orient='list'), remaining_years

In [None]:
def return_lexicon(ngrams):
    """Select the unigrams that are used at every time step and summarise them.

    A unigram belongs to the lexicon only if none of its values is zero.

    Args:
        ngrams: Mapping of unigram -> list of usage values, one per time step.

    Returns:
        A tuple ``(lexicon, total_usage)``: ``lexicon`` maps each retained
        unigram to a dict with the keys ``sum_usage``, ``median_usage``,
        ``mean_usage``, ``max_usage`` and ``min_usage``; ``total_usage`` is
        the sum of ``sum_usage`` over all retained unigrams.
    """
    lexicon = dict()
    total_usage = 0
    for unigram in tqdm(ngrams.keys()):
        frequency_list = ngrams[unigram]
        # An empty series has no median/mean (the statistics module raises
        # StatisticsError), so it cannot contribute an entry.
        if len(frequency_list) == 0:
            continue
        # A zero means the unigram is absent at some time step.
        if 0 in frequency_list:
            continue

        sum_usage = sum(frequency_list)
        total_usage += sum_usage
        lexicon[unigram] = {'sum_usage':sum_usage,
                            'median_usage':statistics.median(frequency_list),
                            'mean_usage':statistics.mean(frequency_list),
                            'max_usage':max(frequency_list),
                            'min_usage':min(frequency_list)}

    return lexicon, total_usage

In [None]:
def add_frequency(lexicon,total_usage):
    """Store each lexeme's share of ``total_usage`` under the 'frequency' key.

    The entries of ``lexicon`` are modified in place and the same mapping is
    returned.

    Args:
        lexicon: Mapping of lexeme -> dict containing a 'sum_usage' value.
        total_usage: Denominator used for every lexeme.

    Returns:
        ``lexicon``, with a 'frequency' value added to every entry.
    """
    for lexeme in tqdm(lexicon.keys()):
        entry = lexicon[lexeme]
        entry['frequency'] = entry['sum_usage'] / total_usage
    return lexicon

In [None]:
def _open_gzipped_json(directory, file_name):
    """Load the gzip-compressed JSON file at ``directory + file_name``."""
    # Function-scope import: gzip is not among the notebook's top-level imports.
    import gzip

    # json.load accepts a binary file object and detects the encoding itself.
    with gzip.open(directory + file_name, 'rb') as f:
        return json.load(f)


def main(directory, t_start, t_end, t_step):
    """Build the lexicon from every ``*-COMPLETE.json.gz`` file and save it.

    Each matching file is normalized to dense per-year counts, smoothed with a
    rolling window, reduced to the unigrams used at every time step, and
    merged into a single dictionary that is then written out as JSON.

    Args:
        directory: Folder holding the input files. It is concatenated directly
            with file names, so it must end with a path separator.
        t_start: First year of the interval (inclusive).
        t_end: Last year of the interval (inclusive).
        t_step: Width of the rolling window passed to ``smoothing``.

    Returns:
        The merged lexicon dictionary. It is empty when no input file matched
        or when the smoothing window left no years.
    """
    final_dict = dict()
    years = None  # stays None when no input file matches
    for file_name in os.listdir(os.path.abspath(directory)):
        if '-COMPLETE.json.gz' not in file_name:
            continue
        print(file_name)
        ngrams = _open_gzipped_json(directory, file_name)
        print('Opened json')
        unigram_dict, years = normalize(ngrams, t_start, t_end)
        del ngrams  # release the raw data before the next large allocation
        print('Normalized')
        ngrams, years = smoothing(unigram_dict, years, t_step)
        del unigram_dict
        print('Smoothed')
        lexicon, total_usage = return_lexicon(ngrams)
        del ngrams
        print('Got lexicon')
        # NOTE(review): total_usage covers only the current file, so
        # 'frequency' is relative to that file's lexemes rather than to the
        # merged lexicon - confirm this is intended.
        lexicon = add_frequency(lexicon, total_usage)

        final_dict.update(lexicon)

    if not years:
        # Without a year range there is nothing to name the output after.
        print('No lexicon computed: no -COMPLETE.json.gz input found or no years left after smoothing')
        return final_dict

    # t_step is an int and must be converted before concatenation.
    # save_json removes a trailing '.gz' from the name, so the file is
    # written as '....json'.
    output_name = ('LEXICON_YEAR_' + str(years[0]) + '-' + str(years[-1])
                   + '_STEP_' + str(t_step) + '.json.gz')
    save_json(final_dict, directory, output_name)
    return final_dict

In [None]:
%%time
# Interval bounds (years) and smoothing window used to compute the lexicon.
t_start = 1995  # lower bound of the lexicon's time interval
t_end = 2018  # upper bound of the lexicon's time interval
t_step = 3  # handed to main() as its t_step argument (the smoothing window width)
directory = 'C:\\Users\\wzkar\\Documents\\Linguistic Research\\Ngrams\\unigram_data\\'

# The start year must be strictly before the end year.
if t_start>=t_end:
    raise ValueError('Re-Input start year and end year')

t_interval = t_end-t_start
print('Range of time for calculating lexicon size is', t_interval-(t_step-1),'years')
lexicon = main(directory, t_start, t_end, t_step)

In [None]:
# The size of the lexicon is the number of entries in the lexicon dictionary.
print('Size of Lexicon is ',len(lexicon))

In [None]:
def top_counts(dictionary,count_type,num_hits=25,head = True):
    """Return the first ``num_hits`` entries of ``dictionary`` ranked by one statistic.

    Args:
        dictionary: Mapping of key -> dict of statistics.
        count_type: Name of the statistic to rank by.
        num_hits: Number of entries to keep.
        head: If true, rank from the largest value down; otherwise from the
            smallest value up.

    Returns:
        An ``OrderedDict`` holding the selected entries in ranked order.
    """
    def rank_key(entry):
        _, stats = entry
        return stats[count_type]

    ranked = sorted(dictionary.items(), key=rank_key, reverse=head)
    return OrderedDict(ranked[:num_hits])

In [None]:
# The 25 lexemes (top_counts' default num_hits) with the largest 'frequency', largest first.
top_frequency = top_counts(lexicon,'frequency')

In [None]:
# Bare expression so the notebook shows the ranking as the cell's output.
top_frequency

In [None]:
# The 25 lexemes (top_counts' default num_hits) with the largest 'median_usage', largest first.
top_median = top_counts(lexicon,'median_usage')

In [None]:
# Bare expression so the notebook shows the ranking as the cell's output.
top_median

In [None]:
# The 25 lexemes (top_counts' default num_hits) with the largest 'max_usage', largest first.
top_max = top_counts(lexicon,'max_usage')

In [None]:
# Bare expression so the notebook shows the ranking as the cell's output.
top_max