# Defining and calculating lexicon size

### Purpose
The purpose of this investigation is to define the size of the lexicon over any arbitrary time interval. What is language? What is a lexicon?  
<br></br>
The *a priori* assumption is that a language exists in an open set of time(s) with members (words, morphemes, lexemes, grammatical structures, idioms, etc.) that exist at at least one time step *t* within that set. However, I posit that a lexicon can only be defined in a closed set of time(s).  
<br></br>
I define the lexicon to be the subset of language whose members (lexemes) are attested (have non-zero usage) at every time step *t* in T = {t for t in range(t<sub>i</sub>, t<sub>f</sub> + 1)}; where t<sub>i</sub> is the initial time (inclusive lower bound of the variable interval), t<sub>f</sub> is the final time (inclusive upper bound of the variable interval), and the size of the time step *t* is an arbitrary distinction that can be defined by the data, as time is likely a continuous variable.  
<br></br>
Hence the size of the lexicon will be the cardinality of the lexicon set.

## Goals:
1. Load the \*\-COMPLETE.json files
2. Build a dictionary of the form  
        {unigram: {frequency: sum_usage/total_usage of all lexemes in time interval,
                   sum_usage: sum total of lexeme counts across time interval,
                   median_usage: median lexeme counts over time interval,
                   mean_usage: average lexeme counts per year over time interval,
                   max_usage: maximum usage of lexeme at single year in time interval,
                   min_usage: minimum usage of lexeme at single year in time interval}
            ...}  
3. Concatenate the dictionaries
4. Save as a single JSON

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import statistics
from collections import OrderedDict

In [2]:
def open_json(directory,file_path):
    """Load one JSON file and return its parsed content.

    Parameters
    ----------
    directory : str
        Folder that holds the file. A trailing path separator is optional.
    file_path : str
        Name of the file inside ``directory``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.
    """
    # os.path.join only inserts a separator when one is missing, so both
    # '../data/' and '../data' resolve to the same file.
    full_path = os.path.join(directory, file_path)
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    # The with-block closes the handle on exit, so no explicit close() is needed.
    with open(full_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def save_json(dictionary,directory,file_name):
    """Write ``dictionary`` to ``directory + file_name + '.json'``.

    Nothing is written when ``dictionary`` is empty; a notice is printed
    instead. On success the output file name and the number of top-level
    entries are printed. Always returns ``None``.
    """
    target = file_name+'.json'

    # Guard clause: an empty dictionary never touches the disk.
    if len(dictionary) == 0:
        print('unigram dict empty',target)
        return

    with open(directory+target, 'w') as sink:
        json.dump(dictionary, sink)
    print('SAVED: ',target,len(dictionary))

In [4]:
def normalize(ngrams, t_start, t_end):
    """Expand every word's sparse per-year counts into a dense list.

    ``ngrams`` maps each word to a mapping of year string -> count. The
    result maps each word to a list holding one count per year from
    ``t_start`` to ``t_end`` (both inclusive), in chronological order.

    Returns the dense dictionary together with the list of year strings
    that labels the positions of every count list.
    """
    years = [str(year) for year in range(t_start, t_end+1)]
    unigram_dict = {}
    for word, counts_by_year in tqdm(ngrams.items()):
        # Missing years become explicit zeroes; they are needed for smoothing.
        unigram_dict[word] = [counts_by_year.get(year, 0) for year in years]
    return unigram_dict, years

In [5]:
def smoothing(unigram_dict, years, smoothing = 5):
    """Replace yearly counts with a centred rolling mean.

    ``unigram_dict`` maps each unigram to a list of counts aligned with
    ``years``. Each unigram becomes one DataFrame column, the counts are
    averaged over a centred window of ``smoothing`` rows, and rows that do
    not have a complete window (NaN after averaging) are dropped.

    Returns the smoothed dictionary (unigram -> list of means) and the
    list of year labels that survived the trimming.
    """
    # One column per unigram, one row per position in `years`.
    raw_counts = pd.DataFrame.from_dict(unigram_dict)

    # Centred window: each row is replaced by the mean of its neighbourhood.
    window_means = raw_counts.rolling(smoothing, center=True).mean()

    # Swap the positional index (0, 1, ...) for the matching year labels.
    position_to_year = dict(enumerate(years))
    labelled = window_means.rename(position_to_year, axis = 'index')

    # Rows at the edges lack a full window and hold NaN; discard them.
    complete_rows = labelled.dropna()

    surviving_years = list(complete_rows.index.values)
    smoothed = complete_rows.to_dict(orient = 'list')
    return smoothed, surviving_years

In [6]:
def return_sublexicon(unigram_dict):
    """Keep the unigrams that are used at every time step and summarise them.

    ``unigram_dict`` maps each unigram to its list of per-step counts. A
    unigram is kept only if that list contains no zero. Each kept unigram
    is mapped to its sum, median, mean, maximum and minimum usage.

    Returns the summary dictionary and the combined ``sum_usage`` of all
    kept unigrams.
    """
    sublexicon = {}
    usage = 0
    for unigram, counts in tqdm(unigram_dict.items()):
        # A single zero means the unigram is absent at some time step.
        if 0 in counts:
            continue

        summary = {'sum_usage':sum(counts),
                   'median_usage':statistics.median(counts),
                   'mean_usage':statistics.mean(counts),
                   'max_usage':max(counts),
                   'min_usage':min(counts)}
        usage += summary['sum_usage']
        sublexicon[unigram] = summary

    return sublexicon, usage

In [7]:
def add_frequency(lexicon,total_usage):
    """Add a ``'frequency'`` entry to every lexeme's statistics.

    The frequency is the lexeme's ``'sum_usage'`` divided by
    ``total_usage``. The statistics dictionaries are updated in place and
    the same ``lexicon`` object is returned.
    """
    for stats in tqdm(lexicon.values()):
        stats['frequency'] = stats['sum_usage']/total_usage
    return lexicon

In [8]:
def main(directory, t_start, t_end, t_step):
    """Build the lexicon for a time interval from every ``*-COMPLETE.json`` file.

    Parameters
    ----------
    directory : str
        Folder that is scanned for input files and that receives the result.
    t_start, t_end : int
        First and last year of the interval (both inclusive).
    t_step : int
        Width of the rolling-average window; smoothing is applied only
        when it is greater than 1.

    Returns
    -------
    dict
        Lexeme -> usage statistics, including the relative ``'frequency'``.
        Empty when the directory holds no matching file.
    """
    lexicon = dict()
    total_usage = 0
    # Fallback labels so the output name can still be built when no input
    # file matches (previously `years` was unbound in that case).
    years = [str(year) for year in range(t_start, t_end+1)]

    # os.listdir returns entries in arbitrary order; sorting makes the merge
    # order, and therefore the key order of the result, reproducible.
    for file_name in sorted(os.listdir(directory)):
        # Match on the suffix so names that merely contain the marker
        # (e.g. backup copies) are skipped.
        if not file_name.endswith('-COMPLETE.json'):
            continue

        ngrams = open_json(directory,file_name)
        print('Opened ',file_name)
        unigram_dict, years = normalize(ngrams, t_start, t_end)
        del ngrams
        print('Normalized')
        if t_step>1:
            unigram_dict, years = smoothing(unigram_dict, years, t_step)
            print('Smoothed')
        #We only get a sublexicon because each file is only a single piece of the full lexicon.
        sublexicon, usage = return_sublexicon(unigram_dict)
        del unigram_dict
        print('Got sublexicon')
        lexicon.update(sublexicon)
        total_usage+=usage
        #Drop the reference now so the memory can be reclaimed before the next file is loaded.
        del sublexicon

    print('Adding frequency')
    lexicon = add_frequency(lexicon, total_usage)

    # Label the output with the years that are actually covered.
    first_year = years[0] if years else t_start
    last_year = years[-1] if years else t_end
    save_json(lexicon,directory,str('LEXICON_'+str(first_year)+'-'+str(last_year)+'_STEP'+str(t_step)))
    print('Frequency added. Lexicon Saved.')

    return lexicon

In [9]:
%%time
t_start = 1995 #Input the year that you want to consider as the lower bound of the lexicon
t_end = 2018 #Input the year that you want to consider as the upper bound of the lexicon
#Width of the smoothing window in years; 1 means no smoothing.
#This is usually an odd number (3 years would average the frequency of that year with the frequencies of the year before and after it)
#Smoothing is a more advanced (but very slow) way to implement the time step.
t_step = 1
directory = '../Ngrams/unigram_data/'

#Validate the types first, so the comparisons below only ever see integers.
#All three inputs are checked (bool is rejected even though it subclasses int).
if not all(isinstance(value, int) and not isinstance(value, bool) for value in (t_start, t_end, t_step)):
    raise ValueError('Re-Input all values as integers')
elif t_start>=t_end:
    raise ValueError('Re-Input start year and end year')
elif t_step<1 or t_step>(t_end-t_start+1):
    #A window wider than the interval would leave no year after smoothing.
    raise ValueError('Re-Input time step: it must be between 1 and the number of years in the interval')
else:
    t_interval = t_end-t_start
    print("Creating Lexicon from year",t_start,'-',t_end,'with step',t_step)
    print('Range of time for calculating lexicon size is', t_interval-(t_step-1),'years\n')
    lexicon = main(directory, t_start, t_end, t_step)

Range of time for calculating lexicon size is 23 years
1-00006-of-00024-COMPLETE.json
Opened json


100%|██████████████████████████████████████████████████████████████████████| 582617/582617 [00:03<00:00, 164629.94it/s]


Normalized


100%|██████████████████████████████████████████████████████████████████████| 582617/582617 [00:01<00:00, 347304.67it/s]


Got sublexicon
1-00007-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1162149/1162149 [00:07<00:00, 160912.98it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1162149/1162149 [00:03<00:00, 333192.92it/s]


Got sublexicon
1-00008-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1227039/1227039 [00:07<00:00, 154655.23it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1227039/1227039 [00:04<00:00, 299063.54it/s]


Got sublexicon
1-00009-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1081183/1081183 [00:07<00:00, 150462.79it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1081183/1081183 [00:03<00:00, 298851.24it/s]


Got sublexicon
1-00010-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1192753/1192753 [00:07<00:00, 150509.90it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1192753/1192753 [00:03<00:00, 299898.51it/s]


Got sublexicon
1-00011-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1205150/1205150 [00:08<00:00, 150101.07it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1205150/1205150 [00:04<00:00, 282057.67it/s]


Got sublexicon
1-00012-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1082701/1082701 [00:07<00:00, 138552.29it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1082701/1082701 [00:03<00:00, 306059.87it/s]


Got sublexicon
1-00013-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1214351/1214351 [00:08<00:00, 146869.01it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1214351/1214351 [00:03<00:00, 314719.72it/s]


Got sublexicon
1-00014-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1351139/1351139 [00:09<00:00, 136493.47it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1351139/1351139 [00:04<00:00, 296757.95it/s]


Got sublexicon
1-00015-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1058064/1058064 [00:07<00:00, 143326.36it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1058064/1058064 [00:03<00:00, 307709.37it/s]


Got sublexicon
1-00016-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1455003/1455003 [00:09<00:00, 149027.91it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1455003/1455003 [00:04<00:00, 331096.96it/s]


Got sublexicon
1-00017-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1378241/1378241 [00:09<00:00, 140730.06it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1378241/1378241 [00:04<00:00, 340121.88it/s]


Got sublexicon
1-00018-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1329481/1329481 [00:09<00:00, 143239.26it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1329481/1329481 [00:03<00:00, 337689.42it/s]


Got sublexicon
1-00019-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1348948/1348948 [00:09<00:00, 148818.46it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1348948/1348948 [00:03<00:00, 352389.38it/s]


Got sublexicon
1-00020-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1488851/1488851 [00:10<00:00, 140193.64it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1488851/1488851 [00:04<00:00, 367621.80it/s]


Got sublexicon
1-00021-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1258884/1258884 [00:08<00:00, 142235.93it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1258884/1258884 [00:03<00:00, 329954.20it/s]


Got sublexicon
1-00022-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1438778/1438778 [00:10<00:00, 141545.45it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1438778/1438778 [00:05<00:00, 286679.07it/s]


Got sublexicon
1-00023-of-00024-COMPLETE.json
Opened json


100%|████████████████████████████████████████████████████████████████████| 1434712/1434712 [00:11<00:00, 127123.84it/s]


Normalized


100%|████████████████████████████████████████████████████████████████████| 1434712/1434712 [00:04<00:00, 292755.14it/s]


Got sublexicon
Adding frequency


100%|████████████████████████████████████████████████████████████████████| 1926813/1926813 [00:01<00:00, 987869.09it/s]


SAVED:  LEXICON_YEAR_1995-2018_STE 1926813
Frequency added. Lexicon Saved.
Wall time: 9min 27s


In [10]:
# The lexicon size is the cardinality of the lexicon set, i.e. its number of keys.
print('Size of Lexicon is ',len(lexicon))

Size of Lexicon is  1926813


In [11]:
def top_counts(dictionary,count_type,num_hits=25,head = True):
    """Return the first ``num_hits`` entries ranked by one statistic.

    ``dictionary`` maps a key to a dictionary of statistics and
    ``count_type`` names the statistic to rank by. With ``head`` true the
    ranking is descending (largest first); otherwise it is ascending.
    The result is an ``OrderedDict`` in ranking order.
    """
    def ranking_value(entry):
        # entry is a (key, statistics) pair; rank by the chosen statistic.
        _, stats = entry
        return stats[count_type]

    ranked = sorted(dictionary.items(), key=ranking_value, reverse=head)
    selected = ranked[:num_hits]
    return OrderedDict(selected)

In [12]:
# Rank the lexicon by relative frequency, highest first (top_counts defaults to 25 entries).
top_frequency = top_counts(lexicon,'frequency')

In [13]:
# Bare expression: the notebook shows the frequency ranking as the cell output.
top_frequency

OrderedDict([('the_DET',
              {'sum_usage': 28164081494,
               'median_usage': 1166314842.0,
               'mean_usage': 1173503395.5833333,
               'max_usage': 1525772265,
               'min_usage': 970419604,
               'frequency': 0.05560282405468271}),
             ('of_ADP',
              {'sum_usage': 17752494369,
               'median_usage': 721687512.0,
               'mean_usage': 739687265.375,
               'max_usage': 929870130,
               'min_usage': 652445551,
               'frequency': 0.035047790255170914}),
             ('and_CONJ',
              {'sum_usage': 14702193028,
               'median_usage': 617442112.0,
               'mean_usage': 612591376.1666666,
               'max_usage': 807922209,
               'min_usage': 491426304,
               'frequency': 0.029025745161546324}),
             ('to_PRT',
              {'sum_usage': 12608939440,
               'median_usage': 538991089.0,
               'mean_usage': 

In [14]:
# Rank the lexicon by median usage, highest first (top_counts defaults to 25 entries).
top_median = top_counts(lexicon,'median_usage')

In [15]:
# Bare expression: the notebook shows the median-usage ranking as the cell output.
top_median

OrderedDict([('the_DET',
              {'sum_usage': 28164081494,
               'median_usage': 1166314842.0,
               'mean_usage': 1173503395.5833333,
               'max_usage': 1525772265,
               'min_usage': 970419604,
               'frequency': 0.05560282405468271}),
             ('of_ADP',
              {'sum_usage': 17752494369,
               'median_usage': 721687512.0,
               'mean_usage': 739687265.375,
               'max_usage': 929870130,
               'min_usage': 652445551,
               'frequency': 0.035047790255170914}),
             ('and_CONJ',
              {'sum_usage': 14702193028,
               'median_usage': 617442112.0,
               'mean_usage': 612591376.1666666,
               'max_usage': 807922209,
               'min_usage': 491426304,
               'frequency': 0.029025745161546324}),
             ('to_PRT',
              {'sum_usage': 12608939440,
               'median_usage': 538991089.0,
               'mean_usage': 

In [16]:
# Rank the lexicon by maximum usage, highest first (top_counts defaults to 25 entries).
top_max = top_counts(lexicon,'max_usage')

In [17]:
# Bare expression: the notebook shows the maximum-usage ranking as the cell output.
top_max

OrderedDict([('the_DET',
              {'sum_usage': 28164081494,
               'median_usage': 1166314842.0,
               'mean_usage': 1173503395.5833333,
               'max_usage': 1525772265,
               'min_usage': 970419604,
               'frequency': 0.05560282405468271}),
             ('of_ADP',
              {'sum_usage': 17752494369,
               'median_usage': 721687512.0,
               'mean_usage': 739687265.375,
               'max_usage': 929870130,
               'min_usage': 652445551,
               'frequency': 0.035047790255170914}),
             ('and_CONJ',
              {'sum_usage': 14702193028,
               'median_usage': 617442112.0,
               'mean_usage': 612591376.1666666,
               'max_usage': 807922209,
               'min_usage': 491426304,
               'frequency': 0.029025745161546324}),
             ('to_PRT',
              {'sum_usage': 12608939440,
               'median_usage': 538991089.0,
               'mean_usage': 