# Goal:
The number of lexemes output by the Closed Classes Analysis notebook was far too great to manually investigate efficiently. Thus, we will make a few hypotheses which may assist in programmatically reducing the qualitative search space.
1. There is a higher likelihood of finding true instances of birth/death in the highest frequency words
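2. We will be able to group instances of birth and death by their relative numbers of years: the number of birth years minus the number of death years is 1 for a birthed word and 0 for a dead word (the code below also treats -1 as a dead word, for lexemes that already existed at the lower time bound).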
3. Words that have only one birth or death year are more likely to be true instances than those with >1 of each.

\*Note: These hypotheses do not affect our linguistic framework, nor do they allow us to conclude that the remaining data lacks bona fide instances of birth and death (i.e., instances not due to OCR mistakes, spelling diversity where the spellings refer to the same spoken word, etc.). They simply limit how much data we have to look at.

In [1]:
import csv
import json
from collections import OrderedDict
# Path prefix of the unigram data. open_json concatenates this prefix directly
# with the file name, so the trailing slash is required.
data_directory = '../Ngrams/unigram_data/'

In [2]:
def open_json(directory, file_path):
    """Load and return the JSON document stored at ``directory + file_path``.

    Args:
        directory: Path prefix, concatenated as-is with ``file_path`` (so it
            must end with a path separator unless it is empty).
        file_path: File name, including its extension.

    Returns:
        The deserialized JSON content.
    """
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # locale encoding. The ``with`` block closes the file on exit; no explicit
    # close() is needed.
    with open(directory + file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def save_json(dictionary,directory,file_path):
    """Dump ``dictionary`` as JSON to ``directory + file_path + '.json'``.

    An empty ``dictionary`` is not written; a notice is printed instead.
    After a successful write, the output file name and the number of entries
    are printed.
    """
    target_name = file_path + '.json'
    entry_count = len(dictionary)

    # Guard clause: nothing to save.
    if entry_count == 0:
        print('unigram dict empty', target_name, dictionary)
        return

    with open(directory + target_name, 'w') as sink:
        json.dump(dictionary, sink)
    print('SAVED: ', target_name, entry_count)

In [4]:
#This allows for easy filtering of the part of speech (POS) in Excel
def save_csv(dictionary, directory, title):
    """Write one CSV row per lexeme to ``directory + title + '.csv'``.

    Columns: lexeme, part_of_speech, max_usage, median_all, median_in_use,
    mean_all, mean_in_use. The five numeric statistics are truncated to
    integers with ``int()`` before being written.

    Args:
        dictionary: Mapping of lexeme -> dict holding the keys 'POS',
            'max_usage', 'median_all', 'median_in_use', 'mean_all' and
            'mean_in_use'.
        directory: Path prefix, concatenated as-is with the file name (pass
            '' to write into the current working directory).
        title: Output file name without the '.csv' extension.

    Raises:
        KeyError: If an entry lacks one of the expected keys.
    """
    output = title + '.csv'
    numeric_fields = ('max_usage', 'median_all', 'median_in_use',
                      'mean_all', 'mean_in_use')

    # The file is opened under ``directory`` (previously the argument was
    # ignored). newline='' hands line-ending control to the csv writer, which
    # also quotes any lexeme containing a comma, quote or newline.
    with open(directory + output, 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['lexeme', 'part_of_speech', *numeric_fields])
        for lexeme, entries in dictionary.items():
            writer.writerow([
                lexeme,
                entries['POS'],
                *(int(entries[field]) for field in numeric_fields),
            ])
    print('SAVED: ', output, len(dictionary))

In [5]:
# Mapping of lexeme -> statistics loaded from the unigram data directory.
ngrams = open_json(directory=data_directory, file_path='CLOSED_CLASSES_SORTABLE.json')

In [6]:
# len() of a dict already counts its keys.
print('Number of lexemes in closed lexical classes with birth and death: ', len(ngrams))

Number of lexemes in closed lexical classes with birth and death:  440403


### Hypothesis #1: There is a higher likelihood of finding true instances of birth/death in highest frequency words
We get the top counts for each metric and combine them into a single dictionary without duplicate lexemes, then save the result as .json and .csv files.

In [7]:
def top_counts(dictionary,count_type,num_hits=500,head = True):
    """Return the first ``num_hits`` entries of ``dictionary`` ranked by ``count_type``.

    Args:
        dictionary: Mapping of lexeme -> dict of statistics.
        count_type: Key of the statistic used for ranking.
        num_hits: Maximum number of entries to keep.
        head: When True, rank from largest to smallest value; when False,
            from smallest to largest.

    Returns:
        An OrderedDict of the selected (lexeme, statistics) pairs in rank
        order. Entries with equal values keep their original relative order.
    """
    def metric(item):
        _, stats = item
        return stats[count_type]

    ranked = list(dictionary.items())
    ranked.sort(key=metric, reverse=head)
    return OrderedDict(ranked[:num_hits])

In [8]:
# Ranking of all lexemes by 'max_usage' (top_counts defaults apply).
top_max_usage = top_counts(ngrams, count_type='max_usage')

In [9]:
# Ranking of all lexemes by 'median_all' (top_counts defaults apply).
top_median_all = top_counts(ngrams, count_type='median_all')

In [10]:
# Ranking of all lexemes by 'median_in_use' (top_counts defaults apply).
top_median_in_use = top_counts(ngrams, count_type='median_in_use')

In [11]:
# Ranking of all lexemes by 'mean_all' (top_counts defaults apply).
top_mean_all = top_counts(ngrams, count_type='mean_all')

In [12]:
# Ranking of all lexemes by 'mean_in_use' (top_counts defaults apply).
top_mean_in_use = top_counts(ngrams, count_type='mean_in_use')

In [13]:
# Union of the five rankings: a lexeme present in several rankings is kept
# once, at the position where it was first seen.
top_all = dict(top_max_usage)
top_all.update(top_median_all)
top_all.update(top_median_in_use)
top_all.update(top_mean_all)
top_all.update(top_mean_in_use)

In [14]:
# Number of distinct lexemes in the merged rankings.
len(top_all)

1185

In [15]:
# Preview the first 50 (lexeme, statistics) pairs.
list(top_all.items())[:50]

[('bbe',
  {'POS': 'ADP',
   'max_usage': 133568.4,
   'median_all': 4.8,
   'median_in_use': 5.6,
   'mean_all': 1470.3064814814813,
   'mean_in_use': 1595.910552763819,
   'birth_years': [1814, 1820, 1846],
   'death_years': [1818, 1841]}),
 ('nhe',
  {'POS': 'ADP',
   'max_usage': 75803.4,
   'median_all': 4.4,
   'median_in_use': 5.300000000000001,
   'mean_all': 971.6851851851851,
   'mean_in_use': 1093.1458333333333,
   'birth_years': [1803, 1814, 1833, 1840, 1851],
   'death_years': [1807, 1818, 1837, 1849]}),
 ('proton',
  {'POS': 'ADP',
   'max_usage': 32690.0,
   'median_all': 25.3,
   'median_in_use': 54.2,
   'mean_all': 6065.498148148148,
   'mean_in_use': 7006.136898395722,
   'birth_years': [1823, 1833, 1844],
   'death_years': [1828, 1839]}),
 ('transgender',
  {'POS': 'DET',
   'max_usage': 22530.8,
   'median_all': 0.0,
   'median_in_use': 31.200000000000003,
   'mean_all': 796.3740740740741,
   'mean_in_use': 2774.4645161290323,
   'birth_years': [1947, 1958, 1971],


In [16]:
# Persist the merged rankings in both formats; the empty directory prefix is
# passed through unchanged.
save_json(top_all, directory='', file_path='ALL CLOSED CLASSES TOP 500s')
save_csv(top_all, directory='', title='ALL CLOSED CLASSES TOP 500s')

SAVED:  ALL CLOSED CLASSES TOP 500s.json 1185
SAVED:  ALL CLOSED CLASSES TOP 500s.csv 1185


### Hypothesis #2: Group Instances of birth and death

In [17]:
# Split the lexemes into "birth" and "death" groups according to
# (number of birth years) - (number of death years).
birth_lexemes, death_lexemes = {}, {}

for lexeme, entries in ngrams.items():
    year_delta = len(entries['birth_years']) - len(entries['death_years'])

    if year_delta == 1:
        birth_lexemes[lexeme] = entries
        continue

    # -1: the lexeme was already in existence at the lower time bound.
    if year_delta in (0, -1):
        death_lexemes[lexeme] = entries
        continue

    # Any other difference is unexpected; report it for inspection.
    print(year_delta)
    print()

Top Birth & Save

In [18]:
# Rank the birth lexemes by each metric, then merge the rankings so that each
# lexeme appears once.
top_birth_max_usage = top_counts(birth_lexemes, count_type='max_usage')
top_birth_median_all = top_counts(birth_lexemes, count_type='median_all')
top_birth_median_in_use = top_counts(birth_lexemes, count_type='median_in_use')
top_birth_mean_all = top_counts(birth_lexemes, count_type='mean_all')
top_birth_mean_in_use = top_counts(birth_lexemes, count_type='mean_in_use')

top_birth_all = dict(top_birth_max_usage)
top_birth_all.update(top_birth_median_all)
top_birth_all.update(top_birth_median_in_use)
top_birth_all.update(top_birth_mean_all)
top_birth_all.update(top_birth_mean_in_use)
len(top_birth_all)

1189

In [19]:
# Preview the first 50 (lexeme, statistics) pairs.
list(top_birth_all.items())[:50]

[('bbe',
  {'POS': 'ADP',
   'max_usage': 133568.4,
   'median_all': 4.8,
   'median_in_use': 5.6,
   'mean_all': 1470.3064814814813,
   'mean_in_use': 1595.910552763819,
   'birth_years': [1814, 1820, 1846],
   'death_years': [1818, 1841]}),
 ('nhe',
  {'POS': 'ADP',
   'max_usage': 75803.4,
   'median_all': 4.4,
   'median_in_use': 5.300000000000001,
   'mean_all': 971.6851851851851,
   'mean_in_use': 1093.1458333333333,
   'birth_years': [1803, 1814, 1833, 1840, 1851],
   'death_years': [1807, 1818, 1837, 1849]}),
 ('proton',
  {'POS': 'ADP',
   'max_usage': 32690.0,
   'median_all': 25.3,
   'median_in_use': 54.2,
   'mean_all': 6065.498148148148,
   'mean_in_use': 7006.136898395722,
   'birth_years': [1823, 1833, 1844],
   'death_years': [1828, 1839]}),
 ('transgender',
  {'POS': 'DET',
   'max_usage': 22530.8,
   'median_all': 0.0,
   'median_in_use': 31.200000000000003,
   'mean_all': 796.3740740740741,
   'mean_in_use': 2774.4645161290323,
   'birth_years': [1947, 1958, 1971],


In [20]:
# Persist the merged birth rankings in both formats.
save_json(top_birth_all, directory='', file_path='Birth Lexemes')
save_csv(top_birth_all, directory='', title='Birth Lexemes')

SAVED:  Birth Lexemes.json 1189
SAVED:  Birth Lexemes.csv 1189


Top Death & Save

In [21]:
# Rank the death lexemes by each metric, then merge the rankings so that each
# lexeme appears once.
top_death_max_usage = top_counts(death_lexemes, count_type='max_usage')
top_death_median_all = top_counts(death_lexemes, count_type='median_all')
top_death_median_in_use = top_counts(death_lexemes, count_type='median_in_use')
top_death_mean_all = top_counts(death_lexemes, count_type='mean_all')
top_death_mean_in_use = top_counts(death_lexemes, count_type='mean_in_use')

top_death_all = dict(top_death_max_usage)
top_death_all.update(top_death_median_all)
top_death_all.update(top_death_median_in_use)
top_death_all.update(top_death_mean_all)
top_death_all.update(top_death_mean_in_use)
len(top_death_all)

1104

In [22]:
# Preview the first 50 (lexeme, statistics) pairs.
list(top_death_all.items())[:50]

[('anand',
  {'POS': 'CONJ',
   'max_usage': 8461.8,
   'median_all': 73.80000000000001,
   'median_in_use': 76.5,
   'mean_all': 1434.974074074074,
   'mean_in_use': 1448.385046728972,
   'birth_years': [1809],
   'death_years': [1806]}),
 ('tendon',
  {'POS': 'ADP',
   'max_usage': 8344.4,
   'median_all': 121.8,
   'median_in_use': 131.5,
   'mean_all': 890.4092592592592,
   'mean_in_use': 915.8495238095238,
   'birth_years': [1816, 1824],
   'death_years': [1812, 1820]}),
 ('yerself',
  {'POS': 'PRON',
   'max_usage': 7420.4,
   'median_all': 336.79999999999995,
   'median_in_use': 351.79999999999995,
   'mean_all': 749.3518518518518,
   'mean_in_use': 785.7281553398059,
   'birth_years': [1814],
   'death_years': [1803]}),
 ('wurde',
  {'POS': 'ADP',
   'max_usage': 5163.6,
   'median_all': 141.0,
   'median_in_use': 142.9,
   'mean_all': 422.06666666666666,
   'mean_in_use': 426.011214953271,
   'birth_years': [1813],
   'death_years': [1810]}),
 ('foer',
  {'POS': 'ADP',
   'max

In [23]:
# Persist the merged death rankings in both formats.
save_json(top_death_all, directory='', file_path='Death Lexemes')
save_csv(top_death_all, directory='', title='Death Lexemes')

SAVED:  Death Lexemes.json 1104
SAVED:  Death Lexemes.csv 1104


### Hypothesis #3: Single Instances are more reliable than those that oscillate in and out

Birth

In [24]:
# Keep only the birth lexemes that have exactly one recorded birth year.
single_birth_lexemes = {
    lexeme: entries
    for lexeme, entries in birth_lexemes.items()
    if len(entries['birth_years']) == 1
}

In [25]:
# Number of lexemes with a single birth year.
len(single_birth_lexemes)

15469

In [26]:
# Rank the single-birth lexemes by each metric, then merge the rankings so
# that each lexeme appears once.
top_single_birth_max_usage = top_counts(single_birth_lexemes, count_type='max_usage')
top_single_birth_median_all = top_counts(single_birth_lexemes, count_type='median_all')
top_single_birth_median_in_use = top_counts(single_birth_lexemes, count_type='median_in_use')
top_single_birth_mean_all = top_counts(single_birth_lexemes, count_type='mean_all')
top_single_birth_mean_in_use = top_counts(single_birth_lexemes, count_type='mean_in_use')

top_single_birth_all = dict(top_single_birth_max_usage)
top_single_birth_all.update(top_single_birth_median_all)
top_single_birth_all.update(top_single_birth_median_in_use)
top_single_birth_all.update(top_single_birth_mean_all)
top_single_birth_all.update(top_single_birth_mean_in_use)
len(top_single_birth_all)

1095

In [27]:
# Preview the first 50 (lexeme, statistics) pairs.
list(top_single_birth_all.items())[:50]

[('thethethethe',
  {'POS': 'DET',
   'max_usage': 18730.8,
   'median_all': 0.0,
   'median_in_use': 3.4,
   'mean_all': 425.5925925925926,
   'mean_in_use': 2484.5405405405404,
   'birth_years': [1981],
   'death_years': []}),
 ('annja',
  {'POS': 'DET',
   'max_usage': 8853.0,
   'median_all': 0.0,
   'median_in_use': 833.8000000000001,
   'mean_all': 255.88425925925927,
   'mean_in_use': 2302.9583333333335,
   'birth_years': [1994],
   'death_years': []}),
 ('andand',
  {'POS': 'CONJ',
   'max_usage': 8266.8,
   'median_all': 12.6,
   'median_in_use': 13.0,
   'mean_all': 317.7916666666667,
   'mean_in_use': 326.87142857142857,
   'birth_years': [1808],
   'death_years': []}),
 ('meself',
  {'POS': 'PRON',
   'max_usage': 8122.0,
   'median_all': 585.8,
   'median_in_use': 590.0,
   'mean_all': 910.325,
   'mean_in_use': 923.1464788732394,
   'birth_years': [1805],
   'death_years': []}),
 ('pyr',
  {'POS': 'ADP',
   'max_usage': 7774.2,
   'median_all': 9.8,
   'median_in_use': 9.

In [28]:
# Persist the merged single-birth rankings in both formats.
save_json(top_single_birth_all, directory='', file_path='Single Birth Lexemes')
save_csv(top_single_birth_all, directory='', title='Single Birth Lexemes')

SAVED:  Single Birth Lexemes.json 1095
SAVED:  Single Birth Lexemes.csv 1095


Death

In [29]:
# Keep only the death lexemes that have exactly one recorded death year.
single_death_lexemes = {
    lexeme: entries
    for lexeme, entries in death_lexemes.items()
    if len(entries['death_years']) == 1
}

In [30]:
# Number of lexemes with a single death year.
len(single_death_lexemes)

13682

In [31]:
# Rank the single-death lexemes by each metric, then merge the rankings so
# that each lexeme appears once.
top_single_death_max_usage = top_counts(single_death_lexemes, count_type='max_usage')
top_single_death_median_all = top_counts(single_death_lexemes, count_type='median_all')
top_single_death_median_in_use = top_counts(single_death_lexemes, count_type='median_in_use')
top_single_death_mean_all = top_counts(single_death_lexemes, count_type='mean_all')
top_single_death_mean_in_use = top_counts(single_death_lexemes, count_type='mean_in_use')

top_single_death_all = dict(top_single_death_max_usage)
top_single_death_all.update(top_single_death_median_all)
top_single_death_all.update(top_single_death_median_in_use)
top_single_death_all.update(top_single_death_mean_all)
top_single_death_all.update(top_single_death_mean_in_use)
len(top_single_death_all)

988

In [32]:
# Preview the first 50 (lexeme, statistics) pairs.
list(top_single_death_all.items())[:50]

[('anand',
  {'POS': 'CONJ',
   'max_usage': 8461.8,
   'median_all': 73.80000000000001,
   'median_in_use': 76.5,
   'mean_all': 1434.974074074074,
   'mean_in_use': 1448.385046728972,
   'birth_years': [1809],
   'death_years': [1806]}),
 ('yerself',
  {'POS': 'PRON',
   'max_usage': 7420.4,
   'median_all': 336.79999999999995,
   'median_in_use': 351.79999999999995,
   'mean_all': 749.3518518518518,
   'mean_in_use': 785.7281553398059,
   'birth_years': [1814],
   'death_years': [1803]}),
 ('wurde',
  {'POS': 'ADP',
   'max_usage': 5163.6,
   'median_all': 141.0,
   'median_in_use': 142.9,
   'mean_all': 422.06666666666666,
   'mean_in_use': 426.011214953271,
   'birth_years': [1813],
   'death_years': [1810]}),
 ('assignor',
  {'POS': 'ADP',
   'max_usage': 4165.2,
   'median_all': 947.4,
   'median_in_use': 957.8,
   'mean_all': 1173.9898148148147,
   'mean_in_use': 1190.5248826291079,
   'birth_years': [1808],
   'death_years': [1804]}),
 ('alfa',
  {'POS': 'DET',
   'max_usage':

In [33]:
# Persist the merged single-death rankings in both formats.
save_json(top_single_death_all, directory='', file_path='Single Death Lexemes')
save_csv(top_single_death_all, directory='', title='Single Death Lexemes')

SAVED:  Single Death Lexemes.json 988
SAVED:  Single Death Lexemes.csv 988
