# Nation's word lists

Nov 1, 2018

In [0]:
import re
from collections import Counter
from zipfile import ZipFile

import numpy as np

import matplotlib.pylab as plt
%config InlineBackend.figure_format = 'retina'

In [2]:
import nltk
# Fetch NLTK's 'book' data collection quietly; the Gutenberg texts read in a
# later cell presumably ship with this collection (TODO confirm).
nltk.download('book', quiet=True)

True

In [0]:
def normalize(words):
  """Return the purely alphabetic items of `words`, lowercased.

  Items for which `str.isalpha()` is false (punctuation, numbers, words
  containing apostrophes or hyphens, empty strings) are dropped. The
  relative order of the surviving items is preserved.
  """
  kept = []
  for token in words:
    if token.isalpha():
      kept.append(token.lower())
  return kept

In [0]:
# Load three Gutenberg texts, each as a list of lowercased alphabetic tokens.
moby, paradise, stories = (
  normalize(nltk.corpus.gutenberg.words(fileid))
  for fileid in ('melville-moby_dick.txt',
                 'milton-paradise.txt',
                 'bryant-stories.txt')
)

Download Nation's BNC/COCA word lists (the cleaned version hosted on laurenceanthony.net) from the web

In [5]:
# -nc (no-clobber) skips the download when the zip is already present, so
# re-running this cell does not pile up ".zip.1", ".zip.2", ... copies.
!wget -nc http://www.laurenceanthony.net/resources/wordlists/bnc_coca_cleaned_ver_002_20141015.zip

--2018-11-01 17:26:43--  http://www.laurenceanthony.net/resources/wordlists/bnc_coca_cleaned_ver_002_20141015.zip
Resolving www.laurenceanthony.net (www.laurenceanthony.net)... 69.195.124.184
Connecting to www.laurenceanthony.net (www.laurenceanthony.net)|69.195.124.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 323577 (316K) [application/zip]
Saving to: ‘bnc_coca_cleaned_ver_002_20141015.zip’


2018-11-01 17:26:43 (2.49 MB/s) - ‘bnc_coca_cleaned_ver_002_20141015.zip’ saved [323577/323577]



Read in the word lists, creating three structures:

+ `vocab[level]` is a set of word types at a given level (1=1st 1000, 2=2nd 1000, etc)
+ `baseword[w]` is the baseword of the word family containing `w`
+ `vocab_words` is a set of all the word types in the vocabulary

In [0]:
vocab = dict()        # level -> set of word types listed at that level
vocab_words = set()   # every word type found in any list
baseword = dict()     # word type -> headword of the family containing it

# The context manager closes the archive once every list has been read.
with ZipFile('bnc_coca_cleaned_ver_002_20141015.zip') as nation:
  for name in nation.namelist():
    if not name.endswith('.txt'):
      continue
    # The level number is the text that starts 8 characters after the first
    # '/' and ends just before the '.txt' extension.
    level = int(name[name.index('/')+8:-4])
    vocab[level] = set()
    base = None  # no headword seen yet in this file
    with nation.open(name) as f:
      for line in f:
        # utf-8-sig decodes plain UTF-8 and also drops a leading byte-order
        # mark, should a file carry one, so it cannot stick to the first word.
        line = line.decode('utf-8-sig').lower()
        word = line.strip()
        if not word:
          continue  # blank line: not a vocabulary entry
        # A line that does not start with a tab opens a new word family.
        # A tab-indented line with no headword before it in this file is
        # treated as its own headword rather than joining an unrelated family.
        if base is None or not line.startswith('\t'):
          base = word
        baseword[word] = base
        vocab[level].add(word)
    vocab_words.update(vocab[level])

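Generate a profile of a corpus: for each word-list level, the number and share of the corpus's tokens, types and word families found at that level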

In [0]:
def _percent(part, whole):
  """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
  return part/whole*100 if whole else 0.0


def profile(corpus):
  """Print a vocabulary profile of `corpus`, one row per word-list level.

  `corpus` is an iterable of hashable tokens. The global `vocab` maps each
  level to its set of word types, and the global `baseword` maps a word type
  to the headword of its family.

  For every level (in ascending order) the printed row shows, for tokens
  (TOKS), types (TYPS) and word families (FAMS):

  + the count found at that level,
  + that count as a percentage of the corpus total,
  + the running sum of those percentages (the C% columns).

  Token and type percentages are relative to ALL tokens/types in the corpus,
  including those in no list; the family percentage is relative to the
  families of the corpus types that appear in `baseword`.

  A total of zero (e.g. an empty corpus) yields 0.00 percentages instead of
  raising ZeroDivisionError. Nothing is returned.
  """
  # Count each distinct token once up front; the per-level token totals are
  # then sums over the level's types instead of a rescan of the whole corpus.
  token_counts = Counter(corpus)
  t_tokens = sum(token_counts.values())

  types = set(token_counts)
  t_types = len(types)

  t_families = len(set(baseword[t] for t in types if t in baseword))

  c_tokens = 0.0
  c_types = 0.0
  c_families = 0.0

  print(f'{"LV":2}      '
        f'{"TOKS":>7} {"%TOKS":>6} {"C%TOKS":>6}      '
        f'{"TYPS":>7} {"%TYPS":>6} {"C%TYPS":>6}      '
        f'{"FAMS":>7} {"%FAMS":>6} {"C%FAMS":>6}')

  for level in sorted(vocab):

    l_types = types.intersection(vocab[level])
    n_tokens = sum(token_counts[t] for t in l_types)
    n_types = len(l_types)
    n_families = len(set(baseword[t] for t in l_types))

    p_tokens = _percent(n_tokens, t_tokens)
    p_types = _percent(n_types, t_types)
    p_families = _percent(n_families, t_families)

    c_tokens += p_tokens
    c_types += p_types
    c_families += p_families

    print(f'{level:2}      '
          f'{n_tokens:7,} {p_tokens:6.2f} {c_tokens:6.2f}      '
          f'{n_types:7,} {p_types:6.2f} {c_types:6.2f}      '
          f'{n_families:7,} {p_families:6.2f} {c_families:6.2f}')

In [8]:
profile(moby)

LV         TOKS  %TOKS C%TOKS         TYPS  %TYPS C%TYPS         FAMS  %FAMS C%FAMS
 1      165,014  75.57  75.57        3,149  18.58  18.58          965  12.41  12.41
 2       14,359   6.58  82.15        2,403  14.18  32.76          900  11.58  23.99
 3        6,765   3.10  85.24        1,728  10.20  42.95          765   9.84  33.83
 4        7,903   3.62  88.86        1,436   8.47  51.43          689   8.86  42.69
 5        3,831   1.75  90.62        1,083   6.39  57.82          599   7.71  50.40
 6        3,048   1.40  92.01          903   5.33  63.15          519   6.68  57.07
 7        1,915   0.88  92.89          691   4.08  67.22          442   5.69  62.76
 8        1,668   0.76  93.65          528   3.12  70.34          360   4.63  67.39
 9        1,234   0.57  94.22          477   2.81  73.15          329   4.23  71.62
10          908   0.42  94.63          396   2.34  75.49          296   3.81  75.43
11          808   0.37  95.00          271   1.60  77.09          203   2.61

In [9]:
profile(paradise)

LV         TOKS  %TOKS C%TOKS         TYPS  %TYPS C%TYPS         FAMS  %FAMS C%FAMS
 1       56,486  70.18  70.18        2,007  22.36  22.36          817  17.99  17.99
 2        7,302   9.07  79.25        1,269  14.14  36.50          609  13.41  31.40
 3        2,995   3.72  82.97          863   9.62  46.12          460  10.13  41.52
 4        3,756   4.67  87.63          711   7.92  54.04          411   9.05  50.57
 5        1,705   2.12  89.75          503   5.60  59.64          332   7.31  57.88
 6        1,249   1.55  91.30          418   4.66  64.30          283   6.23  64.11
 7          962   1.20  92.50          355   3.96  68.26          249   5.48  69.59
 8          728   0.90  93.40          245   2.73  70.99          180   3.96  73.56
 9          594   0.74  94.14          239   2.66  73.65          186   4.10  77.65
10          454   0.56  94.71          184   2.05  75.70          157   3.46  81.11
11          372   0.46  95.17          126   1.40  77.10          108   2.38

In [10]:
profile(stories)

LV         TOKS  %TOKS C%TOKS         TYPS  %TYPS C%TYPS         FAMS  %FAMS C%FAMS
 1       40,550  87.00  87.00        1,692  44.22  44.22          840  35.07  35.07
 2        2,144   4.60  91.60          696  18.19  62.42          468  19.54  54.61
 3          426   0.91  92.51          210   5.49  67.90          168   7.01  61.63
 4          713   1.53  94.04          286   7.48  75.38          215   8.98  70.61
 5          578   1.24  95.28          217   5.67  81.05          173   7.22  77.83
 6          266   0.57  95.85          133   3.48  84.53          112   4.68  82.51
 7          177   0.38  96.23           91   2.38  86.91           74   3.09  85.59
 8          176   0.38  96.61           52   1.36  88.26           48   2.00  87.60
 9          124   0.27  96.87           50   1.31  89.57           46   1.92  89.52
10           93   0.20  97.07           31   0.81  90.38           25   1.04  90.56
11           52   0.11  97.19           20   0.52  90.90           18   0.75