In [2]:
import nltk
from nltk.corpus import stopwords

In [3]:
# The stopwords corpus stores its files under lowercase language names, so the
# fileid must be 'english'. A capitalised 'English' only resolves on
# case-insensitive filesystems and fails elsewhere.
stop_words = stopwords.words('english')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [4]:
def unusual_words(text):
    """Return the sorted words of `text` that are missing from the NLTK `words` corpus.

    Only purely alphabetic tokens are considered, and both sides are
    lower-cased before comparison, so the check is case-insensitive.

    Args:
        text: iterable of string tokens.

    Returns:
        A sorted list of the lower-cased tokens not found in
        `nltk.corpus.words.words()`.
    """
    seen = {token.lower() for token in text if token.isalpha()}
    known = {entry.lower() for entry in nltk.corpus.words.words()}
    return sorted(seen.difference(known))

In [5]:
# Show the first five movie-review fileids, then the first ten "unusual"
# words found in the first negative review.
movie_reviews = nltk.corpus.movie_reviews
print(movie_reviews.fileids()[:5])
unusual_words(movie_reviews.words('neg/cv000_29416.txt'))[:10]

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt']


['actors',
 'apparitions',
 'bentley',
 'characters',
 'confusing',
 'continues',
 'couples',
 'didn',
 'dies',
 'disappearances']

In [6]:
# First ten "unusual" words across the whole NPS chat corpus.
chat_tokens = nltk.corpus.nps_chat.words()
unusual_words(chat_tokens)[:10]

['aaaaaaaaaaaaaaaaa',
 'aaahhhh',
 'abortions',
 'abou',
 'abourted',
 'abs',
 'ack',
 'acros',
 'actualy',
 'adams']

In [7]:
def content_fraction(text):
    """Return the fraction of tokens in `text` that are not English stopwords.

    Tokens are lower-cased before the stopword lookup, so the comparison is
    case-insensitive. Every token counts toward the denominator, including
    punctuation tokens.

    Args:
        text: sized sequence of string tokens (must support `len()`).

    Returns:
        A float between 0 and 1. An empty `text` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(text)
    if total == 0:
        return 0.0
    # A set gives constant-time membership tests; the original scanned the
    # stopword list once per token. The local name also no longer shadows the
    # `stopwords` module imported at the top of the notebook.
    stop_set = set(nltk.corpus.stopwords.words('english'))
    content_count = sum(1 for w in text if w.lower() not in stop_set)
    return content_count / total

In [8]:
# Share of non-stopword tokens in the first negative movie review.
review_tokens = nltk.corpus.movie_reviews.words('neg/cv000_29416.txt')
content_fraction(review_tokens)

0.5642775881683731

In [9]:
# Word puzzle: list the corpus words that
#   * are at least five letters long,
#   * have the obligatory letter as their third character (index 2), and
#   * pass the FreqDist `<=` comparison against the puzzle letters
#     (presumably: no letter used more often than the puzzle provides it —
#     confirm against the FreqDist documentation).
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'  # must appear at index 2 of the word
wordlist = nltk.corpus.words.words()
[
    candidate
    for candidate in wordlist
    if len(candidate) >= 5
    and candidate[2] == obligatory
    and nltk.FreqDist(candidate) <= puzzle_letters
]

['enrol', 'gorlin', 'norie', 'vergi', 'vireo', 'virole', 'viron']

In [10]:
# The `names` corpus keeps given names in one file per gender; fileids()
# lists those files (displayed below as 'female.txt' and 'male.txt').
names = nltk.corpus.names
names.fileids()

['female.txt', 'male.txt']

In [11]:
# Names that appear in both the male and the female list (first five, in the
# order of the male list).
male_names = names.words('male.txt')
female_names = names.words('female.txt')
# Look names up in a set: testing `in female_names` directly rescans the whole
# female list for every male name.
female_name_set = set(female_names)
[w for w in male_names if w in female_name_set][:5]

['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian']

In [12]:
# Build a conditional frequency distribution of the final letter of every
# name, conditioned on the names file (i.e. gender) it comes from.
# Interesting fact: names ending in the letter "a" are almost always female
cfd = nltk.ConditionalFreqDist(
    (gender_file, given_name[-1])
    for gender_file in names.fileids()
    for given_name in names.words(gender_file)
)

In [13]:
import matplotlib.pyplot as plt
# Widen the default figure size so the per-letter plot stays readable.
plt.rcParams["figure.figsize"] = (15,6)

# Plot the conditional frequency distribution `cfd`.
cfd.plot()

<Figure size 1500x600 with 1 Axes>

In [14]:
# CMU Pronouncing Dictionary entries: each one is a (word, list of phones)
# pair, as the printed sample shows.
# The phone codes are described at https://en.wikipedia.org/wiki/ARPABET
entries = nltk.corpus.cmudict.entries()
print(entries[110:115])

[('abducted', ['AH0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'D']), ('abductee', ['AE0', 'B', 'D', 'AH2', 'K', 'T', 'IY1']), ('abductees', ['AE0', 'B', 'D', 'AH2', 'K', 'T', 'IY1', 'Z']), ('abducting', ['AE0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'NG']), ('abducting', ['AH0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'NG'])]


In [15]:
# Same five entries as the previous cell, printed one (word, phones) pair
# per line instead of as a single list.
print(*entries[110:115], sep='\n')

('abducted', ['AH0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'D'])
('abductee', ['AE0', 'B', 'D', 'AH2', 'K', 'T', 'IY1'])
('abductees', ['AE0', 'B', 'D', 'AH2', 'K', 'T', 'IY1', 'Z'])
('abducting', ['AE0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'NG'])
('abducting', ['AH0', 'B', 'D', 'AH1', 'K', 'T', 'IH0', 'NG'])


In [18]:
# Select words whose pronunciation ends with the given sequence of phones
# (first five matches only).
syllable = ['N', 'IH0', 'K', 'S']
[
    headword
    for headword, phones in entries
    if phones[-len(syllable):] == syllable
][:5]

["atlantic's", 'audiotronics', 'avionics', 'beatniks', 'calisthenics']

In [25]:
# Dictionary form of cmudict: looking up a word gives a list of its
# pronunciations (the output below shows two for 'fire').
prondict = nltk.corpus.cmudict.dict()
prondict['fire']

[['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]

In [27]:
# Spell out a short text as one flat list of phones, using the first listed
# pronunciation of each word.
text = ['natural', 'language', 'processing']
phones = []
for word in text:
    phones.extend(prondict[word][0])
print(phones)

['N', 'AE1', 'CH', 'ER0', 'AH0', 'L', 'L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH', 'P', 'R', 'AA1', 'S', 'EH0', 'S', 'IH0', 'NG']


In [32]:
# Comparative wordlists
# The Swadesh lists hold about 200 common words in each of several languages;
# fileids() returns the language codes, one per list.
from nltk.corpus import swadesh
print(swadesh.fileids())

['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la', 'mk', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk']


In [35]:
# Print the Swadesh wordlist stored under the 'ca' fileid.
print(swadesh.words('ca'))

['jo', 'tu', 'ell', 'nosaltres', 'vosaltres', 'ells, elles', 'aquest', 'aquell', 'aquí', 'allà', 'qui', 'què', 'on', 'quan', 'com', 'no', 'tot', 'molt', 'algun, una mica', 'poc', 'altre', 'un', 'dos', 'tres', 'quatre', 'cinc', 'gran, gros', 'llarg', 'ample', 'gruixut', 'pesat', 'petit', 'curt', 'estret', 'prim', 'dona', 'home', 'persona', 'nen', 'muller', 'marit', 'mare', 'pare', 'animal', 'peix', 'ocell', 'gos', 'poll', 'serp', 'cuc', 'arbre', 'bosc', 'bastó', 'fruit, fruita', 'llavor', 'fulla', 'arrel', 'escorça', 'flor', 'herba', 'corda', 'pell', 'carn', 'sang', 'os', 'gras', 'ou', 'banya', 'cua', 'ploma', 'pèl, cabell', 'cap', 'orella', 'ull', 'nas', 'boca', 'dent', 'llengua', 'ungla', 'peu', 'cama', 'genoll', 'ma', 'ala', 'panxa', 'budells', 'coll', 'esquena', 'pit', 'cor', 'fetge', 'beure', 'menjar', 'mossegar', 'xuclar, llepar', 'escopir', 'vomitar', 'bufar', 'respirar', 'riure', 'veure', 'escoltar', 'saber', 'pensar', 'olorar', 'témer', 'dormir', 'viure', 'morir', 'matar', 'llu

In [42]:
# NOTE(review): the name `fr2en` suggests French-to-English pairs, but only the
# 'be' list is requested, so every entry is a 1-tuple (see the output below).
# Confirm whether swadesh.entries(['fr', 'en']) was intended, or rename.
fr2en = swadesh.entries(['be'])
print(fr2en)

[('я',), ('ты',), ('ён',), ('мы',), ('вы',), ('яны',), ('гэта',), ('гэны',), ('тут',), ('там',), ('хто',), ('што',), ('дзе',), ('калі',), ('як',), ('не',), ('усе',), ('шмат',), ('некалькі',), ('няшмат',), ('іншы',), ('адзін',), ('два',), ('тры',), ('чатыры',), ('пяць',), ('вялікі',), ('доўгі',), ('шырокі',), ('тоўсты',), ('цяжкі',), ('маленькі',), ('кароткі',), ('вузкі',), ('тонкі',), ('жанчына',), ('мужчына',), ('чалавек',), ('дзіця',), ('жонка',), ('муж',), ('маці',), ('бацька',), ('зьвер',), ('рыба',), ('птушка',), ('пёс',), ('вош',), ('зьмяя',), ('чарвяк',), ('дрэва',), ('лес',), ('палка',), ('садавіна',), ('насеньне',), ('ліст',), ('корань',), ('кара',), ('кветка',), ('трава',), ('шнур',), ('скура',), ('мяса',), ('кроў',), ('костка',), ('тлушч',), ('яйка',), ('рог',), ('хвост',), ('пяро',), ('валасы',), ('галава',), ('вуха',), ('вока',), ('нос',), ('вусны',), ('зуб',), ('язык',), ('пазногаць',), ('ступня',), ('нага',), ('калена',), ('рука',), ('крыло',), ('жывот',), ('вантробы',),

In [47]:
# Comparison of words in various Germanic and Romance languages
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Build the aligned table once; the original rebuilt it on every iteration.
swadesh_rows = swadesh.entries(languages)
for i in range(139, 144):
    print(swadesh_rows[i])

('say', 'sagen', 'zeggen', 'decir', 'dire', 'dizer', 'dicere')
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')
('flow', 'fließen', 'vloeien', 'fluir', 'couler', 'fluir', 'fluere')


In [57]:
# Shoebox and Toolbox lexicons
# Each entry of 'rotokas.dic' is a (headword, [(field, value), ...]) pair, as
# the output below shows; print the first two.
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic')[:2])


[('kaa', [('ps', 'V'), ('pt', 'A'), ('ge', 'gag'), ('tkp', 'nek i pas'), ('dcsv', 'true'), ('vx', '1'), ('sc', '???'), ('dt', '29/Oct/2005'), ('ex', 'Apoka ira kaaroi aioa-ia reoreopaoro.'), ('xp', 'Kaikai i pas long nek bilong Apoka bikos em i kaikai na toktok.'), ('xe', 'Apoka is gagging from food while talking.')]), ('kaa', [('ps', 'V'), ('pt', 'B'), ('ge', 'strangle'), ('tkp', 'pasim nek'), ('arg', 'O'), ('vx', '2'), ('dt', '07/Oct/2006'), ('ex', 'Rera rauroro rera kaarevoi.'), ('xp', 'Em i holim pas em na nekim em.'), ('xe', 'He is holding him and strangling him.'), ('ex', 'Iroiro-ia oirato okoearo kaaivoi uvare rirovira kaureoparoveira.'), ('xp', 'Ol i pasim nek bilong man long rop bikos em i save bikhet tumas.'), ('xe', "They strangled the man's neck with rope because he was very stubborn and arrogant."), ('ex', 'Oirato okoearo kaaivoi iroiro-ia. Uva viapau uvuiparoi ra vovouparo uva kopiiroi.'), ('xp', 'Ol i pasim nek bilong man long rop. Olsem na em i no pulim win olsem na em 