# Tutorial 01 & 02

In [21]:
#load the necessary library
# !pip install nltk
import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize
from nltk.book import *
# nltk.download('punkt') # tokenizer models
# nltk.download('book')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [7]:
custom_text = "Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world.It is a way I have of driving off the spleen and regulating the circulation"

In [16]:
# convert the paragraph into a list of words

tokens = word_tokenize(custom_text) # output - list type
text_custom = Text(tokens) # <Text: Call me Ishmael . Some years ago—never mind...>

Displaying 1 of 1 matches:
 Call me Ishmael . Some years ago—never mi


In [19]:
text_custom.concordance('Call') # count the Call word
print(f'\n Vocabulary Size : {len(set(tokens))}')
# custom_text.count('Call')

Displaying 1 of 1 matches:
 Call me Ishmael . Some years ago—never mi

 Vocabulary Size : 46


In [22]:
print(text3)

<Text: The Book of Genesis>


In [24]:
print(f'Total Tokens : {len(text3)} and Unique Tokens : {len(set(text3))}')
print(f'Lexical Diversity : {len(set(text3))/len(text3)}')

Total Tokens : 44764 and Unique Tokens : 2789
Lexical Diversity : 0.06230453042623537


# Tutorial 03

In [26]:
# !pip install epitran - convert word to IPA (International Phonetic Alphabet)
# !pip install panphon - to work with sounds of words written in IPA
# !pip install pronouncing - work with the CMU Pronouncing Dictionary

import pronouncing

In [28]:
words = ['phonetics', 'phonology', 'morphology', 'analysis', 'transcription']

transcription = {}

for w in words:
    phones = pronouncing.phones_for_word(w)
    if phones :
        transcription[w] = phones[0]
    else:
        transcription[w] = 'No trancription found'
print(transcription)

{'phonetics': 'F AH0 N EH1 T IH0 K S', 'phonology': 'F AH0 N AA1 L AH0 JH IY2', 'morphology': 'M AO0 R F AA1 L AH0 JH IY0', 'analysis': 'AH0 N AE1 L AH0 S AH0 S', 'transcription': 'T R AE2 N S K R IH1 P SH AH0 N'}


### Phonome Frequency Analysis

In [29]:
from collections import Counter

In [31]:
# combine all transcription into a single string(without spaces)
all_phones = ''.join([t.replace(' ','') for t in transcription.values() if t != 'No transcription found'])

phoneme_counts = Counter(all_phones)

print(phoneme_counts)

Counter({'A': 15, 'H': 14, '0': 11, 'N': 5, '1': 5, 'S': 5, 'I': 4, 'F': 3, 'E': 3, 'L': 3, 'R': 3, 'T': 2, 'K': 2, 'J': 2, 'Y': 2, '2': 2, 'M': 1, 'O': 1, 'P': 1})


### Stress Pattern analysis with CMU dictionary

In [34]:
nltk.download('cmudict')
from nltk.corpus import cmudict

[nltk_data] Downloading package cmudict to C:\Users\The
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [35]:
d = cmudict.dict()
stress_patterns = {}

for w in words:
    if w in d:
        transcription = d[w][0]
        pattern = ''.join(['1' if '1' in ph else '0' for ph in transcription])
        stress_patterns[w] = pattern
        
print(stress_patterns)

{'phonetics': '00010000', 'phonology': '00010000', 'morphology': '000010000', 'analysis': '00100000', 'transcription': '000000010000'}
