This is part of the research we conduct at [Vocapouch](https://vocapouch.com). Our service is dedicated to language learners. The results of the study contained in this notebook were described [on our blog](https://blog.vocapouch.com/do-20-pages-of-a-book-gives-you-90-of-its-words-795a405afe70).

# Which word has the most rhymes, and why is it "carburetion"?

Author: Roman Kierzkowski

In [189]:
from __future__ import print_function
from pysle import isletool
from itertools import groupby

import io

excluded_pos = {'nnp', 'nnps'} # exclude proper nouns

def extract_root_pos(pos):
    """Return the part of a POS tag that precedes its first underscore.

    A tag without an underscore is returned unchanged.
    """
    root, _, _ = pos.partition('_')
    return root

def filter_out_proper_nouns(source, dest):
    """Copy ``source`` to ``dest``, dropping lines tagged only as proper nouns.

    On every line the text between the first '(' and the first ')' is read as
    a comma-separated list of part-of-speech tags, each reduced with
    ``extract_root_pos``. A line is written to ``dest`` when none of its tag
    roots is in ``excluded_pos``, or when at least one tag root lies outside
    ``excluded_pos`` (a proper noun that is also a regular word, like "brown").
    Both files are opened as UTF-8 text.
    """
    with io.open(source, "r", encoding='utf-8') as reader, \
            io.open(dest, "w", encoding='utf-8') as writer:
        for entry in reader:
            open_paren = entry.find('(')
            close_paren = entry.find(')')
            raw_tags = entry[open_paren + 1:close_paren].split(',')
            roots = set(extract_root_pos(tag) for tag in raw_tags)
            has_excluded = bool(roots & excluded_pos)
            has_regular = bool(roots - excluded_pos)
            if has_regular or not has_excluded:
                writer.write(entry)

# Write a copy of the dictionary without the pure proper-noun lines ...
filter_out_proper_nouns('ISLEdict.txt', 'ISLEdict_npn.txt')

# ... and load that filtered copy; later cells read the entries via isleDict.data.
isleDict = isletool.LexicalTool('ISLEdict_npn.txt')

In [173]:
isleDict.data['juliana']

[(u'# d\u0292 \u02ccu . l i . \u02c8\xe6 . n \u0259 #', [])]

In [190]:
def more_than_one(word):
    """Return True when ``word`` contains a hyphen or an underscore."""
    for separator in ('-', '_'):
        if separator in word:
            return True
    return False

def flatten(syllables):
    """Concatenate the phonemes of all syllables into one flat list."""
    phonemes = []
    for syllable in syllables:
        phonemes.extend(syllable)
    return phonemes

def parse_pronun(data):
    """Turn the dictionary data into (word, phonemes, accent index) records.

    ``data`` maps each word to a list of records; ``record[0]`` is the
    pronunciation string handed to ``isletool._parsePronunciation`` and
    ``record[1]`` is the collection of part-of-speech tags.

    Returns a tuple ``(total, single, accented, not_vowels, result)``:

    * ``total``      -- number of keys in ``data``
    * ``single``     -- number of keys for which ``more_than_one`` is false
    * ``accented``   -- number of those words with at least one pronunciation
      that has an accented syllable
    * ``not_vowels`` -- number of pronunciations whose accented phoneme is not
      in ``isletool.vowelList``
    * ``result``     -- list of ``(word, phonemes, accent_index)`` tuples,
      visited in sorted word order
    """
    total = len(data)
    single = 0
    accented = 0
    not_vowels = 0

    result = []

    # sorted() accepts any mapping and behaves the same on Python 2 and 3;
    # calling .sort() on data.keys() only works where keys() returns a list.
    for word in sorted(data):
        if more_than_one(word):
            continue

        single += 1
        was_accented = False

        for record in data[word]:
            tags = record[1]
            # Skip records tagged 'nnp' unless they also carry the 'nn' tag.
            if 'nnp' in tags and 'nn' not in tags:
                continue

            parsed, accented_syllables, accented_vowel = isletool._parsePronunciation(record[0])[0]
            if not accented_syllables:
                continue

            was_accented = True
            # Phonemes in the syllables before the accented one, plus the
            # offset of the accented phoneme inside its own syllable.
            preceding = sum(len(syllable) for syllable in parsed[0:accented_syllables[0]])
            accent_index = preceding + accented_vowel[0]
            pronunc = flatten(parsed)
            # The first character of the accented phoneme is dropped before
            # the vowel lookup (presumably the stress mark, as in 'ˈʌ').
            if pronunc[accent_index][1:] in isletool.vowelList:
                result.append((word, pronunc, accent_index))
            else:
                not_vowels += 1

        if was_accented:
            accented += 1

    return (total, single, accented, not_vowels, result)

# Parse the whole dictionary once; pronun_records is the list of
# (word, phonemes, accent index) tuples used by the cells below.
total, single, accented, not_vowels, pronun_records = parse_pronun(isleDict.data) 

print("Total %s words, sigle words %s, with accent %s, non-vowels accented %s." % (total, single, accented, not_vowels))

Total 206321 words, sigle words 126862, with accent 123970, non-vowels accented 0.


In [191]:
# groupby only merges adjacent items, so order the records by word first
# (ties broken by the joined pronunciation), then collect them per word.
pronun_records.sort(key=lambda rec: (rec[0], ''.join(rec[1])))
pronun_dict = {
    word: list(word_records)
    for word, word_records in groupby(pronun_records, key=lambda rec: rec[0])
}

In [246]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float (0.0 when empty)."""
    count = len(numbers)
    if count == 0:
        # Divide by one instead of zero so an empty input yields 0.0.
        count = 1
    return float(sum(numbers)) / count

# Number of distinct words and the average number of pronunciations per word.
unique_words = len(pronun_dict)
average_pronun = mean([len(variants) for variants in pronun_dict.values()])

print("Unique words: {} Average pronunciations per word: {}".format(unique_words, average_pronun))

Unique words: 123970 Average pronunciations per word: 1.16260385577


In [192]:
def present_record(r):
    """Format a (word, phonemes, accent index) record as a readable line.

    The accent position is shown 1-based.
    """
    word = r[0]
    pronunciation = ''.join(r[1])
    accent_position = r[2] + 1
    return "%s => %s (with accent at %s. phonem)" % (word, pronunciation, accent_position)
    
print(present_record(pronun_dict['love'][0]))

love => lˈʌv (with accent at 2. phonem)


In [193]:
def same_ending(r1, r2):
    """Return True when both records have identical phonemes from the accent onward."""
    _, phonemes_a, accent_a = r1
    _, phonemes_b, accent_b = r2

    tail_a = phonemes_a[accent_a:]
    tail_b = phonemes_b[accent_b:]
    return tail_a == tail_b

def diffrent_begining(r1, r2):
    """Return True when the phonemes before the accent differ between the records."""
    _, phonemes_a, accent_a = r1
    _, phonemes_b, accent_b = r2

    head_a = phonemes_a[:accent_a]
    head_b = phonemes_b[:accent_b]
    return head_a != head_b

def is_rhyme(r1, r2):
    """Return True when two (word, phonemes, accent index) records rhyme.

    They rhyme when the words differ, the phonemes from the accent onward are
    identical, and the phonemes before the accent are not.
    """
    word_a, _, _ = r1
    word_b, _, _ = r2

    words_differ = word_a != word_b
    if not words_differ:
        return False
    if not same_ending(r1, r2):
        return False
    return diffrent_begining(r1, r2)

In [195]:
# Sanity checks for is_rhyme on hand-picked dictionary entries.

# Same ending, different onset.
love = pronun_dict['love'][0]
glove = pronun_dict['glove'][0]

assert(is_rhyme(love, glove))

# 'superb' has several pronunciations; the second one is the one compared here.
uncurb = pronun_dict['uncurb'][0]
superb = pronun_dict['superb'][1]

assert(is_rhyme(uncurb, superb))

# Different spellings must not be reported as rhyming with each other here.
knight = pronun_dict['knight'][0]
night = pronun_dict['night'][0]

assert(not is_rhyme(knight, night))

In [196]:
def find_rhymes(records):
    """Map every word to the set of words that rhyme with it.

    ``records`` is an iterable of ``(word, phonemes, accent_index)`` tuples.
    Two records rhyme under the same rule as ``is_rhyme``: the words differ,
    the phonemes from the accent onward are identical, and the phonemes before
    the accent are not. Words without any rhyme do not appear in the result.

    Records are bucketed by their accent-to-end phoneme sequence rather than
    scanned as neighbours in a reverse-sorted list. Records with the same
    ending are not guaranteed to be adjacent in such a list: a record that
    shares the trailing phonemes but is accented earlier can sort in between
    and would cut a neighbour scan short. The input list is left unmodified.
    """
    from itertools import combinations

    # ending (as a hashable tuple) -> [(word, phonemes before the accent), ...]
    by_ending = {}
    for word, phonemes, accent in records:
        ending_key = tuple(phonemes[accent:])
        beginning = tuple(phonemes[:accent])
        by_ending.setdefault(ending_key, []).append((word, beginning))

    result = {}
    for members in by_ending.values():
        # Inside a bucket the endings are equal by construction, so only the
        # word and the pre-accent phonemes remain to be compared.
        for (w1, b1), (w2, b2) in combinations(members, 2):
            if w1 != w2 and b1 != b2:
                result.setdefault(w1, set()).add(w2)
                result.setdefault(w2, set()).add(w1)
    return result

# word -> set of rhyming words, computed over every parsed pronunciation.
rhymes = find_rhymes(pronun_records)

In [197]:
rhymes['love']

{u'above',
 u'belove',
 u'deneuve',
 u'dove',
 u'glove',
 u'gov',
 u'hereof',
 u"o'glove",
 u'shove',
 u'thereof',
 u'whereof'}

In [199]:
# 'night' and 'knight' must both be listed as rhymes of 'height' ...
assert('night'  in rhymes['height'])
assert('knight' in rhymes['height'])

# ... but must not be listed as rhymes of each other.
assert('night'  not in rhymes['knight'])
assert('knight' not in rhymes['night'])

In [235]:
# Rank the words by how many rhymes they have, highest count first.
# dict.items() replaces the Python-2-only iteritems(), so the cell runs on
# both Python 2 and Python 3.
rhymes_counts = [ (word, len(word_rhymes)) for word, word_rhymes in rhymes.items() ]
rhymes_counts.sort(key=lambda x: -x[1])

rhymes_counts[0:10]

[(u'carburetion', 1400),
 (u'modernization', 1390),
 (u'obligation', 1382),
 (u'ration', 1381),
 (u'ventilation', 1380),
 (u'distillation', 1380),
 (u'ordination', 1378),
 (u'concatenation', 1378),
 (u'incoordination', 1378),
 (u'detonation', 1378)]

In [234]:
list(rhymes['carburetion'])[0:10]

[u'expostulation',
 u'activation',
 u'dotation',
 u'replication',
 u'appropriation',
 u'gratification',
 u'disorientation',
 u'reduplication',
 u'ovation',
 u'accentuation']

In [200]:
def pronunc_hist(pronunciations, word):
    """Count, for each pronunciation of ``word``, how many of its rhymes match it.

    ``pronunciations`` maps a word to its list of
    ``(word, phonemes, accent_index)`` records. The candidate rhymes are read
    from the module-level ``rhymes`` mapping. A candidate is counted for a
    pronunciation when at least one of the candidate's own pronunciations
    satisfies ``is_rhyme`` with it; each candidate is counted at most once per
    pronunciation.

    Returns ``(word_pronunciations, counts)`` where ``counts[i]`` belongs to
    ``word_pronunciations[i]``.
    """
    word_pronunciations = pronunciations[word]
    matched = [set() for _ in word_pronunciations]

    for candidate in rhymes[word]:
        candidate_pronunciations = pronunciations[candidate]
        for i, lead in enumerate(word_pronunciations):
            # any() stops at the first matching pronunciation; one match is
            # enough to count the candidate.
            if any(is_rhyme(lead, p) for p in candidate_pronunciations):
                matched[i].add(candidate)

    return (word_pronunciations, [len(words) for words in matched])
    
       
def print_pronunc_hist(pronunciations, word):
    """Print each pronunciation of ``word`` followed by its rhyme count."""
    word_pronunciations, counts = pronunc_hist(pronunciations, word)

    for position in range(len(word_pronunciations)):
        print(present_record(word_pronunciations[position]))
        print(counts[position])


# Break the rhyme counts of the two top-ranked words down by pronunciation.
print_pronunc_hist(pronun_dict, 'carburetion')
print('-------')
print_pronunc_hist(pronun_dict, 'modernization')

carburetion => kˌɑɹbjɚˈiʃn̩ (with accent at 7. phonem)
29
carburetion => kˌɑɹbəɹˈeiʃn̩ (with accent at 7. phonem)
1371
-------
modernization => mˌɑd˺ɚnɑɪzˈeiʃə (with accent at 8. phonem)
19
modernization => mˌɑd˺ɚnəzˈeiʃn̩ (with accent at 8. phonem)
1371


In [202]:
def ending(record):
    """Return the phonemes from the accent onward, joined into one string."""
    _, phonemes, accent_index = record
    tail = phonemes[accent_index:]
    return ''.join(tail)

def rhyme_groups(records):
    """Group words by the string form of their accent-to-end phonemes.

    ``records`` is an iterable of ``(word, phonemes, accent_index)`` tuples.
    Returns a dict mapping ``ending(record)`` to the set of words that have
    that ending.

    The records are visited in order of their reversed phoneme lists, using a
    sorted copy so that the caller's list is not reordered as a side effect.
    """
    ordered = sorted(records, key=lambda x: list(reversed(x[1])))

    result = {}
    for record in ordered:
        result.setdefault(ending(record), set()).add(record[0])

    return result

# ending string -> set of words sharing that ending.
groups = rhyme_groups(pronun_records)

In [256]:
from random import shuffle

# Rank the endings by the number of words sharing them, largest group first.
# dict.items() replaces the Python-2-only iteritems(), so the cell runs on
# both Python 2 and Python 3.
groups_list = [ (e, len(words)) for e, words in groups.items() ]
groups_list.sort(key=lambda x: -x[1])

# Show the 20 largest groups, each with five randomly picked example words
# (the examples differ from run to run because the shuffle is unseeded).
for e, c in groups_list[:20]:
    sample = list(groups[e])
    shuffle(sample)
    print(u'{:<10} {:>5} {}'.format(e.strip(), c, sample[:5]))

ˈeiʃn̩      1372 [u'gemmulation', u'croatian', u'socialization', u'intimidation', u'accumulation']
ˈi           463 [u'advisee', u'expiree', u'malmedy', u'sadi', u'cheville']
ˈei          446 [u'hogmanay', u'stray', u'palais', u'dache', u'yay']
ˈɑlədʒi      359 [u'hieroglyphology', u'ideology', u'insectology', u'otology', u'ology']
ˈɛt          347 [u'charrette', u'tete', u'chalmette', u'racette', u'chevette']
ˈeiʃn̩z      285 [u'eliminations', u'originations', u'reverberations', u'explanations', u'incorporations']
ˈu           283 [u'hasanlu', u'pew', u'unglue', u'tobu', u'mcknew']
ˈin          276 [u'fifteen', u'mousseline', u'lopatin', u'tessin', u'tambourine']
ˈoʊsɪs       267 [u'exostosis', u'cleptobiosis', u'actinobacillosis', u'mimosis', u'diorthosis']
ˈɪt˺ɪk       265 [u'melanitic', u'glycolytic', u'antineuritic', u'erythrocytic', u'pisolitic']
ˈæt˺ɪk       250 [u'electrostatic', u'fungistatic', u'panchromatic', u'orthochromatic', u'sterigmatic']
ˈɛt˺ɪk       244 [u'psychogenet