In [519]:
import pronouncing
from itertools import product
import functools
import operator
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm_notebook as tqdm
import string
import gensim
from multiprocessing import Pool

In [520]:
# Load pre-trained word vectors from disk; mmap='r' asks gensim to memory-map the
# saved arrays read-only rather than reading them fully into memory.
# NOTE(review): the path is an absolute, machine-specific location (presumably
# 300-d fastText vectors, judging by the file name) — confirm before re-running elsewhere.
model = gensim.models.KeyedVectors.load('/mnt/bigfiles/dl/datasets/gensim-fasttext-300d-2M', mmap='r')

In [3]:
def remove_punctuation(phrase):
    """Return ``phrase`` with every character in ``string.punctuation`` deleted.

    Characters are removed outright, not replaced by spaces, so "day's"
    becomes "days" and "candle-light" becomes "candlelight".
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return phrase.translate(deletion_table)

In [362]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop = set(stopwords.words('english'))

# When False, synonyms() hands stop words back unchanged instead of looking up
# alternatives for them; when True, stop words are treated like any other word.
INCLUDE_STOP = True
def synonyms(word):
    """Return up to 20 pronounceable candidate replacements for ``word``.

    The candidates are ``word`` itself followed by the lower-cased results of
    ``model.most_similar`` (top 50), in the order the model returned them.
    Duplicates are dropped, and only entries for which
    ``pronouncing.phones_for_word`` knows at least one pronunciation are kept.

    Because order is preserved, the original word (when pronounceable) is
    always first and truncation to 20 keeps the earliest-returned neighbours;
    the result is the same on every run.

    If ``INCLUDE_STOP`` is False and ``word`` is a stop word, ``[word]`` is
    returned without consulting the model. If the model does not know the
    word, no neighbours are added and only ``word`` itself is considered.
    """
    if not INCLUDE_STOP and word in stop:
        return [word]

    try:
        neighbours = model.most_similar(positive=[word.lower()], topn=50)
    except KeyError:
        # The word is missing from the embedding vocabulary: fall back to the
        # word on its own rather than aborting the caller's whole search.
        neighbours = []

    candidates = [word] + [entry[0].lower() for entry in neighbours]

    # De-duplicate while keeping first-seen order (a plain set() would shuffle
    # the candidates and make the [:20] cut arbitrary).
    seen = set()
    pronounceable = []
    for candidate in candidates:
        if candidate in seen:
            continue
        seen.add(candidate)
        if len(pronouncing.phones_for_word(candidate)) > 0:
            pronounceable.append(candidate)

    return pronounceable[:20]

# Example: candidate replacements for 'slow'.
synonyms('slow')

  if np.issubdtype(vec.dtype, np.int):


['slow',
 'slowing',
 'sluggish',
 'rapid',
 'slowed',
 'slower',
 'fast',
 'slows',
 'steady',
 'slowness',
 'plodding',
 'slowest']

In [109]:
def get_meter(line):
    """Count syllables and collect per-word stress digits for ``line``.

    Punctuation is stripped, the line is tokenized, and the first
    pronunciation listed by ``pronouncing`` is used for every word.

    Returns:
        A tuple ``(syllables, stresses)``: the total syllable count and a list
        holding, for each word, the list of its stress digits as ints.

    Raises:
        IndexError: if a word has no known pronunciation (the pronunciation
            list is empty, so taking its first element fails).
    """
    total_syllables = 0
    stress_patterns = []

    for token in word_tokenize(remove_punctuation(line)):
        first_pronunciation = pronouncing.phones_for_word(token)[0]
        total_syllables += pronouncing.syllable_count(first_pronunciation)
        stress_patterns.append(
            [int(digit) for digit in pronouncing.stresses(first_pronunciation)]
        )

    return total_syllables, stress_patterns

In [110]:
# Example: total syllable count and per-word stress digits for a short phrase.
get_meter('that was amazing')

(5, [[1], [1], [0, 1, 0]])

In [376]:
def check_meter(line, meter, partial=False, start=0):
    """Check whether the stresses of ``line`` follow the repeating ``meter``.

    Args:
        line: text to scan; punctuation is stripped before scanning.
        meter: one cycle of the meter as a list of booleans
            (False = unstressed, True = stressed); it is repeated as needed.
        partial: if False, return a plain bool and additionally require the
            line to end exactly on a cycle boundary. If True, return a tuple
            ``(ok, position)`` where ``position`` is the index within the
            cycle at which scanning stopped, with no end-of-cycle requirement.
        start: syllable offset into the meter at which scanning begins.

    Scanning rules: one-syllable words fit any position, syllables with
    stress digit 2 (lightly stressed) fit any position, and every other
    syllable must be stressed (digit 1) exactly where the meter is True.

    Any IndexError raised by ``get_meter`` for a word without a known
    pronunciation propagates to the caller.
    """
    period = len(meter)
    position = start

    _, word_stresses = get_meter(remove_punctuation(line))

    for word_pattern in word_stresses:
        if len(word_pattern) == 1:
            # a monosyllable may be read as stressed or unstressed
            position += 1
            continue

        for digit in word_pattern:
            if digit != 2:  # digit 2 is light stress and is always accepted
                fits = (digit == 1) == meter[position % period]
                if not fits:
                    return (False, position % period) if partial else False
            position += 1

    if partial:
        return (True, position % period)

    # a complete line must finish on a cycle boundary
    return position % period == 0

In [377]:
# Each meter is one repeating cycle for check_meter: True = stressed syllable,
# False = unstressed syllable.
iambic = [False, True]
trochaic = [True, False]
dactylic = [True, False, False]
# Five [True, False, False] feet followed by a closing [True, False] foot (17 syllables).
dactylic_hexameter = [True, False, False] * 5 + [True, False]

In [378]:
# Sanity checks: well-known lines scanned against the meter named in each call.
print(check_meter('How do I love thee? Let me count the ways.', iambic))
print(check_meter('Is this smart enough to catch bad meter?', iambic))
print(check_meter('Once upon a midnight dreary, while I pondered, weak and weary', trochaic))
print(check_meter('This is the forest primeval, the murmuring pines and the hemlock', dactylic_hexameter))

True
False
True
True


In [86]:
test_poem = \
'''How do I love thee? Let me count the ways.
I love thee to the depth and breadth and height
My soul can reach, when feeling out of sight
For the ends of being and ideal grace.
I love thee to the level of every day's
Most quiet need, by sun and candle-light.
I love thee freely, as men strive for right.
I love thee purely, as they turn from praise.
I love thee with the passion put to use
In my old griefs, and with my childhood's faith.
I love thee with a love I seemed to lose
With my lost saints. I love thee with the breath,
Smiles, tears, of all my life; and, if God choose,
I shall but love thee better after death.
'''.split('\n')

# Print every line of the sonnet that does not scan as iambic.
for line in test_poem:
    try:
        if not check_meter(line, iambic):
            print(line)
    except IndexError:
        # get_meter raises IndexError for a word with no known pronunciation;
        # such a line cannot be scanned, so it is skipped. Any other error
        # (interrupts, missing NLTK data, ...) is allowed to surface.
        continue

For the ends of being and ideal grace.
I love thee to the level of every day's


In [427]:
# https://stackoverflow.com/questions/32074543/how-to-get-the-length-of-an-itertools-product
def product_length(items):
    """Return how many tuples ``itertools.product(*items)`` would yield.

    That is the product of the lengths of the given sequences; with no
    sequences at all the result is 1.
    """
    total = 1
    for sequence in items:
        total = total * len(sequence)
    return total

# WIP
def make_chunk_fit_meter(words, meter, start=0, return_early=10):
    """Search synonym substitutions of ``words`` for phrases that fit ``meter``.

    Every word is expanded to its ``synonyms`` list, and each combination
    (one candidate per word, joined by spaces) is tested with ``check_meter``
    starting at meter offset ``start``. Combinations containing a word that
    cannot be scanned (IndexError) are skipped.

    Args:
        words: the words of the chunk to rewrite.
        meter: one cycle of the target meter, as accepted by ``check_meter``.
        start: meter offset passed through to ``check_meter``.
        return_early: stop as soon as this many fitting phrases are found;
            a falsy value searches every combination.

    Returns:
        The list of fitting phrases, in the order they were found.
    """
    candidates_per_word = [synonyms(word) for word in words]
    fitting = []

    combinations = tqdm(product(*candidates_per_word),
                        total=product_length(candidates_per_word))
    for combination in combinations:
        phrase = ' '.join(combination)
        try:
            phrase_fits = check_meter(phrase, meter, start=start)
        except IndexError:
            continue

        if not phrase_fits:
            continue

        fitting.append(phrase)
        if return_early and len(fitting) >= return_early:
            return fitting

    return fitting

In [433]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` that are ``n`` items long.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))

In [587]:
class Chunk:
    """A contiguous run of words in a line, tagged with its meter status.

    Attributes are stored exactly as given:
      start, end  -- word indices delimiting the run (used as a slice, so
                     ``end`` is exclusive)
      meter_state -- position within the meter cycle after the run, or None
      correct     -- whether the run fits the meter
    """

    def __init__(self, start, end, meter_state, correct):
        self.start, self.end = start, end
        self.meter_state, self.correct = meter_state, correct

In [599]:
# Smallest number of consecutive words fit_meter will test as a candidate chunk.
CHUNK_MIN_WORDS = 3

def fit_meter(line, meter, return_early=10):
    """Split ``line`` into word runs that do / do not scan against ``meter``. (WIP)

    The line is split on single spaces. If the whole line already fits the
    meter, ``[line]`` is returned; if some word has no known pronunciation,
    a message is printed and ``[]`` is returned.

    Otherwise the words are scanned left to right for the first position from
    which at least ``CHUNK_MIN_WORDS`` consecutive words fit the meter, and
    that run is extended as far as it keeps fitting. The line is then
    partitioned into: the words before the run (marked incorrect), the run
    itself (marked correct), and any words after it (marked incorrect). If no
    run is found, the whole line is a single incorrect chunk. Each chunk's
    words are printed; nothing is returned on this path yet.

    Args:
        line: the text to analyse.
        meter: one cycle of the target meter, as accepted by ``check_meter``.
        return_early: currently unused; reserved for the synonym search that
            is still to be wired in.
    """
    words = line.split(' ')  # word_tokenize(line)
    try:
        if check_meter(line, meter):
            return [line]
    except IndexError:
        print('Some words not pronouncable')
        return []

    final = []
    longest = None

    # A single left-to-right pass. The previous `while` wrapper could only ever
    # exit after exactly one pass and otherwise spun forever (no run found, or
    # a run reaching the end of the line), so one pass yields the same chunks
    # whenever the old code terminated.
    for i in range(0, len(words) - CHUNK_MIN_WORDS):
        result = False
        last_continue = 0

        for j in range(i + CHUNK_MIN_WORDS, len(words) + 1):
            chunk = words[i:j]
            # NOTE(review): the chunk always restarts at word i, yet the meter
            # offset is carried over from the previous, shorter chunk — confirm
            # whether a constant start offset was intended here.
            next_result, continuePoint = check_meter(' '.join(chunk), meter, partial=True, start=last_continue)

            if not result and next_result and i > 0:
                # words before the fitting run form an incorrect chunk
                # (skipped when the run starts at word 0, which would be empty)
                final.append(Chunk(0, i, None, False))
            result = next_result

            if result:
                last_continue = continuePoint
                longest = Chunk(i, j, continuePoint, True)
            else:
                break

        if longest is not None:
            final.append(longest)
            break

    if not final:
        # no fitting run anywhere: the whole line is one incorrect chunk
        final.append(Chunk(0, len(words), None, False))
    elif final[-1].end < len(words):
        # everything after the fitting run, including a lone trailing word
        final.append(Chunk(final[-1].end, len(words), None, False))

    for item in final:
        print(words[item.start:item.end])
    # TODO: feed the incorrect chunks to make_chunk_fit_meter (using the
    # preceding chunk's meter_state as its start offset) and return the
    # resulting possibilities.

In [600]:
INCLUDE_STOP = True
# NOTE(review): the result is named `to_iambic`, but the meter passed in is
# `dactylic` — confirm which one is intended.
# fit_meter prints the chunks it finds; on this path it has no return value,
# so the trailing expression displays nothing.
to_iambic = fit_meter("disgusting the quick brown fox jumped over the lazy dog", dactylic)
to_iambic

['disgusting']
['the', 'quick', 'brown', 'fox', 'jumped']
['over', 'the', 'lazy', 'dog']


In [415]:
# Tokenize the whole source text; the context manager closes the file handle
# as soon as its contents have been read.
with open('sermon_mount_NIV.txt') as corpus_file:
    corpus = word_tokenize(corpus_file.read())