In [1]:
import pronouncing
from itertools import product
import functools
import operator
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm_notebook as tqdm
import string
import gensim
from multiprocessing import Pool
from random import choice
import textstat

In [2]:
model = gensim.models.KeyedVectors.load('/mnt/bigfiles/dl/datasets/gensim-fasttext-300d-2M', mmap='r')

In [3]:
def remove_punctuation(phrase): return phrase.translate(str.maketrans({a:' ' for a in string.punctuation}))

In [4]:
stop = set(stopwords.words('english'))

INCLUDE_STOP = True
def synonyms(word):
    if not INCLUDE_STOP and word in stop:
        return [word]
    
    most_similar = [word] + [data[0].lower().replace(' ', '-') for data in model.most_similar(positive=[word.lower()], topn=50)]
    only_pronouncable = [word for word in most_similar if len(word) > 0 and len(pronouncing.phones_for_word(word)) > 0]
    
    return list(set(only_pronouncable))[:20]

synonyms('slow')

  if np.issubdtype(vec.dtype, np.int):


['steady',
 'fast',
 'slowed',
 'rapid',
 'slows',
 'slowing',
 'slower',
 'slowness',
 'slow',
 'plodding',
 'sluggish',
 'slowest']

In [5]:
def get_meter(line):
    line = remove_punctuation(line) 
    syllables = 0 
    stresses = []
    for word in word_tokenize(line):
        phonemes = pronouncing.phones_for_word(word)[0]
        syllables += pronouncing.syllable_count(phonemes)
        stresses.append(list(map(int, pronouncing.stresses(phonemes))))

    return syllables, stresses

In [6]:
get_meter('that was amazing')

(5, [[1], [1], [0, 1, 0]])

In [7]:
def check_meter(line, meter, partial=False, start=0): # meter is an array with False for unstressed, True for stressed
    line = remove_punctuation(line) 
    cycle_len = len(meter)
    meter_step = start
    
    _, stresses = get_meter(line)
    
    for stress in stresses:
        if len(stress) == 1:
            meter_step += 1
            continue # one syllable words can be stressed or not stressed
        
        for syllable in stress:
            if syllable == 2:
                meter_step += 1
                continue # lightly stressed, can be either
                
            if (syllable == 1) == meter[meter_step % cycle_len]: 
                meter_step += 1
                continue
            else:
                return False if not partial else (False, meter_step % cycle_len)
            
    if not partial:
        if meter_step % cycle_len == 0: # end at the correct step
            return True if not partial else (True, meter_step % cycle_len)
        else:
            return False if not partial else (False, meter_step % cycle_len)
    else:
        return True if not partial else (True, meter_step % cycle_len)

In [8]:
def find_start(line, meter):
    for start in range(meter):
        worked, state = check_meter(line, meter, partial=True, start=start)
        if worked:
            return start, state
    return None

In [9]:
iambic = [False, True]
trochaic = [True, False]
dactylic = [True, False, False]
dactylic_hexameter = dactylic * 5 + [True, False]

In [10]:
print(check_meter('How do I love thee? Let me count the ways.', iambic))
print(check_meter('Is this smart enough to catch bad meter?', iambic))
print(check_meter('Once upon a midnight dreary, while I pondered, weak and weary', trochaic))
print(check_meter('This is the forest primeval, the murmuring pines and the hemlock', dactylic_hexameter))

True
False
True
True


In [11]:
test_poem = \
'''How do I love thee? Let me count the ways.
I love thee to the depth and breadth and height
My soul can reach, when feeling out of sight
For the ends of being and ideal grace.
I love thee to the level of every day's
Most quiet need, by sun and candle-light.
I love thee freely, as men strive for right.
I love thee purely, as they turn from praise.
I love thee with the passion put to use
In my old griefs, and with my childhood's faith.
I love thee with a love I seemed to lose
With my lost saints. I love thee with the breath,
Smiles, tears, of all my life; and, if God choose,
I shall but love thee better after death.
'''.split('\n')
for line in test_poem:
    try:
        if not check_meter(line, iambic):
            print(line)
    except:
        pass

For the ends of being and ideal grace.
I love thee to the level of every day's


In [12]:
# https://stackoverflow.com/questions/32074543/how-to-get-the-length-of-an-itertools-product
def product_length(items): return functools.reduce(operator.mul, map(len, items), 1)

def make_chunk_fit_meter(words, meter, start=0, return_early=10):
    correct = []
    word_synonyms = list(map(synonyms, words))
                
    for possible in product(*word_synonyms):
        try:
            result = check_meter(' '.join(possible), meter, start=start)
        except:
            continue
            
        if result:
            correct.append(list(possible))

            if return_early and len(correct) >= return_early:
                break
            
    if len(correct) > 0:
        return min(correct, key=lambda item: textstat.flesch_reading_ease(' '.join(item)))
    else:
        return ''

In [13]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [14]:
class Chunk:
    def __init__(self, start, end, correct):
        self.start = start
        self.end = end
        self.correct = correct

In [15]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def partition(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
CHUNK_MAX_WORDS = 12
def clean_chunks(chunks):
    cleaned = []
    
    for chunk in chunks:
        if chunk.correct == False and (chunk.end - chunk.start) >= CHUNK_MAX_WORDS:
            chunklets = []
            indices = partition(range(chunk.start, chunk.end), CHUNK_MAX_WORDS - 2)
            for i in indices:
                chunklets.append(Chunk(i[0], i[-1] + 1, False))
            cleaned += chunklets
        else:
            cleaned.append(chunk)
            
    return cleaned

In [16]:
CHUNK_MIN_WORDS = 3

def fit_meter(line, meter, return_early=10):
    words = remove_punctuation(line).split(' ') # word_tokenize(line)
    words = [word for word in words if len(word) is not 0]
    try:
        if check_meter(line, meter):
            return [line]
    except IndexError:
        print('Some words not pronouncable')
        return ''
    
    final = []
    last_end = 0
    while len(final) == 0 or last_end == len(words):
        longest = None
        start = 0
        if len(final) > 0:
            start = final[-1].end
            
        for i in range(start, len(words) - CHUNK_MIN_WORDS):
            result = False

            for j in range(i + CHUNK_MIN_WORDS, len(words) + 1):
                chunk = words[i:j]
                next_result, _ = check_meter(' '.join(chunk), meter, partial=True, start=0) # todo
                
                if next_result and not result:
                    final.append(Chunk(last_end, i, False))
                    
                result = next_result

                if result:
                    longest = Chunk(i, j, True)
                else:
                    break

            if longest is not None:
                final.append(longest)
                last_end = longest.end
                break
            else:
                print('ooops')
                final.append(Chunk(i, len(words) - 1, False))
                last_end = len(words) - 1
                break
                
    if final[-1].end < len(words) - 1:
        final.append(Chunk(final[-1].end, len(words), False))
    
    final = clean_chunks(final)
                
    sentence = []
    meter_state = 0
    for i, chunk in enumerate(tqdm(final)):
        if chunk.start == chunk.end:
            continue
            
        str_version = words[chunk.start:chunk.end]
        
        if chunk.correct:
            sentence.append(' '.join(str_version))
        else:
            str_version = make_chunk_fit_meter(str_version, meter, start=meter_state, return_early=return_early)
            if len(str_version) == 0:
                print('Failed')
                return ''
            sentence.append(' '.join(str_version))
            
        result, meter_state = check_meter(' '.join(str_version), meter, start=meter_state, partial=True)
        assert result
           
    return ' '.join(sentence)

In [17]:
INCLUDE_STOP = True
to_iambic = fit_meter("the quick brown fox jumped over the lazy dog", dactylic, return_early=False)
to_iambic

  if np.issubdtype(vec.dtype, np.int):





'the quick brown fox jumped up that lackadaisical rottweiler'

In [None]:
corpus = '''"Blessed are the poor in spirit, for theirs is the kingdom of heaven. Blessed are those who mourn, for they will be comforted. Blessed are the meek, for they will inherit the earth. Blessed are those who hunger and thirst for righteousness, for they will be filled.'''

In [None]:
check_meter(' '.join(['the', 'massachusetts']), dactylic, start=0, partial=True)

In [None]:
to_dactylic

In [None]:
to_dactylic = fit_meter(corpus, dactylic, return_early=30)