In [1]:
import pronouncing
from itertools import product
import functools
import operator
from sacremoses import MosesTokenizer, MosesDetokenizer
from nltk.corpus import stopwords
from tqdm import tqdm_notebook as tqdm
import string
import gensim
from multiprocessing import Pool
from random import choice
import textstat

In [115]:
def pronounceable(word): return len(pronouncing.phones_for_word(word.strip())) > 0

In [117]:
# Simple tokenizer that doesn't mess with contractions and ignores unpronouncable words
def tokenize(phrase):
    tokenized = []
    
    buffer = ''
    for char in phrase:
        if len(char.strip()) == 0:
            if pronounceable(buffer):
                tokenized.append(buffer)
            buffer = ''
        elif char in string.punctuation:
            if pronounceable(buffer):
                tokenized.append(buffer)
            tokenized.append(char)
            buffer = ''
        else:
            buffer += char
            
    tokenized.append(buffer)
    
    return [tok for tok in tokenized if len(tok.strip()) > 0]

detokenize = MosesDetokenizer().detokenize # mostly compatible with my tokenizer
print(detokenize(tokenize("Hello, 'world'!")))

Hello, 'world'!


In [3]:
model = gensim.models.KeyedVectors.load('/mnt/bigfiles/dl/datasets/gensim-fasttext-300d-2M.vec', mmap='r')

In [4]:
def remove_punctuation(phrase): return phrase.translate(str.maketrans({a:' ' for a in string.punctuation}))

In [5]:
stop = set(stopwords.words('english'))

INCLUDE_STOP = True
def synonyms(word):
    if word in string.punctuation:
        return [word]
    
    if not INCLUDE_STOP and word in stop:
        return [word]
    
    most_similar = [word] + [data[0].lower().replace(' ', '-') for data in model.most_similar(positive=[word.lower()], topn=50)]
    only_pronouncable = [word for word in most_similar if len(word) > 0 and len(pronouncing.phones_for_word(word)) > 0]
    
    return list(set(only_pronouncable))[:20]

synonyms('spirit')

  if np.issubdtype(vec.dtype, np.int):


['soul', 'essence', 'spirit', 'spirits', 'kindred']

In [8]:
def get_meter(line):
    syllables = 0 
    stresses = []
    for word in line:
        if word not in string.punctuation:
            phonemes = pronouncing.phones_for_word(word)[0]
            syllables += pronouncing.syllable_count(phonemes)
            stresses.append(list(map(int, pronouncing.stresses(phonemes))))

    return syllables, stresses

In [9]:
get_meter(tokenize('that was amazing'))

(5, [[1], [1], [0, 1, 0]])

In [10]:
def check_meter(line, meter, partial=False, start=0): # meter is an array with False for unstressed, True for stressed
    cycle_len = len(meter)
    meter_step = start
    
    _, stresses = get_meter(line)
    
    for stress in stresses:
        if len(stress) == 1:
            meter_step += 1
            continue # one syllable words can be stressed or not stressed
        
        for syllable in stress:
            if syllable == 2:
                meter_step += 1
                continue # lightly stressed, can be either
                
            if (syllable == 1) == meter[meter_step % cycle_len]: 
                meter_step += 1
                continue
            else:
                return False if not partial else (False, meter_step % cycle_len)
            
    if not partial:
        if meter_step % cycle_len == 0: # end at the correct step
            return True if not partial else (True, meter_step % cycle_len)
        else:
            return False if not partial else (False, meter_step % cycle_len)
    else:
        return True if not partial else (True, meter_step % cycle_len)

In [11]:
def find_start(line, meter):
    for start in range(meter):
        worked, state = check_meter(line, meter, partial=True, start=start)
        if worked:
            return start, state
    return None

In [14]:
iambic = [False, True]
trochaic = [True, False]
dactylic = [True, False, False]
dactylic_hexameter = dactylic * 5 + [True, False]

In [15]:
print(check_meter(tokenize('How do I love thee? Let me count the ways.'), iambic))
print(check_meter(tokenize('Is this smart enough to catch bad meter?'), iambic))
print(check_meter(tokenize('Once upon a midnight dreary, while I pondered, weak and weary'), trochaic))
print(check_meter(tokenize('This is the forest primeval, the murmuring pines and the hemlock'), dactylic_hexameter))

True
False
True
True


In [16]:
test_poem = \
'''How do I love thee? Let me count the ways.
I love thee to the depth and breadth and height
My soul can reach, when feeling out of sight
For the ends of being and ideal grace.
I love thee to the level of every day's
Most quiet need, by sun and candle-light.
I love thee freely, as men strive for right.
I love thee purely, as they turn from praise.
I love thee with the passion put to use
In my old griefs, and with my childhood's faith.
I love thee with a love I seemed to lose
With my lost saints. I love thee with the breath,
Smiles, tears, of all my life; and, if God choose,
I shall but love thee better after death.
'''.split('\n')
for line in test_poem:
    try:
        if not check_meter(tokenize(line), iambic):
            print(line)
    except:
        pass

For the ends of being and ideal grace.


In [21]:
# https://stackoverflow.com/questions/32074543/how-to-get-the-length-of-an-itertools-product
def product_length(items): return functools.reduce(operator.mul, map(len, items), 1)

def make_chunk_fit_meter(words, meter, start=0, return_early=10):
    correct = []
    word_synonyms = list(map(synonyms, words))
                
    for possible in product(*word_synonyms):
        try:
            result = check_meter(possible, meter, start=start)
        except:
            continue
            
        if result:
            correct.append(list(possible))

            if return_early and len(correct) >= return_early:
                break
            
    if len(correct) > 0:
        return min(correct, key=lambda item: textstat.flesch_reading_ease(detokenize(item)))
    else:
        return ''

In [22]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [23]:
class Chunk:
    def __init__(self, start, end, correct):
        self.start = start
        self.end = end
        self.correct = correct

In [24]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def partition(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
CHUNK_MAX_WORDS = 12
def clean_chunks(chunks):
    cleaned = []
    
    for chunk in chunks:
        if chunk.correct == False and (chunk.end - chunk.start) >= CHUNK_MAX_WORDS:
            chunklets = []
            indices = partition(range(chunk.start, chunk.end), CHUNK_MAX_WORDS - 2)
            for i in indices:
                chunklets.append(Chunk(i[0], i[-1] + 1, False))
            cleaned += chunklets
        else:
            cleaned.append(chunk)
            
    return cleaned

In [83]:
CHUNK_MIN_WORDS = 3

def fit_meter(line, meter, return_early=10):
    words = tokenize(line)
    try:
        if check_meter(words, meter):
            return [line]
    except IndexError:
        print('Some words not pronouncable')
        return ''
    
    final = []
    last_end = 0
    while len(final) == 0 or last_end == len(words):
        longest = None
        start = 0
        if len(final) > 0:
            start = final[-1].end
            
        for i in range(start, len(words) - CHUNK_MIN_WORDS):
            result = False

            for j in range(i + CHUNK_MIN_WORDS, len(words) + 1):
                chunk = words[i:j]
                next_result, _ = check_meter(chunk, meter, partial=True, start=0) # todo
                
                if next_result and not result:
                    final.append(Chunk(last_end, i, False))
                    
                result = next_result

                if result:
                    longest = Chunk(i, j, True)
                else:
                    break

            if longest is not None:
                final.append(longest)
                last_end = longest.end
                break
            else:
                print('ooops')
                final.append(Chunk(i, len(words) - 1, False))
                last_end = len(words) - 1
                break
                
    if final[-1].end < len(words) - 1:
        final.append(Chunk(final[-1].end, len(words), False))
    
    final = clean_chunks(final)
                
    sentence = []
    meter_state = 0
    for i, chunk in enumerate(tqdm(final)):
        if chunk.start == chunk.end:
            continue
            
        str_version = words[chunk.start:chunk.end]
        
        if chunk.correct:
            sentence.append(str_version)
        else:
            str_version = make_chunk_fit_meter(str_version, meter, start=meter_state, return_early=return_early)
            if len(str_version) == 0:
                print("Failed, but I'll keep going")
                sentence.append('...')
            else:
                sentence.append(str_version)
            
        result, meter_state = check_meter(str_version, meter, start=meter_state, partial=True)
        assert result
           
    return detokenize([word for chunk in sentence for word in chunk])

In [84]:
INCLUDE_STOP = True
to_iambic = fit_meter("The quick, brown fox jumped over the lazy dog.", dactylic, return_early=10)
to_iambic

  if np.issubdtype(vec.dtype, np.int):





'The quick, brown fox jumped up this lackadaisical terrier.'

In [118]:
corpus = '''"Blessed are the poor in spirit, for theirs is the kingdom of heaven. Blessed are those who mourn, for they will be comforted. Blessed are the meek, for they will inherit the earth. Blessed are those who hunger and thirst for righteousness, for they will be filled.'''

In [119]:
INCLUDE_STOP = False
to_dactylic = fit_meter(corpus, dactylic, return_early=30)
print(to_dactylic)

  if np.issubdtype(vec.dtype, np.int):


Failed, but I'll keep going

"Blessed are the poor in soul, for theirs is the throne of heavenly. wonderful are those who sadness, for they will be comforted. wonderful are the subservient, for they will heirs the earth. thankful are those who starvation and...


In [120]:
pride = open('/mnt/bigfiles/dl/datasets/Gutenberg/Jane Austen___Pride and Prejudice.txt').read()
print(pride[:1000])


PRIDE AND PREJUDICE

By Jane Austen



Chapter 1


It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered the rightful property
of some one or other of their daughters.

"My dear Mr. Bennet," said his lady to him one day, "have you heard that
Netherfield Park is let at last?"

Mr. Bennet replied that he had not.

"But it is," returned she; "for Mrs. Long has just been here, and she
told me all about it."

Mr. Bennet made no answer.

"Do you not want to know who has taken it?" cried his wife impatiently.

"_You_ want to tell me, and I have no objection to hearing it."

This was invitation enough.

"Why, my dear, you must know, Mrs. Long says that Netherfield is taken
by a young man of large fortune from the north of England; that h

In [121]:
INCLUDE_STOP = False
dactylic_pride = fit_meter(pride[:1000], dactylic, return_early=30)
print(dactylic_pride)

ooops


  if np.issubdtype(vec.dtype, np.int):


Failed, but I'll keep going
Failed, but I'll keep going
Failed, but I'll keep going



smugness through bigotry via patricia mccafferty Chapter because is a falsity utterly downplayed, that a individual woman in ownership of a bad luck, would be in will of a wife. However wee famous the moods or views of such a gentleman would be on his one reentering... the souls of the overlying kids, that he is proved the inheritor property of some another or other of their wife. "his sweetie philp. cunningham," joked his maid to him another week, "have you talked that parks is letting at previous?" philp. lynde responded that he had not. "truthfully it is," recaptured she; "...... stewart made no ask." Do you not will to understand who has granted it? "weeping his son-in-law tensely." _ You _ will to tale me, and I have no rejoinder to seeing it. "this was invites insufficient." why, my bless, you would think, belasco. loong cites that is shown by a middle-aged woman of sizeable luck from the westward of norwich; that


In [44]:
tokenize('hello, i\'ve been waiting')

['hello', ',', 'i', '&apos;ve', 'been', 'waiting']