In [1]:
import pandas as pd
import pronouncing as prn
import string
import itertools
import torch
from collections import Counter
import skbio
import sklearn
import nltk
import gensim
import scipy


### Import and regularize data

In [41]:
# Takes in a text file with all the poems returns a list of poems with titles and no punctuation
def text_to_poems():
    with open('sonnets.txt', 'r') as file:
        poems_list = file.read().split('\n\n')
        
        # Get rid of poem titles, /n characters, and make characters lowercase
        poems = []
        for i in range(0, (int(len(poems_list) - 1)), 2):
            poem = poems_list[i+1].translate(str.maketrans('\n',' ',string.punctuation)).lower()
            poems.append(poem)
            

    return poems

poems_punc = text_to_poems()
poems_punc

['from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lightst flame with selfsubstantial fuel making a famine where abundance lies thyself thy foe to thy sweet self too cruel thou that art now the worlds fresh ornament and only herald to the gaudy spring within thine own bud buriest thy content and tender churl makest waste in niggarding pity the world or else this glutton be to eat the worlds due by the grave and thee',
 'when forty winters shall beseige thy brow and dig deep trenches in thy beautys field thy youths proud livery so gazed on now will be a tatterd weed of small worth held then being askd where all thy beauty lies where all the treasure of thy lusty days to say within thine own deepsunken eyes were an alleating shame and thriftless praise how much more praise deserved thy beautys use if thou couldst answer t

In [3]:
# Takes in a list of poems and returns a list of poems, with punctuation removed from each poem
def remove_punctuation(poems_punc):
    poems = []
    for poem in poems_punc:
        poem_phonemes = []
        poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
        poems.append(poem_no_punc)

    return poems

#poems_punc = text_to_poems()
#poems_no_punc = remove_punctuation(poems_punc)

### Analyze the poems using traditional methods

In [42]:
poems = text_to_poems()
#poems = remove_punctuation(poems_titled)
poems

['from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lightst flame with selfsubstantial fuel making a famine where abundance lies thyself thy foe to thy sweet self too cruel thou that art now the worlds fresh ornament and only herald to the gaudy spring within thine own bud buriest thy content and tender churl makest waste in niggarding pity the world or else this glutton be to eat the worlds due by the grave and thee',
 'when forty winters shall beseige thy brow and dig deep trenches in thy beautys field thy youths proud livery so gazed on now will be a tatterd weed of small worth held then being askd where all thy beauty lies where all the treasure of thy lusty days to say within thine own deepsunken eyes were an alleating shame and thriftless praise how much more praise deserved thy beautys use if thou couldst answer t

In [45]:
# Calculates the cosine distance for a pair of poems 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

def cosine_distance(poems):
    tdidf = TfidfVectorizer().fit_transform(poems)
    # Why does multiplying by transpose give you pairwise similarity?
    pairwise_similarity = tdidf * tdidf.T
    pw = pairwise_similarity.todense()
    cosine_distance = 1 - pw
    return cosine_distance

#print(pairwise_similarity)
dmc = cosine_distance(poems)
np.fill_diagonal(dmc, 0)
dmc

matrix([[0.        , 0.82589253, 0.8116863 , ..., 0.91400154, 0.93711408,
         0.9123514 ],
        [0.82589253, 0.        , 0.80687469, ..., 0.84839236, 0.92617191,
         0.95788864],
        [0.8116863 , 0.80687469, 0.        , ..., 0.87374825, 0.94457946,
         0.92896442],
        ...,
        [0.91400154, 0.84839236, 0.87374825, ..., 0.        , 0.92680882,
         0.96288889],
        [0.93711408, 0.92617191, 0.94457946, ..., 0.92680882, 0.        ,
         0.70743496],
        [0.9123514 , 0.95788864, 0.92896442, ..., 0.96288889, 0.70743496,
         0.        ]])

In [46]:
from skbio import DistanceMatrix
from skbio.tree import nj

dm = DistanceMatrix(dmc)#, ids)
dm
tree = nj(dm)
print(tree.ascii_art())

                                                  /-105
                                        /--------|
                                       |          \-58
                              /--------|
                             |         |          /-122
                             |          \--------|
                             |                    \-117
                             |
                             |          /-114
                             |         |
                             |         |                              /-119
                             |         |                    /--------|
                    /--------|         |                   |         |          /-111
                   |         |         |                   |          \--------|
                   |         |         |                   |                    \-74
                   |         |         |                   |
                   |         |         |                

In [9]:
# Calculates the Jaccard distance for all pairs of poems

In [None]:
# Calculates the Jaccard distance for all pairs of poems

In [None]:
# Does the neighbor joining algorithm on any given distance matrix

In [None]:
# Implement the neighbor joining method on distance matrices and return phylogenic tree:
def neighbor_joining(distances):
    data= distances
    ids = list('abcde')
    dm = DistanceMatrix(data, ids)
    tree = nj(dm)
    print(tree)
    return tree

### Analyze phonemes

In [4]:
# Make all words lowercase and make each line into a list of word. Returns list of lists
def poems_to_words(poems):
    poems_list = []
    for poem in poems:
        lines = list(map(lambda x:x.lower(), poem.split('\n')))
        poem= [l.split() for l in lines]
        poems_list.append(poem)
        
    return poems_list

#poems_punc = text_to_poems()
#poems_no_punc = remove_punctuation(poems_punc)
#poems_words = poems_to_words(poems_no_punc)
#poems_words[0][1][1]

In [5]:
# Words to phonemes where each poem is a list of lines, and each line is a string of phonemes
def words_to_phonemes(poems):
    phonemes = []
    slang = []
    
    for poem in poems:
        poem_phones = []
        for line in poem:
            line_phones = ""
            for word in line:
                p = prn.phones_for_word(word)
    
                if len(p) == 1:
                    for item in p:
                        line_phones = line_phones + " " + item
                elif len(p) > 1:
                    # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                    for item in p:
                        line_phones = line_phones + " " + item
                else: # the word is not in the pronouncing dictionary
                    slang.append(word)
                    line_phones = line_phones + word
                    
            poem_phones.append(line_phones)
                
        phonemes.append(poem_phones)
    
    return phonemes, slang

poems_punc = text_to_poems()
poems_no_punc = remove_punctuation(poems_punc)
poems_words = poems_to_words(poems_no_punc)
poems, slang = words_to_phonemes(poems_words)
print(len(poems[12][0]))
print(poems[0][0])
#print(phonemes[13][0][1][0].split())
#print(len(phonemes))
#print(len(slang))
#print(phonemes)
#print(phonemes[0][0][0][0])
#print(poems[0][0][2][0].split())

122
 AY1 F R AH1 M F EH1 R IH0 S T K R IY1 CH ER0 Z W IY1 D IH0 Z AY1 ER0 IH2 N K R IY1 S IH1 N K R IY2 S


In [12]:
from collections import defaultdict
pairs = defaultdict()

#pairs = {}
#for poem in poems:
for lines in poems:
    for line in lines:
        #print(line + 'line')
        lw = line.split()
        for c, v in enumerate(lw):
            pair = lw[c-1] + lw[c]
            if pair in pairs:
                pairs[pair] += 1
            else:
                pairs[pair] = 1

c = Counter(pairs) 
c.most_common()
sum(pairs.values())/154/10


48.269480519480524

In [8]:
# Find the sounds patterns that are most common:
# Count up 2-phoneme sounds that occur in the text

# Create the pairs
pairs = {}
for poem in poems:
    for line in poems[0]:
        pass
        #print(line)
            #print(phonemes.split())
            #for items in phonemes.split():
                #print(item)
                    # If item[count] is a V of AH1 or phoneme piece
                #print(item)
                #print(type(item[count]))
                
        
                #print(count, item)
            #print(item)
        
    #for line in poems

# Count the pairs
#pairs
for poem in poems:
    for line in poems:
        for j in range(len(line)-1):
        #print(j)
            pair = line[j] + line[j+1]
            #print(pair)
        #line_split = line.split()
        #print(line_split)
    #line = poems[0]
    #print(line)

for poem in poems:
    for line in poems:
        line = poems[0][0].split()
    for j in range(len(line)-1):
        #print(j)
        pair = l[j] + l[j+1]
        #print(pair)



NameError: name 'l' is not defined

In [None]:
# TODO figure out how to get the roman numerals out 
import re
re.compile('^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$')


In [268]:
# 39 phonemes in 
def make_one_hot(poems[0], C=39):
    '''
    Converts an integer label torch.autograd.Variable to a one-hot Variable.
    
    Parameters
    ----------
    labels : torch.autograd.Variable of torch.cuda.LongTensor
        N x 1 x H x W, where N is batch size. 
        Each value is an integer representing correct classification.
    C : integer. 
        number of classes in labels.
    
    Returns
    -------
    target : torch.autograd.Variable of torch.cuda.FloatTensor
        N x C x H x W, where C is class number. One-hot encoded.
    '''
    one_hot = torch.cuda.FloatTensor(labels.size(0), C, labels.size(2), labels.size(3)).zero_()
    target = one_hot.scatter_(1, labels.data, 1)
    
    target = Variable(target)
        
    return target

SyntaxError: invalid syntax (<ipython-input-268-7634e0671a48>, line 2)

In [491]:
# Words to phonemes where each poem is a list of lines, and each line is a list of phonemes
def words_to_phonemes(poems):
    phonemes = []
    slang = []
    
    for poem in poems:
        poem_phones = []
        for line in poem:
            line_phones = []
            
            for word in line:
                p = prn.phones_for_word(word)
    
                if len(p) == 1:
                    line_phones.append(p[0])
                elif len(p) > 1:
                    # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                    line_phones.append(str(p[0]))
                elif len(p) == 0: # the word is not in the pronouncing dictionary
                    #print(word)
                    slang.append(word)
                    line_phones.append(word)
                else: 
                    print('Not Good')
            
            poem_phones.append(line_phones)
                
        phonemes.append(poem_phones)
    
    return phonemes, slang

poems_punc = text_to_poems()
poems_no_punc = remove_punctuation(poems_punc)
poems_words = poems_to_words(poems_no_punc)
poems, slang = words_to_phonemes(poems_words)

print(poems[0])


[['AY1', 'F R AH1 M', 'F EH1 R IH0 S T', 'K R IY1 CH ER0 Z', 'W IY1', 'D IH0 Z AY1 ER0', 'IH2 N K R IY1 S'], ['DH AE1 T', 'DH EH1 R B AY1', 'beautys', 'R OW1 Z', 'M AY1 T', 'N EH1 V ER0', 'D AY1'], ['B AH1 T', 'AE1 Z', 'DH AH0', 'riper', 'SH UH1 D', 'B AY1', 'T AY1 M', 'D IH0 S IY1 S'], ['HH IH1 Z', 'T EH1 N D ER0', 'EH1 R', 'M AY1 T', 'B EH1 R', 'HH IH1 Z', 'M EH1 M ER0 IY0'], ['B AH1 T', 'DH AW1', 'K AA1 N T R AE0 K T AH0 D', 'T UW1', 'DH AY1 N', 'OW1 N', 'B R AY1 T', 'AY1 Z'], ['feedst', 'DH AY1', 'lightst', 'F L EY1 M', 'W IH1 DH', 'selfsubstantial', 'F Y UW1 AH0 L'], ['M EY1 K IH0 NG', 'AH0', 'F AE1 M AH0 N', 'W EH1 R', 'AH0 B AH1 N D AH0 N S', 'L AY1 Z'], ['DH AY2 S EH1 L F', 'DH AY1', 'F OW1', 'T UW1', 'DH AY1', 'S W IY1 T', 'S EH1 L F', 'T UW1', 'K R UW1 AH0 L'], ['DH AW1', 'DH AE1 T', 'AA1 R T', 'N AW1', 'DH AH0', 'W ER1 L D Z', 'F R EH1 SH', 'AO1 R N AH0 M AH0 N T'], ['AH0 N D', 'OW1 N L IY0', 'HH EH1 R AH0 L D', 'T UW1', 'DH AH0', 'G AO1 D IY0', 'S P R IH1 NG'], ['W IH0 DH I

In [295]:
for poem in results:
    print(len(results))
    for i in range(len(poem)):
        #print(len(poem))
    # Select by sentence
    #print(poem[0][2])
        print(phonemes[0][0][i][0].split())
    # Select by word
    #print(poem[0][3])
    #print(results[0][0][0][i][0])

2
['AY1']
['F', 'R', 'AH1', 'M']
['F', 'EH1', 'R', 'IH0', 'S', 'T']
['K', 'R', 'IY1', 'CH', 'ER0', 'Z']
['W', 'IY1']
['D', 'IH0', 'Z', 'AY1', 'ER0']
['IH2', 'N', 'K', 'R', 'IY1', 'S']


IndexError: list index out of range

In [297]:
def main():
    poems_punc = text_to_poems()
    poems_no_punc = remove_punctuation(poems_punc)
    poems_words = poems_to_words(poems_no_punc)
    phonemes, slang = words_to_phonemes(poems_words)
    return phonemes, slang

#results = main()

In [446]:
# TODO when there are multiple pronunciations available, choose the one that fits that 
# metrical pattern
# calculate the metrical pattern of the whole poem -> use this to find the local pattern
# use the local pattern to figure out what stress we want to put in
# compare stresses of different pronunciations, and choose the first one that matches
#

In [None]:
def clean_poem(poem):
    words = []
    poem_phonemes = []
    poem = poem.lower()
    poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
    words.append(poem_no_punc)#.split('\n'))
    words = [word for word in words]
    for word in words:
        poem_phonemes.append(prn.phones_for_word(word))
                             
    return poem_phonemes


a = map(clean_poem, [poem for poem in poems])


In [None]:
# Get the sylable count, and use it to make the data into a matrix
# Put every 10 sylables into a list or matrix -> should I use numpy?


In [60]:
  #data = file.read().split('\n\n')
        #print(poems_list)
        #poems_list = [item.lower() for item.split('\n') in data]

        #poems_list = [p.replace('\n',' ') for p in data]
                
        #print(poems_clean[1])
        #print(poem.split('\n')[0:10] for poem in poems_clean)        
        #print(type(poems_list[100].split('\n')[0]))
        #print(poems_list[101].split('\n')[0:100])
                      

In [492]:
for poem in poems:
    for line in poem:
        for words in line:
            for phoneme in word:
                phonemes.append(phoneme)
            

NameError: name 'word' is not defined

In [7]:
# Creates distance matrix from any distances
def distance_matrix(poems, distance_function):
    distance_ matrix = [[]]
    for i in range(len(poems)):
        for j in range(len(poems)):
            distance_matrix[i][j] = distance_function(poem[i], poem[j])
    return distance_matrix

SyntaxError: invalid syntax (<ipython-input-7-0e6bc52f4d78>, line 3)