### Import and Clean Poems

In [47]:
import string
from Bio import Phylo
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
import pronouncing as prn
import gensim
import scipy
import ete3

In [2]:
# Returns a list of poems, lowecase, no punctuation
def poems_list(file_in):
    with open(file_in, 'r') as file:
        poems_list = file.read().split('\n\n')
        # Get rid of poem titles, /n characters, and make characters lowercase
        poems = [item.translate(str.maketrans('\n',' ',string.punctuation)).lower() for item in poems_list[1::2]]
            
    return poems

In [3]:
# Returns a list of poems where poem is list of lines, lowecase, no punctuation
def make_poems_lines(file_in):
    
    '''Takes a text file of poems, returns a list of poems with titles and no punctuation'''
    
    poems = []    
    with open(file_in, 'r') as file:
        poems_raw = file.read().split('\n\n')
        poems_clean = [poem.split('\n')for poem in poems_raw]
        for i in range(0, (int(len(poems_clean) - 1)), 2):
            poem = poems_clean[i+1]
            poems.append(poem)
            
    poems_clean = []
    for poem in poems:
        poem_clean = []
        for line in poem:
            poem_no_punc = line.translate(str.maketrans('','',string.punctuation)).rstrip().lower()
            poem_clean.append(poem_no_punc)

            
        poems_clean.append(poem_clean)

    return poems_clean

In [4]:
poems_lines = make_poems_lines('sonnets.txt')

In [5]:
poems_list_words = poems_list('sonnets.txt')

### Words to Phonemes

In [6]:
# Returns list of poems with phonemes instead of words, and list of slang words
def poems_to_phones(poems):
    poems_phones = []
    poems_slang = []
    
    for poem in poems:
        poem_phones = ""
        poem_slang = []
        
        words = poem.split()
        for word in words:
            p = prn.phones_for_word(word)
            if len(p) == 1: # there is only one pronunciation
                poem_phones = poem_phones + p[0] + " "
            elif len(p) > 1: # there is more than one pronunciation
                # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                poem_phones = poem_phones + p[0] + " "
            else: # the word is not in the pronouncing dictionary
                poem_slang.append(word)
                poem_phones = poem_phones + word + " "
        
        poems_slang.append(poem_slang)    
        poems_phones.append(poem_phones)
        
    return poems_phones, poems_slang

In [7]:
phonemes, slang = poems_to_phones(poems_list_words)

### Phonemes to Phoneme Types

In [8]:
# Returns list of poems with phonemes instead of words, and list of slang words
def phones_to_type(poems, do_vowels=False):
    poems_types = []
    
    for poem in poems:
        poem_types = ""
        phones = poem.split()
        
        for phone in phones:
            if phone in ['F', 'V', 'S', 'Z','HH', 'TH', 'SH', 'ZH', 'JH']:
                poem_types = poem_types + 'fric' + " "
            elif phone in ['P', 'B', 'T', 'D', 'K', 'G', 'CH', 'DH']:
                poem_types = poem_types + 'plos' + " "
            elif phone in ['M', 'N', 'NG']:
                poem_types = poem_types + 'nasal' + " "
            elif phone in ['L', 'R', 'W', 'Y']:
                poem_types = poem_types + 'approx' + " "

            if do_vowels is True: # to-do make this add specific vowel types
                poem_types = poem_types + 'vowel' + " "
            else:
                pass
                

        
        poems_types.append(poem_types)
        
    return poems_types

In [9]:
phon_types = phones_to_type(phonemes)
phon_types

['fric approx nasal fric approx fric plos plos approx plos fric approx plos fric nasal plos approx fric plos plos plos approx plos approx fric nasal plos nasal fric plos plos plos fric plos fric plos plos plos nasal plos fric fric fric fric plos nasal plos approx nasal plos plos approx fric fric nasal nasal plos plos plos plos nasal plos approx plos plos plos plos plos nasal nasal plos approx plos fric plos fric approx nasal approx plos fric approx approx nasal plos nasal fric nasal nasal approx approx plos nasal plos nasal fric approx fric plos fric approx fric plos fric plos plos fric approx plos fric approx fric plos plos approx approx plos plos plos approx plos nasal plos approx approx plos fric fric approx fric approx nasal nasal nasal plos nasal plos nasal approx fric approx approx plos plos plos plos plos fric plos approx nasal approx plos nasal plos nasal nasal plos plos plos plos nasal plos nasal plos nasal plos plos nasal plos approx fric plos nasal plos plos plos approx appr

### Td-idf of Words

In [10]:
# Get the pairwise similarity for each poem using td-idf
def tdidf_dist(poems):
    tdidf = TfidfVectorizer().fit_transform(poems)
    pair_sim = (tdidf * tdidf.T).todense()
    cos_dist = 0 + pair_sim
    np.fill_diagonal(cos_dist, 0)
    return cos_dist

In [11]:
tdidf_words = tdidf_dist(poems_list_words)

In [12]:
tdidf_phonemes = tdidf_dist(phonemes)

### Make Trees and Networks from Distance Matrices

In [13]:
from skbio import DistanceMatrix
from skbio.tree import nj

In [18]:
def make_tree(dist_mat, file, inline=False, rooted=False):
    dm = DistanceMatrix(dist_mat)#, ids)
    
    if rooted is True:
        tree = nj(dm, disallow_negative_branch_length=False).root_at_midpoint()
    else:
        tree = nj(dm, disallow_negative_branch_length=False)

    if inline is True:
        print(tree.ascii_art())
    else:
        with open(file, 'w') as f:
            f.write(tree.ascii_art())        

In [19]:
make_tree(tdidf_phonemes, 'tree_phonemes.txt', inline=True)

                                                  /-52
                                        /--------|
                                       |          \-46
                              /--------|
                             |         |          /-147
                             |          \--------|
                             |                    \-19
                    /--------|
                   |         |                    /-69
                   |         |          /--------|
                   |         |         |         |          /-142
                   |         |         |          \--------|
                   |          \--------|                    \-82
                   |                   |
                   |                   |          /-28
                   |                    \--------|
                   |                             |          /-100
          /--------|                              \--------|
         |         |             

In [None]:
make_tree(tdidf_words, 'tree_words.txt')

### Make Newick

In [20]:
def make_newick(dist_mat, file_out):
    dm = DistanceMatrix(dist_mat)
    tree = nj(dm, disallow_negative_branch_length=False)
    tree.write(file_out)

In [21]:
make_newick(tdidf_words, 'newick_words.txt')    

### Use ETE to make tree images

In [85]:
from ete3 import Tree, faces, AttrFace, TreeStyle, NodeStyle

def layout(node):
    # If node is a leaf, add the nodes name and a its scientific name
    if node.is_leaf():
        faces.add_face_to_node(AttrFace("name"), node, column=0)

def make_circular_tree(): 
    t = Tree("newick_words.txt")
    
    # Set bold branch to the root node
    style = NodeStyle()
    style["fgcolor"] = "#0f0f0f"
    style["size"] = 0
    style["vt_line_color"] = "#003B70"
    style["hz_line_color"] = "#003B70"
    style["vt_line_width"] = 7
    style["hz_line_width"] = 7
    style["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style["hz_line_type"] = 0
    t.set_style(style)
    
    #Set solid lines to the first two branches
    style1 = NodeStyle()
    style1["fgcolor"] = "#0f0f0f"
    style1["size"] = 0
    style1["vt_line_color"] = "#003B70"
    style1["hz_line_color"] = "#003B70"
    style1["vt_line_width"] = 7
    style1["hz_line_width"] = 7
    style1["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style1["hz_line_type"] = 0
        
    # Set thick dashed blue lines 
    style3 = NodeStyle()
    style3["shape"] = "circle"
    style3["vt_line_color"] = "#003B70"
    style3["hz_line_color"] = "#003B70"
    style3["vt_line_width"] = 5
    style3["hz_line_width"] = 5
    style3["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style3["hz_line_type"] = 0
    
    # Set thin dashed blue lines 
    style4 = NodeStyle()
    style4["shape"] = "circle"
    style4["vt_line_color"] = "#003B70"
    style4["hz_line_color"] = "#003B70"
    style4["vt_line_width"] = 4
    style4["hz_line_width"] = 4
    style4["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style4["hz_line_type"] = 0
    
    # Set thin solid blue lines 
    style5 = NodeStyle()
    style5["shape"] = "circle"
    style5["vt_line_color"] = "#003B70"
    style5["hz_line_color"] = "#003B70"
    style5["vt_line_width"] = 3
    style5["hz_line_width"] = 3
    style5["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style5["hz_line_type"] = 0
    
    # Set thin dashed blue lines 
    style6 = NodeStyle()
    style6["shape"] = "circle"
    style6["vt_line_color"] = "#003B70"
    style6["hz_line_color"] = "#003B70"
    style6["vt_line_width"] = 3
    style6["hz_line_width"] = 3
    style6["vt_line_type"] = 1 # 0 solid, 1 dashed, 2 dotted
    style6["hz_line_type"] = 1
    
    style7 = NodeStyle()
    style7["shape"] = "circle"
    style7["vt_line_color"] = "#003B70"
    style7["hz_line_color"] = "#003B70"
    style7["vt_line_width"] = 6
    style7["hz_line_width"] = 6
    style7["vt_line_type"] = 0 # 0 solid, 1 dashed, 2 dotted
    style7["hz_line_type"] = 0
    
    # Set dotted blue lines in all leaves
    style2 = NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#003B70"
    style2["hz_line_color"] = "#003B70"
    style2["vt_line_width"] = 3
    style2["hz_line_width"] = 3
    style2["vt_line_type"] = 2 # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 2
    for l in t.iter_leaves():
        l.img_style = style2
        
    #for node in t.traverse("postorder"):
        #node.img_style["size"] = 15
        #print(dir(node.img_style))
        
    for node in t.children:
        node.img_style = style1

        for child in node.children:
            child.img_style = style1

            for childer in child.children:
                childer.img_style = style1

                for childest in childer.children:
                    childest.img_style = style7

                    for next_of_kin in childest.children:
                        next_of_kin.img_style = style3
                        
                        for last_surviving in next_of_kin.children:
                            last_surviving.img_style = style4
                            
                            for a in last_surviving.children:
                                a.img_style = style5
                                
                                for b in a.children:
                                    b.img_style = style6

        #item.children[1].img_style = style1
        #item.children[2].img_style = style1

    ts = TreeStyle()
    #ts.scale = 5520
    ts.optimal_scale_level = 'mid'
    ts.show_leaf_name = 'True'
    ts.mode = "c"
    ts.show_scale = 'False'
    ts.layout_fn = layout
    ts.complete_branch_lines_when_necessary = 'False'
    ts.show_leaf_name = False
    ts.root_opening_factor = 1
     

    return t, ts

t, ts = make_circular_tree()
#t.show(tree_style=ts)
t.render("phylotree_circular_open.png", w=5000, tree_style=ts)

{'faces': [[4153.847726147054,
   1699.263778681213,
   4227.6740176178055,
   1773.0900701519638,
   289,
   '10'],
  [764.975492855526,
   2839.1082956693917,
   805.250981920512,
   2899.6686841469964,
   147,
   '4'],
  [4299.277019812506,
   2845.003951833261,
   4365.201381131796,
   2910.928313152551,
   14,
   '69'],
  [3860.1331528542783,
   3719.0229266695624,
   3938.937215254516,
   3797.8269890698,
   39,
   '62'],
  [1354.871901314489,
   986.8575991716098,
   1432.7900196510184,
   1064.7757175081392,
   201,
   '84'],
  [4110.728450255571,
   3517.685742011035,
   4187.813320993101,
   3594.770612748565,
   32,
   '56'],
  [985.058442642077,
   3188.2412282611513,
   1059.2813884989025,
   3262.464174117977,
   135,
   '65'],
  [927.4858443435558,
   3446.6571465968477,
   1029.863565247091,
   3539.5746541753656,
   130,
   '144'],
  [514.5334372648808,
   3055.2729708803818,
   583.4323310486301,
   3124.1718646641307,
   142,
   '94'],
  [3395.309024121726,
   890.78

In [29]:
from ete3 import Tree, TreeStyle, CircleFace, TextFace, NodeStyle
ts = TreeStyle()
ts.scale = 6520
#ts.rotation = 90
ts.mode = "c" # draw tree in circular mode
t = Tree( "newick_words.txt" )

nstyle = NodeStyle()
nstyle["hz_line_type"] = 0
for n in t.traverse():
    n.set_style(nstyle)


#t.render("phylotree_circular.png", h = 9500, dpi=1600, tree_style=ts)

#ts.legend.add_face(CircleFace(10, "red"), column=0)
#ts.legend.add_face(TextFace("0.5 support"), column=1)

t.show(tree_style = ts)


### Count phoneme frequencies and use to make distance matrix

In [250]:
def count_phon_types(poems):
    dict_list = []
    
    for poem in poems:
        type_ct = dict([('nasal', 0), ('plos', 0), ('fric', 0), ('approx', 0)])
        
        for item in poem.split():
            if item == 'nasal':
                type_ct['nasal'] = type_ct['nasal'] + 1
            elif item == 'plos':
                type_ct['plos'] = type_ct['plos'] + 1
            elif item == 'fric':
                type_ct['fric'] = type_ct['fric'] + 1
            elif item == 'approx':
                type_ct['approx'] = type_ct['approx'] + 1
        
        dict_list.append(list(type_ct.values()))
        
    return dict_list

In [252]:
phon_cts = count_phon_types(phon_types)
phon_cts

[[35, 38, 41, 88],
 [36, 44, 48, 74],
 [29, 61, 38, 79],
 [24, 55, 38, 86],
 [33, 54, 59, 81],
 [36, 41, 38, 85],
 [38, 54, 44, 65],
 [47, 50, 50, 85],
 [30, 62, 52, 89],
 [31, 52, 36, 82],
 [34, 52, 46, 82],
 [42, 61, 38, 93],
 [36, 58, 59, 68],
 [42, 57, 42, 89],
 [37, 58, 44, 75],
 [48, 48, 64, 75],
 [37, 62, 50, 76],
 [47, 63, 40, 71],
 [46, 51, 41, 86],
 [46, 57, 38, 72],
 [41, 67, 49, 69],
 [38, 47, 44, 79],
 [43, 69, 46, 77],
 [41, 58, 40, 86],
 [31, 56, 51, 77],
 [42, 46, 39, 88],
 [48, 46, 46, 82],
 [45, 41, 35, 95],
 [55, 62, 42, 85],
 [49, 61, 44, 65],
 [34, 58, 50, 75],
 [35, 60, 39, 82],
 [57, 63, 52, 64],
 [37, 50, 50, 87],
 [50, 68, 38, 77],
 [40, 44, 43, 85],
 [29, 57, 43, 93],
 [47, 54, 39, 91],
 [46, 51, 40, 77],
 [33, 51, 50, 69],
 [31, 42, 54, 88],
 [42, 72, 43, 69],
 [38, 51, 40, 87],
 [44, 67, 42, 85],
 [41, 49, 35, 80],
 [35, 51, 42, 100],
 [42, 54, 37, 91],
 [33, 64, 54, 81],
 [47, 68, 38, 89],
 [47, 57, 36, 88],
 [44, 59, 46, 74],
 [35, 66, 46, 77],
 [42, 51, 4

In [253]:
def cos_dist_phon_cts(phon_cts):
    dist_mat = np.zeros((153, 153))
    for i in range(153):
        poem1 = phon_cts[i]

        for j in range(153):
            poem2 = phon_cts[j]
            dist = scipy.spatial.distance.cosine(poem1, poem2)

            dist_mat[i,j] = dist
    return dist_mat

In [254]:
phon_type_cts_dm = cos_dist_phon_cts(phon_cts)     
make_tree(phon_type_cts_dm, 'tree_type_cts.txt')

### Count pairs of phonemes

In [265]:
from collections import defaultdict
pairs = {}

def count_pairs(poems):
    for poem in poems:
        
        lw = poem.split()
        for c, v in enumerate(lw):
            pair = lw[c-1] + lw[c]
            if pair in pairs:
                pairs[pair] += 1
            else:
                pairs[pair] = 1
                
    return pairs

phon_pair_cts = count_pairs(phonemes)
type(phon_pair_cts.values())
filtered_dict = {k:v for k,v in phon_pair_cts.items() if v > 10}
filtered_dict

#c = Counter(pairs) 
#c.most_common()
#sum(pairs.values())/154/10
#Can I do tdidf of the pairs?

{'AA1D': 11,
 'AA1L': 18,
 'AA1M': 16,
 'AA1N': 164,
 'AA1R': 317,
 'AA1S': 35,
 'AA1T': 188,
 'AA1Z': 33,
 'AE0N': 13,
 'AE1D': 62,
 'AE1F': 16,
 'AE1K': 44,
 'AE1L': 80,
 'AE1M': 41,
 'AE1N': 203,
 'AE1NG': 26,
 'AE1P': 30,
 'AE1S': 66,
 'AE1T': 361,
 'AE1TH': 47,
 'AE1V': 87,
 'AE1Z': 121,
 'AH0B': 71,
 'AH0D': 87,
 'AH0F': 64,
 'AH0G': 52,
 'AH0HH': 19,
 'AH0JH': 28,
 'AH0K': 61,
 'AH0L': 268,
 'AH0M': 118,
 'AH0N': 922,
 'AH0P': 91,
 'AH0R': 21,
 'AH0S': 210,
 'AH0T': 121,
 'AH0TH': 13,
 'AH0V': 21,
 'AH0W': 89,
 'AH0Z': 42,
 'AH1B': 14,
 'AH1CH': 53,
 'AH1D': 19,
 'AH1DH': 47,
 'AH1F': 12,
 'AH1L': 20,
 'AH1M': 194,
 'AH1N': 125,
 'AH1NG': 30,
 'AH1P': 19,
 'AH1S': 66,
 'AH1T': 238,
 'AH1TH': 19,
 'AH1V': 610,
 'AO1F': 17,
 'AO1L': 183,
 'AO1N': 20,
 'AO1NG': 49,
 'AO1R': 674,
 'AO1S': 21,
 'AO1T': 58,
 'AO1TH': 89,
 'AO2R': 27,
 'AW1AA1': 35,
 'AW1AH0': 14,
 'AW1B': 20,
 'AW1D': 38,
 'AW1DH': 25,
 'AW1ER0': 68,
 'AW1F': 12,
 'AW1HH': 16,
 'AW1K': 12,
 'AW1L': 12,
 'AW1M': 18,
 '

### Old

In [7]:
import pronouncing as prn
def poems_to_phonemes(poems): 
    phonemes = []
    slang = []
    
    for poem in poems:
        poem_phonemes = []
        poem_slang = []
        
        for line in poem:
            line_phonemes = []
            
            for word in line.split():
                p = prn.phones_for_word(word)
                if len(p) == 1: # there is only one pronunciation
                    line_phonemes.append(p[0])
                
                elif len(p) > 1: # there is more than one pronunciation
                    # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                    line_phonemes.append(p[0]) 
                
                else: # the word is not in the pronouncing dictionary
                    poem_slang.append(word)
                    line_phonemes.append(word)
                    
            poem_phonemes.append(line_phonemes)
        
        phonemes.append(poem_phonemes)
        
        slang.append(poem_slang)
    
    return phonemes, slang
                                       
phonemes, slang = poems_to_phonemes(poems_clean) 

In [45]:
ph_lines = []
for line in phonemes:
    input = [" ".join(x) for x in line] 
    ph_lines.append(input)
ph_lines

[['F R AH1 M F EH1 R IH0 S T K R IY1 CH ER0 Z W IY1 D IH0 Z AY1 ER0 IH2 N K R IY1 S',
  'DH AE1 T DH EH1 R B AY1 beautys R OW1 Z M AY1 T N EH1 V ER0 D AY1',
  'B AH1 T AE1 Z DH AH0 riper SH UH1 D B AY1 T AY1 M D IH0 S IY1 S',
  'HH IH1 Z T EH1 N D ER0 EH1 R M AY1 T B EH1 R HH IH1 Z M EH1 M ER0 IY0',
  'B AH1 T DH AW1 K AA1 N T R AE0 K T AH0 D T UW1 DH AY1 N OW1 N B R AY1 T AY1 Z',
  'feedst DH AY1 lightst F L EY1 M W IH1 DH selfsubstantial F Y UW1 AH0 L',
  'M EY1 K IH0 NG AH0 F AE1 M AH0 N W EH1 R AH0 B AH1 N D AH0 N S L AY1 Z',
  'DH AY2 S EH1 L F DH AY1 F OW1 T UW1 DH AY1 S W IY1 T S EH1 L F T UW1 K R UW1 AH0 L',
  'DH AW1 DH AE1 T AA1 R T N AW1 DH AH0 W ER1 L D Z F R EH1 SH AO1 R N AH0 M AH0 N T',
  'AH0 N D OW1 N L IY0 HH EH1 R AH0 L D T UW1 DH AH0 G AO1 D IY0 S P R IH1 NG',
  'W IH0 DH IH1 N DH AY1 N OW1 N B AH1 D buriest DH AY1 K AA1 N T EH0 N T',
  'AH0 N D T EH1 N D ER0 churl makest W EY1 S T IH0 N niggarding',
  'P IH1 T IY0 DH AH0 W ER1 L D AO1 R EH1 L S DH IH1 S glutton B I

### Compare Poems

### Import data

In [None]:
import itertools
def compare():
    for a, b in itertools.combinations(mylist, 2):
    distance = compare(a, b)
    return distance
# td-idf creates this all at once - what about word2vec

In [48]:
def read_files(file_in):
    '''Takes a text file of poems, returns a list of poems with titles and no punctuation'''
    poems_titled = []    
    with open(file_in, 'r') as file:
        poems_raw = file.read().split('\n\n')

        # Poems and their titles got split up when parsing - reunite them
        for i in range(0, (int(len(poems_raw) - 1)), 2):
            poem_with_title = poems_raw[i] + " " + poems_raw[i+1]
            poems_titled.append(poem_with_title)

    return poems_titled
poems_punc = read_files('sonnets.txt')

In [49]:
def remove_punctuation(poems_punc):

    '''Takes a list of poems, and returns a list of poems with punctuation removed from each poem'''

    for poem in poems_punc:
        poem_phonemes = []
        poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
        poems_no_punc.append(poem_no_punc)

    return poems_no_punc
poems_no_punc = remove_punctuation(poems_punc)

### phone2vec

In [41]:
from gensim.models import Word2Vec
def word2vec():
    model = Word2Vec(sentences, min_count=1)

### Create Poem Object

In [9]:
# Calculates the Jaccard distance for all pairs of poems

### Analyze phonemes

In [85]:
# Find the sounds patterns that are most common:
# Count up 2-phoneme sounds that occur in the text

# Create the pairs
pairs = {}
for poem in poems:
    for line in poems[0]:
        pass
        #print(line)
            #print(phonemes.split())
            #for items in phonemes.split():
                #print(item)
                    # If item[count] is a V of AH1 or phoneme piece
                #print(item)
                #print(type(item[count]))
                
        
                #print(count, item)
            #print(item)
        
    #for line in poems





NameError: name 'poems' is not defined

48.269480519480524

[' AY1', ' ', ' EH1 F', ' AA1 R', ' OW1', ' EH1 M', ' ', ' EH1 F', ' AH0 EY1', ' AY1', ' AA1 R', ' IY1', ' EH1 S', ' T IY1', ' ', ' S IY1', ' AA1 R', ' IY1', ' AH0 EY1', ' T IY1', ' Y UW1', ' AA1 R', ' IY1', ' EH1 S', ' ', ' D AH1 B AH0 L Y UW0', ' IY1', ' ', ' D IY1', ' IY1', ' EH1 S', ' AY1', ' AA1 R', ' IY1', ' ', ' AY1', ' EH1 N', ' S IY1', ' AA1 R', ' IY1', ' AH0 EY1', ' EH1 S', ' IY1', '\n', ' T IY1', ' EY1 CH', ' AH0 EY1', ' T IY1', ' ', ' T IY1', ' EY1 CH', ' IY1', ' AA1 R', ' IY1', ' B IY1', ' W AY1', ' ', ' B IY1', ' IY1', ' AH0 EY1', ' Y UW1', ' T IY1', ' W AY1', ' EH1 S', ' ', ' AA1 R', ' OW1', ' EH1 S', ' IY1', ' ', ' EH1 M', ' AY1', ' JH IY1', ' EY1 CH', ' T IY1', ' ', ' EH1 N', ' IY1', ' V IY1', ' IY1', ' AA1 R', ' ', ' D IY1', ' AY1', ' IY1', '\n', ' B IY1', ' Y UW1', ' T IY1', ' ', ' AH0 EY1', ' EH1 S', ' ', ' T IY1', ' EY1 CH', ' IY1', ' ', ' AA1 R', ' AY1', ' P IY1', ' IY1', ' AA1 R', ' ', ' EH1 S', ' EY1 CH', ' OW1', ' Y UW1', ' EH1 L', ' D IY1', ' ', ' B IY1', ' W 

In [None]:
for poem in poems:
    for line in poems:
        line = poems[0][0].split()
    for j in range(len(line)-1):
        #print(j)
        pair = l[j] + l[j+1]
        #print(pair)

In [None]:
# Count the pairs
#pairs
for poem in poems:
    for line in poems:
        for j in range(len(line)-1):
        #print(j)
            pair = line[j] + line[j+1]
            #print(pair)
        #line_split = line.split()
        #print(line_split)
    #line = poems[0]
    #print(line)


In [268]:
# 39 phonemes in 
def make_one_hot(poems[0], C=39):
    '''
    Converts an integer label torch.autograd.Variable to a one-hot Variable.
    
    Parameters
    ----------
    labels : torch.autograd.Variable of torch.cuda.LongTensor
        N x 1 x H x W, where N is batch size. 
        Each value is an integer representing correct classification.
    C : integer. 
        number of classes in labels.
    
    Returns
    -------
    target : torch.autograd.Variable of torch.cuda.FloatTensor
        N x C x H x W, where C is class number. One-hot encoded.
    '''
    one_hot = torch.cuda.FloatTensor(labels.size(0), C, labels.size(2), labels.size(3)).zero_()
    target = one_hot.scatter_(1, labels.data, 1)
    
    target = Variable(target)
        
    return target

SyntaxError: invalid syntax (<ipython-input-268-7634e0671a48>, line 2)

### Analyze the poems using traditional methods

In [446]:
# TODO when there are multiple pronunciations available, choose the one that fits that 
# metrical pattern
# calculate the metrical pattern of the whole poem -> use this to find the local pattern
# use the local pattern to figure out what stress we want to put in
# compare stresses of different pronunciations, and choose the first one that matches
#

### Calculate cosine distances b/w poems

In [None]:
def clean_poem(poem):
    words = []
    poem_phonemes = []
    poem = poem.lower()
    poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
    words.append(poem_no_punc)#.split('\n'))
    words = [word for word in words]
    for word in words:
        poem_phonemes.append(prn.phones_for_word(word))
                             
    return poem_phonemes


a = map(clean_poem, [poem for poem in poems])


In [None]:
# Get the sylable count, and use it to make the data into a matrix
# Put every 10 sylables into a list or matrix -> should I use numpy?


In [9]:
# Takes in a text file with all the poems returns a list of poems with titles and no punctuation
def text_to_poems():
    with open('sonnets.txt', 'r') as file:
        poems_list = file.read().split('\n\n')
        
        # Poems and their titles got split up when parsing - reunite them
        poems_titled = []
        for i in range(0, (int(len(poems_list) - 1)), 2):
            poem_with_title = poems_list[i] + " " + poems_list[i+1]
            poems_titled.append(poem_with_title)
               
    return poems_titled

#poems_punc = text_to_poems()

In [11]:
# Make all words lowercase and make each line into a list of word. Returns list of lists
def poems_to_words(poems):
    poems_list = []
    for poem in poems:
        lines = list(map(lambda x:x.lower(), poem.split('\n')))
        poem= [l.split() for l in lines]
        poems_list.append(poem)
        
    return poems_list

#poems_punc = text_to_poems()
#poems_no_punc = remove_punctuation(poems_punc)
#poems_words = poems_to_words(poems_no_punc)
#poems_words[0][1][1]

In [9]:
# Takes in a list of poems and returns a list of poems, with punctuation removed from each poem
def remove_punctuation(poems_punc):
    poems = []
    for poem in poems_punc:
        poem_phonemes = []
        poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
        poems.append(poem_no_punc)

    return poems

#poems_punc = text_to_poems()
#poems_no_punc = remove_punctuation(poems_punc)

In [None]:
# Poems and their titles got split up when parsing - reunite them
    for i in range(0, (int(len(poems_clean) - 1)), 2):
        #poem_with_title = poems_raw[i] + " " + poems_raw[i+1]
        poem = poems_clean[i+1]
        poems.append(poem)

In [None]:
import pronouncing as prn
def poems_to_phonemes_str(poems):    
    all_slang = []
    for poem in poems:
        poem_phonemes = []
        for line in poem:
            line_phones = ""
            for word in line:
                p = prn.phones_for_word(word)

                if len(p) == 1:
                    for item in p:
                        line_phones = line_phones + " " + item
                elif len(p) > 1:
                    # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                    for item in p:
                        line_phones = line_phones + " " + item
                else: # the word is not in the pronouncing dictionary
                    slang.append(word)
                    line_phones = line_phones + word

                poem_phonemes.append(line_phones)
            poems.append(poem_phonemes)

        #poem_phonemes.append(poem_phones)

    return poem_phonemes, slang 
phones, slang = poems_to_phonemes_str(poems_clean)
phones[0]

In [86]:
import string
def clean_old(poems):
   
    '''Takes a list of poems, 
    Returns a list of poems with punctuation removed from each poem
    Make all words lowercase '''
    
    poems_clean = []
    for poem in poems:
        poem_clean = []
        for line in poem:
            poem_no_punc = line.translate(str.maketrans('','',string.punctuation)).rstrip().lower()
            poem_clean.append(poem_no_punc)

            
        poems_clean.append(poem_clean)

    return poems_clean

poems_clean = clean_old(poems_raw)
poems_clean[0]

['from fairest creatures we desire increase',
 'that thereby beautys rose might never die',
 'but as the riper should by time decease',
 'his tender heir might bear his memory',
 'but thou contracted to thine own bright eyes',
 'feedst thy lightst flame with selfsubstantial fuel',
 'making a famine where abundance lies',
 'thyself thy foe to thy sweet self too cruel',
 'thou that art now the worlds fresh ornament',
 'and only herald to the gaudy spring',
 'within thine own bud buriest thy content',
 'and tender churl makest waste in niggarding',
 'pity the world or else this glutton be',
 'to eat the worlds due by the grave and thee']

In [87]:
def poems_words_to_ph(poems):
    phonemes = []
    slang = []
    for poem in poems:
        
        poem_phonemes = []
        poem_slang = []
        
        for word in poem:
            p = prn.phones_for_word(word)
            if len(p) == 1: # there is only one pronunciation
                poem_phonemes.append(p[0])

            elif len(p) > 1: # there is more than one pronunciation
                # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                poem_phonemes.append(p[0]) 

            else: # the word is not in the pronouncing dictionary
                poem_slang.append(word)
                poem_phonemes.append(word)
                    
        
        phonemes.append(poem_phonemes)
        
        slang.append(poem_slang)
    
    return phonemes, slang
            
phones, slang = poems_words_to_ph(poems_list_words)
phones[0]

['EH1 F',
 'AA1 R',
 'OW1',
 'EH1 M',
 ' ',
 'EH1 F',
 'AH0',
 'AY1',
 'AA1 R',
 'IY1',
 'EH1 S',
 'T IY1',
 ' ',
 'S IY1',
 'AA1 R',
 'IY1',
 'AH0',
 'T IY1',
 'Y UW1',
 'AA1 R',
 'IY1',
 'EH1 S',
 ' ',
 'D AH1 B AH0 L Y UW0',
 'IY1',
 ' ',
 'D IY1',
 'IY1',
 'EH1 S',
 'AY1',
 'AA1 R',
 'IY1',
 ' ',
 'AY1',
 'EH1 N',
 'S IY1',
 'AA1 R',
 'IY1',
 'AH0',
 'EH1 S',
 'IY1',
 ' ',
 'T IY1',
 'EY1 CH',
 'AH0',
 'T IY1',
 ' ',
 'T IY1',
 'EY1 CH',
 'IY1',
 'AA1 R',
 'IY1',
 'B IY1',
 'W AY1',
 ' ',
 'B IY1',
 'IY1',
 'AH0',
 'Y UW1',
 'T IY1',
 'W AY1',
 'EH1 S',
 ' ',
 'AA1 R',
 'OW1',
 'EH1 S',
 'IY1',
 ' ',
 'EH1 M',
 'AY1',
 'JH IY1',
 'EY1 CH',
 'T IY1',
 ' ',
 'EH1 N',
 'IY1',
 'V IY1',
 'IY1',
 'AA1 R',
 ' ',
 'D IY1',
 'AY1',
 'IY1',
 ' ',
 'B IY1',
 'Y UW1',
 'T IY1',
 ' ',
 'AH0',
 'EH1 S',
 ' ',
 'T IY1',
 'EY1 CH',
 'IY1',
 ' ',
 'AA1 R',
 'AY1',
 'P IY1',
 'IY1',
 'AA1 R',
 ' ',
 'EH1 S',
 'EY1 CH',
 'OW1',
 'Y UW1',
 'EH1 L',
 'D IY1',
 ' ',
 'B IY1',
 'W AY1',
 ' ',
 'T IY1',


In [None]:
class Poems2():
    
    def __init__():
        self.poems = []
        self.lines = []
        self.words = []
        self.stresses = []
        self.phonemes = []
        self.slang = []
        
    def to_lines(self):
        pass
        return self.lines
    
    def to_words(self):
        pass
        return self.words
    
     def to_stresses(self):
        pass
        return self.stresses
    
    def to_phonemes(self):
        pass
        return self.phonemes, self.slang
    


In [None]:
class Poems(poems):
    
    def __init__(self, poems):
        self.poems = poems
        self.lines = [[]]
        self.poem_words = []
        self.poem_phonemes = []
        self.slang = [] 
                  
    def text_to_poems(self, file_in):
        
        '''Takes a text file of poems, returns a list of poems with titles and no punctuation'''
        
        with open(file_in, 'r') as file:
            poems_raw = file.read().split('\n\n')

            # Poems and their titles got split up when parsing - reunite them
            for i in range(0, (int(len(poems_raw) - 1)), 2):
                poem_with_title = poems_raw[i] + " " + poems_raw[i+1]
                self.poems_titled.append(poem_with_title)

        return self.poems_titled
         
    def remove_punctuation(self, poems_punc):

        '''Takes a list of poems, and returns a list of poems with punctuation removed from each poem'''

        for poem in poems_punc:
            poem_phonemes = []
            poem_no_punc = poem.translate(str.maketrans('','',string.punctuation))
            self.poems_no_punc.append(poem_no_punc)

        return self.poems_no_punc

    def poems_to_words(self, poems):
        
        '''Make each line into a list of word and mke all words lowercase. Returns list of lists '''
        for poem in poems:
            lines = list(map(lambda x:x.lower(), poem.split('\n')))
            poem_word_list = [l.split() for l in lines]
            self.poem_words.append(poem_word_list)

        return self.poem_words
    
    def poem_to_phonemes(self, lines):
        '''Make all words lowercase and make each line into a list of word. Returns list of lists '''
        for poem in poems:
            poem_phones = []
            
            for line in poem:
                line_phones = ""
                for word in line:
                    p = prn.phones_for_word(word)

                    if len(p) == 1:
                        for item in p:
                            line_phones = line_phones + " " + item
                    elif len(p) > 1:
                        # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                        for item in p:
                            line_phones = line_phones + " " + item
                    else: # the word is not in the pronouncing dictionary
                        self.lang.append(word)
                        line_phones = line_phones + word

                poem_phones.append(line_phones)

            phonemes.append(poem_phones)

        return self.poem_phonemes
    
    def poems_to_phonemes(self, poems):    
        for poem in poems:
            poem_phones = []
            for line in poem:
                line_phones = ""
                for word in line:
                    p = prn.phones_for_word(word)

                    if len(p) == 1:
                        for item in p:
                            line_phones = line_phones + " " + item
                    elif len(p) > 1:
                        # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                        for item in p:
                            line_phones = line_phones + " " + item
                    else: # the word is not in the pronouncing dictionary
                        self.slang.append(word)
                        line_phones = line_phones + word

                poem_phones.append(line_phones)

            self.poem_phonemes.append(poem_phones)

        return self.poem_phonemes, self.slang   

In [None]:
import itertools
def cos_dist_phon_cts(phon_counts):
    k = np.zeros((153, 153))
    for a, b in itertools.combinations(phon_counts, 2):
        c = scipy.spatial.distance.cosine(a, b)
        k[a, b] = c
    
    return k
        
a = cos_dist_phon_cts(phon_cts)
a

In [None]:
from Bio import Phylo
ph_tree = Phylo.read('newick_words.txt', "newick")
Phylo.draw(ph_tree)