In [128]:
import pandas as pd
import string
import pronouncing as prn
from collections import defaultdict
import math

In [16]:
# Returns a list of poems, lowecase, no punctuation
def make_poems_list(file_in):
    with open(file_in, 'r') as file:
        poems_list = file.read().split('\n\n')
        # Get rid of poem titles, /n characters, and make characters lowercase
        poems = [item.translate(str.maketrans('\n',' ',string.punctuation)).lower() for item in poems_list[1::2]]
            
    return poems

poems_list = make_poems_list('sonnets.txt')

In [17]:
# Returns list of poems with phonemes instead of words, and list of slang words
def poems_to_phones(poems):
    poems_phones = []
    poems_slang = []
    
    for poem in poems:
        poem_phones = ""
        poem_slang = []
        
        words = poem.split()
        for word in words:
            p = prn.phones_for_word(word)
            if len(p) == 1: # there is only one pronunciation
                poem_phones = poem_phones + p[0] + " "
            elif len(p) > 1: # there is more than one pronunciation
                # TODO fix this hack by figuring out which pronunciation is best instead of just choosing the 1st
                poem_phones = poem_phones + p[0] + " "
            else: # the word is not in the pronouncing dictionary
                poem_slang.append(word)
                poem_phones = poem_phones + " "
        
        poems_slang.append(poem_slang)    
        poems_phones.append(poem_phones)
        
    return poems_phones, poems_slang

In [190]:
phonemes, slang = poems_to_phones(poems_list)
len(phonemes[27])

996

In [191]:
def count_phon_freq(phonemes):
    phone_freqs = []
    for i in range(len(phonemes)):
        wordSet = set(phonemes[i])
        wordDict = dict.fromkeys(wordSet, 0) 

        for word in phonemes[i]:
            wordDict[word]+=1

        phone_freqs.append(wordDict)
    return phone_freqs

phoneDict = count_phon_freq(phonemes)
len(phoneDict)

154

In [222]:
def count_phon_pair_freq(phonemes):
    phone_freqs = []
    pair_list = []
    for poem in phonemes:
        phone_list = str.split(poem, ' ')
        pair_list.append([phone_list[i] + phone_list[i+1] for i in range(len(phone_list)-1)])
        #print(pair_list[0])
        
        for i in range(len(pair_list)-1):
            wordSet = set(pair_list[i])
            wordDict = dict.fromkeys(wordSet, 0) 
            #print(wordDict)

            for word in pair_list[i]:
                wordDict[word]+=1

            phone_freqs.append(wordDict)
    
    return pair_list, phone_freqs
pairList, pairDict = count_phon_pair_freq(phonemes)
len(pairList), len(pairDict[5])

(154, 230)

In [137]:
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCount)
    return tfDict

In [138]:
def calculate_tf(pairDict, pairList):
    tf_list = []
    for i in range(len(pairList)-1):
        tf = computeTF(pairDict[i], pairList[i])
        tf_list.append(tf)
    return tf_list

l = calculate_tf(pairDict, pairList)   
l[0]

{'': 0.2,
 'AA1N': 0.4,
 'AA1R': 0.2,
 'AE0K': 0.2,
 'AE1M': 0.2,
 'AE1T': 0.4,
 'AE1Z': 0.2,
 'AH0': 0.2,
 'AH0B': 0.2,
 'AH0D': 0.2,
 'AH0F': 0.2,
 'AH0G': 0.4,
 'AH0L': 0.6,
 'AH0M': 0.2,
 'AH0N': 1.2,
 'AH0W': 0.6,
 'AH1D': 0.2,
 'AH1M': 0.2,
 'AH1N': 0.2,
 'AH1T': 0.4,
 'AO1D': 0.2,
 'AO1R': 0.4,
 'AW1DH': 0.4,
 'AW1K': 0.2,
 'AY1': 0.4,
 'AY1B': 0.2,
 'AY1DH': 0.2,
 'AY1ER0': 0.2,
 'AY1F': 0.2,
 'AY1K': 0.2,
 'AY1M': 0.2,
 'AY1N': 0.4,
 'AY1S': 0.2,
 'AY1T': 0.8,
 'AY1Z': 0.4,
 'AY2S': 0.2,
 'B': 0.2,
 'BAH1': 0.8,
 'BAY1': 0.6,
 'BEH1': 0.2,
 'BIY1': 0.2,
 'BR': 0.2,
 'CHER0': 0.2,
 'D': 0.2,
 'DAH0': 0.2,
 'DAO1': 0.2,
 'DAY1': 0.2,
 'DB': 0.2,
 'DDH': 0.2,
 'DER0': 0.4,
 'DH': 0.6,
 'DHAE1': 0.4,
 'DHAH0': 1.2,
 'DHAW1': 0.4,
 'DHAY1': 1.2,
 'DHAY2': 0.2,
 'DHEH1': 0.2,
 'DHIH1': 0.4,
 'DHIY1': 0.2,
 'DIH0': 0.4,
 'DIY0': 0.2,
 'DOW1': 0.2,
 'DT': 0.6,
 'DUW1': 0.2,
 'DZ': 0.4,
 'EH0N': 0.2,
 'EH1L': 0.6,
 'EH1M': 0.2,
 'EH1N': 0.4,
 'EH1R': 1.2,
 'EH1SH': 0.2,
 'EH1V': 0.2,
 

In [161]:
idfDict = pairDict[0].keys()
#df = pd.DataFrame(idfDict)
df = pd.DataFrame.from_dict(idfDict)

ValueError: DataFrame constructor not properly called!

352

In [172]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    #print(docList[0].keys())
    all_existing_pairs = set(union(pairList))
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    l = [docList[i].keys() for i in range(len(docList)-1)]
    #print(l[])
    print(idfDict)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        #idfDict[word] = math.log10(N / float(val))
        idfDict[word] = math.log10(N / float(val))
        idfDict[word] = math.log10(N / float(val))
        #idfDictLinear[word] = (N / float(val))
        
    return idfDict

computeIDF(pairDict)

{'': 0, 'NGAH0': 0, 'SW': 0, 'AH0': 0, 'NAH0': 0, 'AY1N': 0, 'EY1K': 0, 'ZDH': 0, 'AA1R': 0, 'IY0B': 0, 'BEH1': 0, 'AH1T': 0, 'WER1': 0, 'MAY1': 0, 'AH1M': 0, 'LDH': 0, 'DZ': 0, 'ER0IH2': 0, 'IH0S': 0, 'LAY1': 0, 'DB': 0, 'NL': 0, 'AH0M': 0, 'DAH0': 0, 'IH1T': 0, 'DHAY1': 0, 'EH1M': 0, 'BR': 0, 'MAH0': 0, 'DHIH1': 0, 'DHAH0': 0, 'MW': 0, 'ZD': 0, 'F': 0, 'EH1V': 0, 'TAY1': 0, 'WIH0': 0, 'ER0IY0': 0, 'ER1L': 0, 'AH1D': 0, 'ST': 0, 'AY1Z': 0, 'NS': 0, 'TK': 0, 'LM': 0, 'LF': 0, 'AY1S': 0, 'IH0N': 0, 'R': 0, 'D': 0, 'HHEH1': 0, 'AY1B': 0, 'IY1CH': 0, 'AY1M': 0, 'EH1N': 0, 'FEH1': 0, 'FOW1': 0, 'WIH1': 0, 'WEH1': 0, 'TAE1': 0, 'HHIH1': 0, 'IH0Z': 0, 'FDH': 0, 'UW1IY1': 0, 'WIY1': 0, 'UH1D': 0, 'AE1T': 0, 'IH1DH': 0, 'YUW1': 0, 'UW1B': 0, 'AY1': 0, 'FY': 0, 'AO1D': 0, 'EY1M': 0, 'IH0NG': 0, 'DOW1': 0, 'FR': 0, 'DHAY2': 0, 'MF': 0, 'AA1N': 0, 'REY1': 0, 'DIY0': 0, 'IY0HH': 0, 'CHER0': 0, 'TIH0': 0, 'TEH1': 0, 'TAH0': 0, 'AY1K': 0, 'PR': 0, 'AH0G': 0, 'DAO1': 0, 'ZF': 0, 'IH1Z': 0, 'UW1DH': 0

KeyError: 'DL'

In [143]:
idfs = computeIDF([pairDict[i] for i in range(len(pairDict)-1)])

KeyError: 'DL'

In [173]:
#idfs = computeIDF([wordDictA, wordDictB])
def make_idfs_list(pairDict):
    idfs_list = []
    for i in range(len(pairDict)-1):
        idfs = computeIDF([pairDict[i] for i in range(len(pairDict)-1)])
        idfs_list.append(idfs)
        
    return idfs_list

make_idfs_list(pairDict)

{'': 0, 'NGAH0': 0, 'SW': 0, 'AH0': 0, 'NAH0': 0, 'AY1N': 0, 'EY1K': 0, 'ZDH': 0, 'AA1R': 0, 'IY0B': 0, 'BEH1': 0, 'AH1T': 0, 'WER1': 0, 'MAY1': 0, 'AH1M': 0, 'LDH': 0, 'DZ': 0, 'ER0IH2': 0, 'IH0S': 0, 'LAY1': 0, 'DB': 0, 'NL': 0, 'AH0M': 0, 'DAH0': 0, 'IH1T': 0, 'DHAY1': 0, 'EH1M': 0, 'BR': 0, 'MAH0': 0, 'DHIH1': 0, 'DHAH0': 0, 'MW': 0, 'ZD': 0, 'F': 0, 'EH1V': 0, 'TAY1': 0, 'WIH0': 0, 'ER0IY0': 0, 'ER1L': 0, 'AH1D': 0, 'ST': 0, 'AY1Z': 0, 'NS': 0, 'TK': 0, 'LM': 0, 'LF': 0, 'AY1S': 0, 'IH0N': 0, 'R': 0, 'D': 0, 'HHEH1': 0, 'AY1B': 0, 'IY1CH': 0, 'AY1M': 0, 'EH1N': 0, 'FEH1': 0, 'FOW1': 0, 'WIH1': 0, 'WEH1': 0, 'TAE1': 0, 'HHIH1': 0, 'IH0Z': 0, 'FDH': 0, 'UW1IY1': 0, 'WIY1': 0, 'UH1D': 0, 'AE1T': 0, 'IH1DH': 0, 'YUW1': 0, 'UW1B': 0, 'AY1': 0, 'FY': 0, 'AO1D': 0, 'EY1M': 0, 'IH0NG': 0, 'DOW1': 0, 'FR': 0, 'DHAY2': 0, 'MF': 0, 'AA1N': 0, 'REY1': 0, 'DIY0': 0, 'IY0HH': 0, 'CHER0': 0, 'TIH0': 0, 'TEH1': 0, 'TAH0': 0, 'AY1K': 0, 'PR': 0, 'AH0G': 0, 'DAO1': 0, 'ZF': 0, 'IH1Z': 0, 'UW1DH': 0

KeyError: 'DL'

In [None]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf