In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
poems = pd.read_csv('data/poems.csv')
poems.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [3]:
def clean_poem(poem):
    poem = poem.replace('\r\n', ' ')
    pieces = [re.sub(r'[^a-zA-Z]', '', x).lower() for x in poem.split(' ')]
    return ' '.join(pieces)


In [4]:
poems['cleaned'] = list(map(clean_poem, poems['content']))
poems.head()

Unnamed: 0,author,content,poem name,age,type,cleaned
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore,let the bird of loudest lay on the sole arabia...
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore,sir charles into my chamber coming in when i w...
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore,our vice runs beyond all that old men saw and ...
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore,lo i the man whose muse whilome did maske as t...
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore,long have i longd to see my love againe still ...


In [5]:
poems_modern = poems[poems['age'] == 'Modern']
poems_modern = poems_modern.sample(n = 5)

poems_ren = poems[poems['age'] == 'Renaissance']
poems_ren = poems_ren.sample(n = 5)
# poems_ren = poems_ren.iloc[:5]

poems_all = pd.concat([poems_modern, poems_ren])
poems_all.drop(['content'], axis = 1)

Unnamed: 0,author,poem name,age,type,cleaned
412,SARA TEASDALE,September Midnight,Modern,Nature,originally published in poetry march
406,WALLACE STEVENS,The Idea of Order at Key West,Modern,Nature,wallace stevens the idea of order at key west ...
448,D. H. LAWRENCE,Gloire de Dijon,Modern,Nature,when she rises in the morning i linger to watc...
536,EZRA POUND,The River-Merchants Wife: A Letter,Modern,Love,while my hair was still cut straight across my...
344,HART CRANE,At Melvilles Tomb,Modern,Mythology & Folklore,hart crane at melvilles tomb from the complete...
45,QUEEN ELIZABETH I,Written in her French Psalter,Renaissance,Nature,no crooked leg no bleared eye no part deformed...
311,SIR THOMAS WYATT,What should I Say,Renaissance,Love,what should i say since faith is dead and trut...
299,WILLIAM SHAKESPEARE,Sonnet 129: Th'expense of spirit in a waste of...,Renaissance,Love,th expense of spirit in a waste of shame is lu...
248,JOHN DONNE,Elegy IX: The Autumnal,Renaissance,Love,no spring nor summer beauty hath such grace ...
72,WILLIAM SHAKESPEARE,Sonnet 34: Why didst thou promise such a beaut...,Renaissance,Love,why didst thou promise such a beauteous day an...


In [32]:
wordToId = {}

vectorized = []
counter = 0
stopwords = set(('the', 'and', 'or', 'a', 'i', 'of', 'to', 'from', 'do', 'did', 'in', '', 'is', 'as', 'my', 'for', 'on'))

for poem in poems_all['cleaned']:
    for word in poem.split(' '):
        if word not in wordToId and word not in stopwords:
            wordToId[word] = counter
            counter += 1

# Print the first 5 items
list(wordToId.items())[:5]

[('originally', 0),
 ('published', 1),
 ('poetry', 2),
 ('march', 3),
 ('wallace', 4)]

In [33]:
freq = np.zeros((10, len(wordToId)))

row_num = 0
for poem in poems_all['cleaned']:
    for word in poem.split(' '):
        if word in stopwords:
            continue
        freq[row_num, wordToId[word]] += 1;
    row_num += 1

for i in range(freq.shape[0]):
    closest = 0 if i != 0 else 1
    highest_sim = 0
    
    for j in range(freq.shape[0]):
        if i == j:
            continue
        intersect = ((freq[i, :] > 0) & (freq[j, :] > 0)).sum()
        union = ((freq[i, :] > 0) | (freq[j, :] > 0)).sum()
        jaccard = intersect / union
        if jaccard > highest_sim:
            highest_sim = jaccard
            closest = j

    print('Closest row for "{0}" ({2}) is "{1}" ({3})'.format( \
        poems_all.iloc[i]['poem name'], \
        poems_all.iloc[closest]['poem name'], \
        poems_all.iloc[i]['type'], \
         poems_all.iloc[closest]['type']))


Closest row for "September Midnight" (Nature) is "The Idea of Order at Key West" (Nature)
Closest row for "The Idea of Order at Key West" (Nature) is "At Melvilles Tomb" (Mythology & Folklore)
Closest row for "Gloire de Dijon" (Nature) is "Elegy IX: The Autumnal" (Love)
Closest row for "The River-Merchants Wife: A Letter" (Love) is "Elegy IX: The Autumnal" (Love)
Closest row for "At Melvilles Tomb" (Mythology & Folklore) is "The Idea of Order at Key West" (Nature)
Closest row for "Written in her French Psalter" (Nature) is "What should I Say" (Love)
Closest row for "What should I Say" (Love) is "Written in her French Psalter" (Nature)
Closest row for "Sonnet 129: Th'expense of spirit in a waste of shame" (Love) is "Sonnet 34: Why didst thou promise such a beauteous day" (Love)
Closest row for "Elegy IX: The Autumnal" (Love) is "Sonnet 34: Why didst thou promise such a beauteous day" (Love)
Closest row for "Sonnet 34: Why didst thou promise such a beauteous day" (Love) is "Sonnet 129: T

In [35]:
total_freq = freq.sum(axis = 0)
for x in sorted(wordToId.items(), key=lambda x: total_freq[x[1]], reverse=True)[:100]:
    print('{}: {}'.format(x[0], total_freq[x[1]]))

you: 17.0
but: 13.0
not: 12.0
her: 11.0
at: 10.0
no: 10.0
by: 9.0
with: 9.0
be: 9.0
yet: 8.0
here: 8.0
this: 7.0
that: 7.0
love: 7.0
out: 6.0
are: 6.0
me: 6.0
so: 6.0
all: 6.0
he: 6.0
she: 5.0
they: 5.0
if: 5.0
which: 5.0
still: 4.0
should: 4.0
have: 4.0
nor: 4.0
can: 4.0
shame: 4.0
such: 4.0
age: 4.0
whose: 4.0
copyright: 3.0
when: 3.0
down: 3.0
golden: 3.0
like: 3.0
roses: 3.0
their: 3.0
was: 3.0
we: 3.0
went: 3.0
being: 3.0
forever: 3.0
make: 3.0
hart: 3.0
crane: 3.0
since: 3.0
nay: 3.0
farewell: 3.0
though: 3.0
past: 3.0
had: 3.0
well: 3.0
doth: 3.0
name: 3.0
were: 3.0
than: 3.0
these: 3.0
graves: 3.0
loves: 3.0
where: 3.0
his: 3.0
things: 3.0
day: 3.0
thou: 3.0
thy: 3.0
wallace: 2.0
stevens: 2.0
west: 2.0
poems: 2.0
permission: 2.0
house: 2.0
window: 2.0
shoulders: 2.0
while: 2.0
mellow: 2.0
shadow: 2.0
glows: 2.0
yellow: 2.0
full: 2.0
fold: 2.0
it: 2.0
glory: 2.0
straight: 2.0
about: 2.0
gate: 2.0
playing: 2.0
living: 2.0
without: 2.0
never: 2.0
looked: 2.0
why: 2.0
far: 2.0
rive