<a href="https://colab.research.google.com/github/yi-ye-zhi-qiu/metis-project4/blob/main/poemgenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
#import stuff

# NOTE: gutenberg MUST be run in a separate notebook, collab cannot handle gutenberg's necessary python 2.7 dependencies, unlike Jupyter
# ^^ for more, see: https://pypi.org/project/Gutenberg/  we cannot run the requisite "brew install berkeley-db4" command 


#TensorFlow
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

#SKLearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

#Other
# from gutenberg.acquire import load_etext
# from gutenberg.cleanup import strip_headers
import pandas as pd
import numpy as np 
import re

In [None]:
#gutenberg is used to fetch book txt files, so long as we give it the book ID
#we will, in addition to that, feed in the # of lines we want to skip (think of it like skipping the first 10 pages),
#and give it a personal name

books = [
#   BookID, skip N lines
#   (26715, 1000, 'Victorian songs'),
#   (30235, 580, 'Baldwin collection'),
#   (35402, 710, 'Swinburne collection'),
#   (574, 15, 'Blake'),
#   (1304, 172, 'Bulchevys collection'),
#   (19221, 223, 'Palgrave-Pearse collection'),
#   (15553, 522, 'Knowles collection') ,
    (17650, 6605, 'The Sonnets, Triumphs, and Other Poems of Petrarch by Francesco Petrarca') 
#                                    Link: https://www.gutenberg.org/ebooks/17650
#                                    6605 is the # of lines that we skip to get 
#                                    to the poems.
]

Generate raw.txt, a file with *only* the poems of Petrarch extracted from the book "The 
Sonnets, Triumphs, and Other Poems of Petrarch by Francesco Petrarca"

In [None]:
#Save as raw.txt file
with open('data/poetry/raw.txt', 'w') as ofp:
  lineno = 0
  for (id_nr, toskip, title) in books:
    startline = lineno
    text = strip_headers(load_etext(id_nr, mirror='http://mirrors.xmission.com/gutenberg/')).strip()
    lines = text.split('\n')[toskip:]
    for line in lines:
      if (len(line) > 0 
          and line.upper() != line 
          and not re.match('.*[0-9]+.*', line)
          and len(line) < 50
         ): #skip Titles
        cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower()) #all lowercase, only letters
        ofp.write(cleaned)
        ofp.write('\n')
        lineno = lineno + 1 #next line
      else:
        ofp.write('\n')
    print('Wrote lines {} to {} from {}'.format(startline, lineno, title))

Wrote lines 0 to 10983 from The Sonnets, Triumphs, and Other Poems of Petrarch by Francesco Petrarca


In [9]:
#Connect to Google drive, where raw.txt is stored. 
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
#Now that we are connected to our gDrive, we can read in raw.txt
raw = pd.read_csv('/content/drive/My Drive/notebooks/raw.txt')
raw = raw.values.tolist()

print("Example data structure, array of arrays: \n",raw[0:10])

Example data structure, array of arrays: 
 [['of those sad sighs with which my heart i fed'], ['when early youth my mazy wanderings led '], ['fondly diverse from what i now appear '], ['from those by whom my various style is read '], ['not only pardon but perhaps a tear '], ['but now i clearly see that of mankind'], ['and self-reproach with frequent blushes teem '], ['while of my frenzy shame the fruit i find '], ["that the world's joy is but a flitting dream "], ["o ye who list in scatter'd verse the sound"]]


In [22]:
#Let's count the # of each word by running a quick tokenizer
tokenizer = Tokenizer()

#Array of arrays -> array
corpus = [','.join(x) for x in raw]

#Fit tokenizer
tokenizer.fit_on_texts(corpus)

#Compensate for len -1
total_words = len(tokenizer.word_index) + 1

print('There are ',total_words, ' unique words in this set of Petrarchan poetry.')
print('Here are some the word indices',tokenizer.word_index)

There are  9009  unique words in this set of Petrarchan poetry.


In [56]:
#Define corpus as just one long list as opposed to list of lists?
corpus = [','.join(x) for x in raw]

#Instantiate count vectorizer
CV = CountVectorizer(ngram_range=(1, 1), stop_words='english')

#Fit to data
Vector = CV.fit_transform(corpus)

#LDA
lda = LatentDirichletAllocation(n_components=5)
LDA_Array = lda.fit_transform(Vector)

#NMF
nmf = NMF(n_components=2, init='random', random_state=0)
W = nmf.fit_transform(Vector)
t = nmf.components_


print(Vector.shape[0],'poems analyzed by CountVectorizer')
print(t.shape[1], 'poems analyzed by NMF')
print(len(LDA_Array),'poems analyzed by LDA')

10982 poems analyzed by CountVectorizer
8217 poems analyzed by NMF
10982 poems analyzed by LDA


In [62]:
features = CV.get_feature_names()

#Words from NMF
a = t.argsort(axis=1)[:,-1:-7:-1]
nmf_words = [[features[e] for e in l] for l in a]
print('nmf poem topic words \n', nmf_words)

#Words from LDA
b = lda.components_.argsort(axis=1)[:,-1:-7:-1]
lda_words = [[words[e] for e in l] for l in b]
print('lda poem topic words \n', lda_words)



nmf poem topic words 
 [['love', 'long', 'cruel', 'death', 'fortune', 'mind'], ['heart', 'thy', 'life', 'er', 'day', 'eyes']]
lda poem topic words 
 [['love', 'day', 'heaven', 'er', 'fair', 'ah'], ['death', 'life', 'fate', 'fair', 'eyes', 'time'], ['soul', 'hope', 'grief', 'like', 'pity', 'light'], ['heart', 'love', 'thy', 'sun', 'eyes', 'thou'], ['sad', 'er', 'joy', 'man', 'woe', 'far']]


In [None]:
with open('data/poetry/raw.txt', 'r') as rawfp,\
 open('data/poetry/input.txt', 'w') as infp,\
 open('data/poetry/output.txt', 'w') as outfp:
   
   prev_line = ''
   for curr_line in rawfp:
       curr_line = curr_line.strip()
       # poems break at empty lines, so this ensures we train only
       # on lines of the same poem
       if len(prev_line) > 0 and len(curr_line) > 0:       
           infp.write(prev_line + '\n')
           outfp.write(curr_line + '\n')
       prev_line = curr_line