In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [4]:
# load pickles
with open('clean_df.pkl', 'rb') as f:
    clean_poem_df = pickle.load(f)
    
with open('clean_line_df.pkl', 'rb') as g:
    clean_line_df = pickle.load(g)

In [6]:
clean_poem_df.head()

Unnamed: 0,author,title,content,line,length_in_lines,lexical_diversity
1,Hailey Leithauser,0,"Philosophic\nin its complex, ovoid emptiness,\...","[philosophic, in its complex ovoid emptiness, ...",15,0.863636
2,Jody Gladding,1-800-FEAR,We'd like to talk with you about fear t...,[wed like to talk with you about fear they sai...,11,0.663717
3,Joseph Brodsky,1 January 1965,The Wise Men will unlearn your name.\nAbove yo...,"[the wise men will unlearn your name, above yo...",24,0.693333
4,Ted Berrigan,3 Pages,For Jack Collom\n10 Things I do Every Day\n\np...,"[for jack collom, things i do every day, play ...",26,0.841463
5,Joe Brainard,30 One-Liners,WINTER\nMore time is spent at the window.\n\nS...,"[winter, more time is spent at the window, sum...",65,0.575843


In [25]:
from nltk.tokenize import word_tokenize
def prep_list(text):
    if isinstance(text, pd.Series):
        word_list = word_tokenize(' '.join())
    while isinstance(text, list):
        text = ' '.join([line for line in text])
        word_list = text.split()
    else:
        word_list = word_tokenize(text)
    return word_list

clean_poem_df['words'] = clean_poem_df.line.map(prep_list)

In [26]:
print(clean_poem_df.head())

              author           title  \
1  Hailey Leithauser               0   
2      Jody Gladding      1-800-FEAR   
3     Joseph Brodsky  1 January 1965   
4       Ted Berrigan         3 Pages   
5       Joe Brainard   30 One-Liners   

                                             content  \
1  Philosophic\nin its complex, ovoid emptiness,\...   
2  We'd  like  to  talk  with  you  about  fear t...   
3  The Wise Men will unlearn your name.\nAbove yo...   
4  For Jack Collom\n10 Things I do Every Day\n\np...   
5  WINTER\nMore time is spent at the window.\n\nS...   

                                                line  length_in_lines  \
1  [philosophic, in its complex ovoid emptiness, ...               15   
2  [wed like to talk with you about fear they sai...               11   
3  [the wise men will unlearn your name, above yo...               24   
4  [for jack collom, things i do every day, play ...               26   
5  [winter, more time is spent at the window, sum...     

In [33]:
tagged_poems = clean_poem_df.apply(lambda x: TaggedDocument(words=x.words, tags=x.author), axis=1)

In [35]:
tagged_poems.values[0]

TaggedDocument(words=['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discrete—a', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'noncogito'], tags='Hailey Leithauser')

In [31]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [36]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(tagged_poems.values)])

100%|██████████| 15157/15157 [00:00<00:00, 3031764.31it/s]


In [39]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged_poems.values)]), total_examples=len(tagged_poems.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 15157/15157 [00:00<00:00, 3031619.73it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2527153.19it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2527153.19it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2731975.32it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526349.77it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3033500.30it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2525647.20it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031041.56it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526149.00it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526249.38it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2748511.27it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2527153.19it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3033789.82it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526048.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526149.00it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2525847.90it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2527253.66it/

Wall time: 4min 11s
