In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# load pickles
with open('clean_df.pkl', 'rb') as f:
    clean_poem_df = pickle.load(f)
    
with open('clean_line_df.pkl', 'rb') as g:
    clean_line_df = pickle.load(g)

In [10]:
clean_poem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15157 entries, 1 to 15651
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   author             15157 non-null  category
 1   title              15157 non-null  object  
 2   content            15157 non-null  object  
 3   line               15157 non-null  object  
 4   length_in_lines    15157 non-null  int64   
 5   lexical_diversity  15157 non-null  float64 
 6   words              15157 non-null  object  
 7   id                 15157 non-null  int64   
dtypes: category(1), float64(1), int64(2), object(4)
memory usage: 1.1+ MB


In [61]:
from nltk.tokenize import word_tokenize
def prep_list(text):
    while isinstance(text, list):
        text = ' '.join([line for line in text])
        word_list = text.split()
    if isinstance(text, pd.Series):
        word_list = word_tokenize(' '.join())
    else:
        word_list = word_tokenize(text)
    return word_list

poems_in_words = clean_poem_df.line.map(prep_list)
clean_poem_df['words'] = poems_in_words

In [62]:
with open('poems_in_words.pkl', 'wb') as f:
    pickle.dump(poems_in_words, f)

In [42]:
elements = list(clean_poem_df.words)
ids = [str(i) for i in clean_poem_df.index]
print(len(ids))
print(elements[0], ids[0])

15157
['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discrete—a', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'noncogito'] 1


In [53]:
tagged_poems = [TaggedDocument(element,[i]) for i, element in enumerate(clean_poem_df.words.values)]

In [54]:
tagged_poems[0]

TaggedDocument(words=['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discrete—a', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'noncogito'], tags=[0])

In [55]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [56]:
model_dbow = Doc2Vec(vector_size = 2000,min_count = 0, dm = 0,
                     alpha=0.025, min_alpha=-0.0001, workers=cores)
model_dbow.build_vocab([x for x in tqdm(tagged_poems)])

100%|██████████| 15157/15157 [00:00<00:00, 2525948.26it/s]


In [57]:
model_dbow.corpus_count

15157

In [58]:
%%time
for epoch in range(50):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged_poems)]), total_examples=model_dbow.corpus_count, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 15157/15157 [00:00<00:00, 3032776.73it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3032632.05it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3030463.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3030897.05it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526048.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526149.00it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2527153.19it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526349.77it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031764.31it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031330.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3032053.50it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031475.17it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031330.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3032632.05it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031330.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 2526048.62it/s]
100%|██████████| 15157/15157 [00:00<00:00, 3031475.17it/

Wall time: 2min 51s


In [59]:
model_dbow.docvecs.vectors_docs.shape

(15157, 2000)

In [60]:
model_dbow.save("poem_doc2vec_dbow2000.pkl")

(10, 2000)

In [66]:
model_dbow.wv.most_similar('he')

[('unsorted', 0.09931951761245728),
 ('lycaeides', 0.09291273355484009),
 ('scotfree', 0.0894961878657341),
 ('fellowmen', 0.08882109820842743),
 ('hellion', 0.08766721189022064),
 ('sheers', 0.08732171356678009),
 ('cameraman', 0.08711902797222137),
 ('allureth', 0.086117222905159),
 ('abided', 0.08551956713199615),
 ('extenuating', 0.08515588939189911)]

In [72]:
from sklearn.manifold import TSNE
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# from gensim.models import KeyeVectors
# dictionary = KeyedVectors.laod_word2vec_format()
doc_tags = list(model_dbow.vectors.doctags.keys())
X = model_dbow[doc_tags]

AttributeError: 'Doc2Vec' object has no attribute 'vectors'