In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# load df
with open('tagged_poems_df.pkl', 'rb') as f:
    clean_poem_df = pickle.load(f)
    
with open('tagged_lines_df.pkl', 'rb') as g:
    clean_line_df = pickle.load(g)

In [3]:
clean_poem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14779 entries, 1 to 15651
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   author             14779 non-null  category
 1   title              14779 non-null  object  
 2   content            14779 non-null  object  
 3   line               14779 non-null  object  
 4   length_in_lines    14779 non-null  int64   
 5   lexical_diversity  14779 non-null  float64 
 6   words              14779 non-null  object  
 7   word_lengths       14779 non-null  object  
 8   max_word_length    14779 non-null  int64   
 9   pos_tags           14779 non-null  object  
dtypes: category(1), float64(1), int64(2), object(6)
memory usage: 1.2+ MB


In [4]:
clean_line_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 411540 entries, 1 to 453771
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   author             411540 non-null  category
 1   title              411540 non-null  object  
 2   line               411540 non-null  object  
 3   line_no            411540 non-null  int64   
 4   words              411540 non-null  object  
 5   length_in_words    411540 non-null  int64   
 6   lexical_diversity  411540 non-null  float64 
 7   word_lengths       411540 non-null  object  
 8   max_word_length    411540 non-null  int64   
 9   pos_tags           411540 non-null  object  
dtypes: category(1), float64(1), int64(3), object(5)
memory usage: 32.2+ MB


In [5]:
elements = list(clean_poem_df.words)
ids = [str(i) for i in clean_poem_df.index]
print(len(ids))
print(elements[0], ids[0])

14779
['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discretea', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'non-cogito'] 1


In [6]:
tagged_poems = [TaggedDocument(element,[i]) for i, element in enumerate(clean_poem_df.words.values)]

In [7]:
tagged_poems[0]

TaggedDocument(words=['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discretea', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'non-cogito'], tags=[0])

In [8]:
import multiprocessing
cores = multiprocessing.cpu_count()-1

In [9]:
model_dbow = Doc2Vec(vector_size = 500, min_count = 0, dm = 0,
                     alpha=0.025, min_alpha=-0.0001, workers=cores)
model_dbow.build_vocab([x for x in tqdm(tagged_poems)])

100%|██████████| 14779/14779 [00:00<00:00, 2956719.24it/s]


In [10]:
model_dbow.corpus_count

14779

In [11]:
%%time
for epoch in range(50):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged_poems)]), total_examples=model_dbow.corpus_count, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 14779/14779 [00:00<00:00, 2957283.47it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956437.20it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2463834.76it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956437.20it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2955450.50it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2955450.50it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956719.24it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2955450.50it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2955168.71it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956437.20it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956296.20it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2955591.42it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956719.24it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2956719.24it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2957989.06it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2957001.33it/s]
100%|██████████| 14779/14779 [00:00<00:00, 2957283.47it/

Wall time: 1min 54s


In [12]:
model_dbow.docvecs.vectors_docs.shape

(14779, 500)

In [13]:
model_dbow.save("poem_doc2vec_dbow500.pkl")

In [14]:
model_dbow.wv.most_similar('he')
model_dbow.wv.most_similar('his')

[('counterterrorism', 0.18991439044475555),
 ('avant', 0.18803800642490387),
 ('yakutat', 0.1750287115573883),
 ('replaced', 0.17184962332248688),
 ('retractable', 0.16939330101013184),
 ('picardie', 0.16750100255012512),
 ('haydn', 0.16565947234630585),
 ('pantalooned', 0.1646561324596405),
 ('weltgeist&amp', 0.16395975649356842),
 ('dark-winged', 0.16297659277915955)]

In [15]:
poem_vector=[]

t = 1000

for i in range(len(poems)):
    if i % t == 0:
        print("poem", i, ":", poems[i])
        print("***")
    poem = poems[i]
    poem_vector.append(doc2vec_model.infer_vector(poem))
    
#save the lines_vector
poem_vector_file = "poem_vector_2000.pkl"
with open(poem_vector_file, 'wb') as f:
    pickle.dump((poem_vector), f)