In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
# load df
with open('clean_df.pkl', 'rb') as f:
    clean_poem_df = pickle.load(f)
    
with open('clean_line_df.pkl', 'rb') as g:
    clean_line_df = pickle.load(g)

In [3]:
clean_poem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15076 entries, 1 to 15651
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   author             15076 non-null  category
 1   title              15076 non-null  object  
 2   content            15076 non-null  object  
 3   line               15076 non-null  object  
 4   length_in_lines    15076 non-null  int64   
 5   lexical_diversity  15076 non-null  float64 
 6   words              15076 non-null  object  
dtypes: category(1), float64(1), int64(1), object(4)
memory usage: 879.8+ KB


In [4]:
clean_line_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 468222 entries, 1 to 15651
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   author             468222 non-null  category
 1   title              468222 non-null  object  
 2   line               468222 non-null  object  
 3   words              468222 non-null  object  
 4   line_no            468222 non-null  int64   
 5   length_in_words    468222 non-null  int64   
 6   lexical_diversity  468222 non-null  float64 
dtypes: category(1), float64(1), int64(2), object(3)
memory usage: 25.9+ MB


In [5]:
elements = list(clean_poem_df.words)
ids = [str(i) for i in clean_poem_df.index]
print(len(ids))
print(elements[0], ids[0])

15076
['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discretea', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'noncogito'] 1


In [6]:
tagged_poems = [TaggedDocument(element,[i]) for i, element in enumerate(clean_poem_df.words.values)]

In [7]:
tagged_poems[0]

TaggedDocument(words=['philosophic', 'in', 'its', 'complex', 'ovoid', 'emptiness', 'a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort', 'of', 'stopgap', 'doorstop', 'for', 'those', 'quaint', 'equations', 'romans', 'never', 'dreamt', 'of', 'in', 'form', 'completely', 'clever', 'and', 'discretea', 'mirror', 'come', 'unsilvered', 'loose', 'watch', 'face', 'without', 'the', 'works', 'a', 'hollowed', 'globe', 'from', 'tip', 'to', 'toe', 'unbroken', 'it', 'evades', 'the', 'grappling', 'hooks', 'of', 'mass', 'tilts', 'the', 'thin', 'rim', 'of', 'no', 'thing', 'remains', 'embryonic', 'sum', 'noncogito'], tags=[0])

In [8]:
import multiprocessing
cores = multiprocessing.cpu_count()-1

In [9]:
model_dbow = Doc2Vec(vector_size = 500, min_count = 0, dm = 0,
                     alpha=0.025, min_alpha=-0.0001, workers=cores)
model_dbow.build_vocab([x for x in tqdm(tagged_poems)])

100%|██████████| 15076/15076 [00:00<00:00, 2511551.30it/s]


In [10]:
model_dbow.corpus_count

15076

In [11]:
%%time
for epoch in range(50):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged_poems)]), total_examples=model_dbow.corpus_count, epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 15076/15076 [00:00<00:00, 3017001.15it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3017001.15it/s]
100%|██████████| 15076/15076 [00:00<00:00, 2512249.79it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3016713.28it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3016857.21it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3017721.06it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3016281.58it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3016425.47it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3015850.01it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3017001.15it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3015274.77it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3018297.24it/s]
100%|██████████| 15076/15076 [00:00<00:00, 2511950.39it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3018009.12it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3015418.56it/s]
100%|██████████| 15076/15076 [00:00<00:00, 3014268.62it/s]
100%|██████████| 15076/15076 [00:00<00:00, 2512848.80it/

Wall time: 1min 58s


In [12]:
model_dbow.docvecs.vectors_docs.shape

(15076, 500)

In [13]:
model_dbow.save("poem_doc2vec_dbow500.pkl")

In [14]:
model_dbow.wv.most_similar('he')
model_dbow.wv.most_similar('his')

[('trebuchet', 0.1938522458076477),
 ('oerbrimmd', 0.1936078816652298),
 ('ranchera', 0.18213686347007751),
 ('pennies', 0.1783159375190735),
 ('junkpile', 0.17791913449764252),
 ('streetlightlit', 0.17603012919425964),
 ('micheau', 0.17406262457370758),
 ('indite', 0.17216482758522034),
 ('mahlers', 0.17204627394676208),
 ('makerofsevens', 0.1718761920928955)]

In [15]:
# from sklearn.manifold import TSNE
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# # from gensim.models import KeyeVectors
# # dictionary = KeyedVectors.laod_word2vec_format()
# doc_tags = list(model_dbow.docvecs.doctags)
# X = model_dbow[doc_tags]