In [1]:
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image
import time
import math
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import ipywidgets as widgets
import plotly.graph_objects as go
from ipywidgets import Layout

from gensim.models import Word2Vec

from nltk.cluster import KMeansClusterer
import nltk  
from sklearn import cluster
from sklearn import metrics
import gensim.downloader as api

Load googles w2v model.

In [2]:
w2v = api.load('word2vec-google-news-300')

Save the model.

In [3]:
#import tempfile

#with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
#    temporary_filepath = tmp.name
#    w2v.save(temporary_filepath)
    #
    # The model is now safely stored in the filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
#    new_model = Word2Vec.load(temporary_filepath)

In [4]:
DF = pd.read_csv("philosophy_data.csv")

DF["NumOfWords"]=DF["sentence_str"].apply(lambda x: len(x.split(" ")))
DF.head()

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str,NumOfWords
0,Plato - Complete Works,Plato,plato,"What's new, Socrates, to make you leave your ...","What's new, Socrates, to make you leave your ...",-350,1997,125,"what's new, socrates, to make you leave your ...","['what', 'new', 'socrates', 'to', 'make', 'you...","what be new , Socrates , to make -PRON- lea...",24
1,Plato - Complete Works,Plato,plato,Surely you are not prosecuting anyone before t...,Surely you are not prosecuting anyone before t...,-350,1997,69,surely you are not prosecuting anyone before t...,"['surely', 'you', 'are', 'not', 'prosecuting',...",surely -PRON- be not prosecute anyone before ...,13
2,Plato - Complete Works,Plato,plato,The Athenians do not call this a prosecution b...,The Athenians do not call this a prosecution b...,-350,1997,74,the athenians do not call this a prosecution b...,"['the', 'athenians', 'do', 'not', 'call', 'thi...",the Athenians do not call this a prosecution ...,12
3,Plato - Complete Works,Plato,plato,What is this you say?,What is this you say?,-350,1997,21,what is this you say?,"['what', 'is', 'this', 'you', 'say']",what be this -PRON- say ?,5
4,Plato - Complete Works,Plato,plato,"Someone must have indicted you, for you are no...","Someone must have indicted you, for you are no...",-350,1997,101,"someone must have indicted you, for you are no...","['someone', 'must', 'have', 'indicted', 'you',...","someone must have indict -PRON- , for -PRON- ...",19


Extract the tokenized_txt DF["tokenized_txt"] sentences and append them into a python list.

In [5]:
def collect_sentences_from_tokenized_txt(sentences, tokenized_txt):
  for i, sentence in enumerate(tokenized_txt):
    sentences.append(sentence)
  #remove the extra "" that were automatically added when appending?
  return sentences

In [6]:
sentences = []
collect_sentences_from_tokenized_txt(sentences, DF["tokenized_txt"])
sentences[:5]

["['what', 'new', 'socrates', 'to', 'make', 'you', 'leave', 'your', 'usual', 'haunts', 'in', 'the', 'lyceum', 'and', 'spend', 'your', 'time', 'here', 'by', 'the', 'king', 'archon', 'court']",
 "['surely', 'you', 'are', 'not', 'prosecuting', 'anyone', 'before', 'the', 'king', 'archon', 'as', 'am']",
 "['the', 'athenians', 'do', 'not', 'call', 'this', 'prosecution', 'but', 'an', 'indictment', 'euthyphro']",
 "['what', 'is', 'this', 'you', 'say']",
 "['someone', 'must', 'have', 'indicted', 'you', 'for', 'you', 'are', 'not', 'going', 'to', 'tell', 'me', 'that', 'you', 'have', 'indicted', 'someone', 'else']"]

Apply and train our own Word2Vec to sentences (Doc2Vec also exists in gensim.models)

In [7]:
#model = Word2Vec(sentences = sentences, min_count = 1)

Apply the already trained Word2Vec model from Google News dataset that contains about 100 billion words.
There's possibility to train our own model on our own data but the results might be worse.

In [8]:

for index, word in enumerate(w2v.index_to_key):
  if index == 10:
    break
  print("word #{}/{} is {}".format(index, len(w2v.index_to_key), word))

pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, w2v.similarity(w1, w2)))

print(w2v.most_similar(positive=['car', 'minivan'], topn=5))
print(w2v.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06
[('SUV', 0.8532192707061768), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763688564300537), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565720081329346)]
car


This function summarizes each vectorized word in a sentence and divides the sum with the amount of words in the sentence.
Doing this will allow us to be able to get whole sentences vectorized values instead of individual words.

In [9]:
def sentence_vectorizer(sentence, model):
    words = []
    num_words = 0
    for w in sentence:
        try:
            if num_words == 0:
                words = model[w]
            else:
                words = np.add(words, model[w])
        except:
            pass
        num_words += 1
    return np.asarray(words) / num_words

Use the above function with our model (in this case w2v) sentences.

In [10]:

X = []
for sentence in sentences:
    X.append(sentence_vectorizer(sentence, w2v))

print(X[:5])

[array([], dtype=float64), array([], dtype=float64), array([], dtype=float64), array([], dtype=float64), array([], dtype=float64)]
