In [1]:
from collections import defaultdict

import codecs
import gensim
import networkx as nx
import numpy as np

from bs4 import BeautifulSoup

In [2]:
G = nx.read_gexf("datasets/influences.gexf")
names = [ n for n in G.nodes() ]

In [3]:
def philosopher2vec(name):
  def get_text(name):
    txt = codecs.open("datasets/pages/%s.html" % name,"r", "utf-8-sig").read()

    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()

    return txt
  
  def tokenize(sentences):
    stoplist = set(codecs.open("datasets/stopwords.txt","r", "utf-8-sig").read().split())
    texts = [[word.replace(",", "").replace("'s", "") for word in document.lower().split() if word not in stoplist] for document in sentences]

    frequency = defaultdict(int)

    for text in texts:
      for token in text:
        frequency[token] += 1
    
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    
    return texts
  
  def to_dict(tokens):
    d = gensim.corpora.Dictionary(tokens)

    return d
  
  def to_corpus(d, t):
    c = [d.doc2bow(text) for text in t]
    
    return c

  txt = get_text(name)
  sentences = txt.split(".")
  
  tokens = tokenize(sentences)
  dictionary = to_dict(tokens)
  
  corpus = to_corpus(dictionary, tokens)

  bigrams = gensim.models.Phrases(tokens)

  lsi = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=300)
  word2vec = gensim.models.Word2Vec(bigrams[tokens], size=100, window=5, min_count=5, workers=4, batch_words=100)

  # print tokens
  
  print name
  print "sentences: %s"% len(sentences)
  print "tokens: %s" % int(np.sum([ len(t) for t in tokens ]))
  print dictionary
  # print bigrams
  # print lsi.print_topics(10)
  print
  
  print "word2vec(work + philosophy) = "
  print "------------------------------"
  for (u,s) in word2vec.most_similar(positive=['work', "philosophy"]):
    print "%s: %s" % (u,s)
    
  print

#names = names[0:10]
names = ["Karl_Popper", "Martin_Heidegger", "Ludwig_Wittgenstein", "Bruno_Latour"]
# names = ["Ludwig_Wittgenstein"]
  
[ philosopher2vec(name) for name in names ];

Karl_Popper
sentences: 923
tokens: 6744
Dictionary(1432 unique tokens: [u'', u'searle', u'writings', u'four', u'whose']...)

word2vec(work + philosophy) = 
------------------------------
theory: 0.46767282486
popper: 0.410269141197
code: 0.39680737257
der: 0.385170042515
growth: 0.380040287971
social: 0.355461210012
scientific: 0.350787937641
cannot: 0.348469853401
may: 0.345063239336
science: 0.342136919498

Martin_Heidegger
sentences: 1007
tokens: 8058
Dictionary(1549 unique tokens: [u'', u'limited', u'writings', u'bedeutungslehre', u'augustine']...)

word2vec(work + philosophy) = 
------------------------------
heidegger: 0.883464396
being: 0.799358487129
husserl: 0.797548174858
theory: 0.785413324833
nader_el-bizri: 0.783440470695
being_time: 0.773824095726
published: 0.767580270767
die: 0.765618026257
account: 0.764146089554
thought: 0.761416018009

Ludwig_Wittgenstein
sentences: 1710
tokens: 10101
Dictionary(1902 unique tokens: [u'', u'searle', u'writings', u'augustine', u'linz[e