In [26]:
from collections import defaultdict

import codecs
import gensim
import networkx as nx
import numpy as np

from bs4 import BeautifulSoup

In [11]:
G = nx.read_gexf("datasets/influences.gexf")
names = [ n for n in G.nodes() ]

In [100]:
def philosopher2vec(name):
  def get_text(name):
    txt = codecs.open("datasets/pages/%s.html" % name,"r", "utf-8-sig").read()

    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()

    return txt
  
  def tokenize(sentences):
    stoplist = set(codecs.open("datasets/stopwords.txt","r", "utf-8-sig").read().split())
    texts = [[word.replace(",", "").replace("'s", "") for word in document.lower().split() if word not in stoplist] for document in sentences]

    frequency = defaultdict(int)

    for text in texts:
      for token in text:
        frequency[token] += 1
    
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    
    return texts
  
  def to_dict(tokens):
    d = gensim.corpora.Dictionary(tokens)

    return d
  
  def to_corpus(d, t):
    c = [d.doc2bow(text) for text in t]
    
    return c

  txt = get_text(name)
  sentences = txt.split(".")
  
  tokens = tokenize(sentences)
  dictionary = to_dict(tokens)
  
  corpus = to_corpus(dictionary, tokens)

  bigrams = gensim.models.Phrases(tokens)

  lsi = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=300)
  word2vec = gensim.models.Word2Vec(bigrams[tokens], size=100, window=5, min_count=5, workers=4)

  # print tokens
  
  print name
  print "sentences: %s"% len(sentences)
  print "tokens: %s" % int(np.sum([ len(t) for t in tokens ]))
  print dictionary
  # print bigrams
  # print lsi.print_topics(10)
  print
  
  print "word2vec(work + philosophy) = "
  print "------------------------------"
  for (u,s) in word2vec.most_similar(positive=['work', "philosophy"]):
    print "%s: %s" % (u,s)
    
  print

#names = names[0:10]
names = ["Karl_Popper", "Martin_Heidegger", "Ludwig_Wittgenstein", "Bruno_Latour"]
# names = ["Ludwig_Wittgenstein"]
  
[ philosopher2vec(name) for name in names ];



Karl_Popper
sentences: 923
tokens: 6753
Dictionary(1433 unique tokens: [u'', u'searle', u'writings', u'four', u'whose']...)

word2vec(work + philosophy) = 
------------------------------
popper: 0.742815852165
theory: 0.73952049017
scientific: 0.678596794605
science: 0.667265892029
history: 0.656752705574
may: 0.650360643864
social: 0.644129633904
logical: 0.634747862816
mind: 0.62707811594
theories: 0.62146794796

Martin_Heidegger




sentences: 1007
tokens: 8065
Dictionary(1550 unique tokens: [u'', u'limited', u'writings', u'bedeutungslehre', u'augustine']...)

word2vec(work + philosophy) = 
------------------------------
heidegger: 0.98455542326
critical: 0.967774987221
being: 0.965728580952
being_time: 0.963882267475
heidegger:: 0.960699319839
husserl: 0.958516895771
metaphysics: 0.956396341324
nader_el-bizri: 0.95593804121
published: 0.952707707882
influence: 0.952072918415

Ludwig_Wittgenstein




sentences: 1710
tokens: 10136
Dictionary(1903 unique tokens: [u'', u'searle', u'writings', u'augustine', u'linz[edit]']...)

word2vec(work + philosophy) = 
------------------------------
wittgenstein: 0.928377568722
theory: 0.9000390172
language: 0.875064074993
logical: 0.851612448692
ludwig: 0.843339800835
david: 0.84154176712
moore: 0.841473162174
university: 0.836565077305
cambridge: 0.833699047565
school: 0.833020567894

Bruno_Latour
sentences: 297
tokens: 1677
Dictionary(431 unique tokens: [u'', u'global', u'focus', u'steve', u'tours']...)

word2vec(work + philosophy) = 
------------------------------
(2005): 0.220820605755
press: 0.208609819412
sociology: 0.164911314845
society: 0.156753599644
b: 0.151382014155
theory: 0.147827029228
anthropology: 0.13843677938
laboratory: 0.137707024813
technology: 0.113852225244
life: 0.113830879331

