In [93]:
%matplotlib notebook  

import codecs
import random
import collections

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA 

import nltk
from gensim import corpora, models, similarities
from bs4 import BeautifulSoup

In [2]:
# Load the influence graph; its node ids are later used as corpus page names.
graph_path = "datasets/influences.gexf"
g = nx.read_gexf(graph_path)

In [3]:
# Materialise the node ids as a list: newer networkx releases return a
# read-only view from g.nodes(), which random.shuffle cannot reorder in place.
corpus_entries = list(g.nodes())

# Shuffle with a dedicated, seeded generator so a fresh "Restart & Run All"
# produces the same document order (and the same row order downstream)
# without touching the global `random` state.
random.Random(42).shuffle(corpus_entries)

# Uncomment to iterate on a 1000-document subsample:
# corpus_entries = corpus_entries[0:1000]

In [4]:
# Whitespace-separated stop-word list. Read inside a `with` block so the file
# handle is closed as soon as the words are loaded.
with codecs.open("datasets/stopwords.final.txt", "r", "utf-8") as stopwords_file:
  stopwords = stopwords_file.read().split()

def load(name):
  """Return the text content of the stored HTML page for `name`.

  The page is read from ``datasets/pages/<name>.html`` (every ``/`` in `name`
  is replaced by ``-`` to build the file name), decoded with the
  ``utf-8-sig`` codec, and reduced to text with BeautifulSoup's
  ``html.parser``.

  Errors raised by ``codecs.open`` (e.g. a missing file) propagate unchanged.
  """
  path = "datasets/pages/%s.html" % name.replace("/", "-")

  # The context manager closes the handle even if decoding fails; previously
  # one open file per page was left for the garbage collector to reclaim.
  with codecs.open(path, "r", "utf-8-sig") as page:
    html = page.read()

  return BeautifulSoup(html, "html.parser").get_text()

def tokenize(doc, stop_words=None, threshold=5):
  """Return the distinct, frequent, non-stop-word tokens of `doc`.

  The text is lower-cased and split on whitespace; stop words are dropped and
  only words occurring strictly more than `threshold` times are kept. Each
  surviving word appears once in the result (its count is not returned).

  doc        -- text of one document.
  stop_words -- iterable of words to drop; defaults to the notebook-level
                `stopwords` list.
  threshold  -- a word is kept only if its count exceeds this value.
  """
  if stop_words is None:
    stop_words = stopwords

  # Hash-based lookup: testing every word against a list costs O(len(list)).
  if not isinstance(stop_words, (set, frozenset)):
    stop_words = frozenset(stop_words)

  counter = collections.Counter(
    w for w in doc.lower().split() if w not in stop_words
  )

  return [ w for w, count in counter.items() if count > threshold ]
  
  
def process(name):
  """Load the page stored for `name` and reduce it to its token list."""
  return tokenize(load(name))

# Tokenise every page, keeping the order of `corpus_entries`.
corpus = []
for name in corpus_entries:
  corpus.append(process(name))

In [5]:
# Build the gensim dictionary over all tokenised documents.
dictionary = corpora.Dictionary(corpus)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(dictionary)

Dictionary(24621 unique tokens: [u'schlegel', u'woods', u'francesco', u'1-55546-855-1.', u'jamiat']...)


In [9]:
# Bag-of-words vector for each tokenised document, in corpus order.
training = []
for doc in corpus:
  training.append(dictionary.doc2bow(doc))
# bigrams = models.Phrases(corpus)

# NOTE(review): no seed is passed to the model, so the learned topics
# presumably differ between runs -- confirm and pin one if reproducibility matters.
lda = models.ldamodel.LdaModel(
  corpus=training,
  id2word=dictionary,
  num_topics=250,
  passes=20
)

In [10]:
# Persist the trained model to disk.
lda_model_path = "datasets/lda-250topics.mm"
lda.save(lda_model_path)

## Topics

In [121]:
# Empty frame; the "values" and "score" columns are attached in a later cell.
topics = pd.DataFrame()

In [106]:
# Each item is consumed below as (word/weight pairs, score), one item per topic.
top_topics = lda.top_topics(training, num_words=10)

In [119]:
# One row per topic: its word list rendered as "<v>*<w> ..." plus its score.
topic_rows = [
  (" ".join("%s*%s" % (v, w) for v, w in values), score)
  for values, score in top_topics
]

# Transpose the rows into two parallel columns. list() keeps `c` indexable on
# Python 3, where zip() returns a one-shot iterator and `c[0]` would raise.
c = list(zip(*topic_rows))

In [122]:
# Attach the rendered word lists (c[0]) and the scores (c[1]) as columns.
topics["values"], topics["score"] = c[0], c[1]

In [123]:
# Preview the first rows of the topic table.
topics.head()

Unnamed: 0,values,score
0,0.0267040800759*sees 0.0136910400111*anna-tere...,-5.441197
1,0.0466291279698*articles 0.020191731181*dummet...,-9.359197
2,0.0137053102754*statue 0.00720866825709*oakley...,-13.293532
3,0.00736206292372*should 0.00736206276254*vaila...,-16.039851
4,0.0258286053714*also 0.0257023362935*other 0.0...,-17.61568


In [125]:
# Export the topic summary table.
topics_csv_path = "datasets/lda250.topics.csv"
topics.to_csv(topics_csv_path, encoding="utf-8")

## Visualization

In [54]:
# One row per corpus entry, one column per topic. The column count is read
# from the trained model instead of repeating the literal 250, so the frame
# cannot drift out of sync with `lda`.
philosophers_lda250 = pd.DataFrame(
  index=corpus_entries,
  columns=[ "lda_%s" % i for i in range(lda.num_topics) ]
)

def project(name):
  """Write the LDA topic weights of page `name` into its row of `philosophers_lda250`.

  The page is tokenised again with `process`, converted to a bag-of-words with
  `dictionary` and passed through `lda`. Each returned (topic id, weight) pair
  is stored in column ``lda_<topic id>``; other columns of the row are left
  untouched. Nothing is written when the model returns no topics.
  """
  distribution = lda[dictionary.doc2bow(process(name))]

  if len(distribution) == 0:
    return

  columns = [ "lda_%s" % topic_id for topic_id, _ in distribution ]
  weights = [ weight for _, weight in distribution ]
  philosophers_lda250.loc[name, columns] = weights

# Single-page debugging aid:
#[ project(d) for d in ["Ludwig_Wittgenstein", "Meister_Eckhart"] ];

# Explicit loop instead of map(): project() is called only for its side
# effect, and on Python 3 an unconsumed map() is lazy, so no row would be
# filled and the frame would silently end up all zeros.
for name in philosophers_lda250.index:
  project(name)

# Cells that project() never assigned are still missing; treat them as zero weight.
philosophers_lda250 = philosophers_lda250.fillna(0)

philosophers_lda250.head()

Unnamed: 0,lda_0,lda_1,lda_2,lda_3,lda_4,lda_5,lda_6,lda_7,lda_8,lda_9,...,lda_240,lda_241,lda_242,lda_243,lda_244,lda_245,lda_246,lda_247,lda_248,lda_249
Meister_Eckhart,0.0,0.0,0.0,0.0,0.2922,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Onora_O'Neill,_Baroness_O'Neill_of_Bengarve",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Eric_Dunning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H._G._Wells,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bronisław_Malinowski,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# Export the document-by-topic weight table.
lda_csv_path = "datasets/lda250.csv"
philosophers_lda250.to_csv(lda_csv_path, encoding="utf-8")

In [85]:
# Plain 2-D array (one row per document, one column per topic) for
# scikit-learn. `.values` replaces DataFrame.as_matrix(), which was deprecated
# in pandas 0.23 and removed in 1.0; both return the same ndarray.
M = philosophers_lda250.values

In [86]:
pca = PCA()

# Project onto the principal components, then transpose so that Z[k] holds the
# k-th component of every document (the layout the scatter plots index).
# `.T` replaces zip(*Z): it stays indexable on Python 3, where zip() returns
# an iterator, and avoids building one Python tuple per component.
Z = pca.fit(M).transform(M).T

In [96]:
# First two principal components, one point per document.
fig, ax = plt.subplots()
ax.scatter(Z[0], Z[1])

plt.show()

<IPython.core.display.Javascript object>

In [103]:
from sklearn.cluster import KMeans

# Partition the rows of M into 50 clusters; random_state is fixed at 170.
kmeans = KMeans(n_clusters=50, random_state=170)
y_pred = kmeans.fit_predict(M)

In [136]:
# First two principal components, coloured by KMeans cluster label.
fig, ax = plt.subplots()
ax.scatter(Z[0], Z[1], c=y_pred)

plt.show()

<IPython.core.display.Javascript object>

In [94]:
# Number of principal components the fitted PCA reports a variance ratio for.
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(len(pca.explained_variance_ratio_))

def plot_explained_vr(pca):
  """Plot cumulative explained variance against the number of components kept.

  pca -- fitted object exposing `explained_variance_ratio_` (one ratio per
         component, in component order).

  The curve runs from (0, 0) to (n, sum of all ratios). Returns None; the
  figure is shown through pyplot.
  """
  ratios = np.asarray(pca.explained_variance_ratio_, dtype=float)

  # Prefix sums in a single pass. The previous version summed y[0:i] for
  # i < n, which was quadratic and never included the last component, so the
  # curve stopped short of the full total.
  cumulative = np.concatenate(([0.0], np.cumsum(ratios)))
  n_components = np.arange(len(cumulative))

  plt.figure()
  plt.plot(n_components, cumulative)
  plt.xlabel("number of components")
  plt.ylabel("cumulative explained variance ratio")

  plt.show()

# Draw the cumulative explained-variance curve for the fitted PCA.
plot_explained_vr(pca)

250


<IPython.core.display.Javascript object>

In [138]:
from mpl_toolkits.mplot3d import Axes3D

# First three principal components, coloured by KMeans cluster label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

pc1, pc2, pc3 = Z[0], Z[1], Z[2]
ax.scatter(pc1, pc2, pc3, c=y_pred)

plt.show()

<IPython.core.display.Javascript object>

## t-SNE

In [126]:
from sklearn.manifold import TSNE

In [127]:
# t-SNE is stochastic: without a fixed seed the 2-D embedding changes on every
# run. 170 is the same seed the KMeans cell uses.
tsne = TSNE(n_components=2, random_state=170)

z_tsne = tsne.fit_transform(M)

In [135]:
# Display the cluster labels produced by KMeans.fit_predict.
y_pred

array([20, 27, 27, ..., 27, 32,  0], dtype=int32)

In [137]:
fig = plt.figure()

# Slice the two embedding columns instead of rebinding z_tsne to its own
# transpose. The previous `z_tsne = zip(*z_tsne)` made this cell
# non-idempotent (a second run re-transposed the data, so scatter no longer
# received the two axes) and its result is not indexable on Python 3.
plt.scatter(z_tsne[:, 0], z_tsne[:, 1], c=y_pred)

plt.show()

<IPython.core.display.Javascript object>