In [116]:
import codecs
import random
import collections

import nltk
import pandas as pd
import networkx as nx
import sklearn

from bs4 import BeautifulSoup

## prepare data

- training set and cross-validation set
- compute features
- pre-filter

In [2]:
entries = [ "Plato", "Martin_Heidegger", "Ludwig_Wittgenstein", "Bruno_Latour", "René_Descartes", "Immanuel_Kant" ]

def build_corpus(entries):
  corpus = []

  for e in entries:
    txt = codecs.open("pages/%s.html" % e,"r", "utf-8-sig").read()

    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()

    sentences = txt.split(".")

    corpus.extend(sentences)
    
  return corpus

corpus = build_corpus(entries)

print len(corpus)

6970


In [3]:
random.shuffle(corpus)

### features

#### part-of-speech

In [4]:
tokens = []

def tokenize(sentence):
  tokens = []
  
  text_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  
  return text_tagged

tokens = reduce(lambda a,b: a + b, map(tokenize, corpus))

# print tokens
print len(tokens)

104543


In [5]:
random.shuffle(tokens)

tokens[0:20]

[(u'Heidegger', 'NNP'),
 (u'Thoughts', 'NNS'),
 (u'Meta-', 'NNP'),
 (u'by', 'IN'),
 (u'of', 'IN'),
 (u'the', 'DT'),
 (u',', ','),
 (u'this', 'DT'),
 (u'Marijan', 'NNP'),
 (u'of', 'IN'),
 (u'as', 'IN'),
 (u'does', 'VBZ'),
 (u'works', 'NNS'),
 (u'theory', 'NN'),
 (u'German', 'JJ'),
 (u'Earlier', 'JJR'),
 (u'\xa738', 'NN'),
 (u')', ')'),
 (u'morphism', 'NN'),
 (u'Studies', 'NNS')]

In [6]:
[ parts, tags ] = zip(*tokens)

In [12]:
tags_classes = set(tags)

In [13]:
df = pd.DataFrame(columns = list(tags_classes))

for i in set(tokens):
  df.loc[i[0],i[1]] = 1

df.head()

Unnamed: 0,PRP$,VBG,VBD,``,VBN,POS,'',VBP,WDT,JJ,...,CD,EX,IN,WP$,MD,NNPS,JJS,JJR,SYM,UH
Exact,,,,,,,,,,,...,,,,,,,,,,
Nebular,,,,,,,,,,1.0,...,,,,,,,,,,
Liberty,,,,,,,,,,,...,,,,,,,,,,
ORB,,,,,,,,,,,...,,,,,,,,,,
Alexy,,,,,,,,,,,...,,,,,,,,,,


In [14]:
df = df.fillna(0)

print len(df)

14204


#### length

In [15]:
df["length"] = map(lambda x: len(x), df.index)

#### frequency

In [224]:
counter=collections.Counter(parts)
n = len(parts)

# print df.iloc[0:10,0].index
df["occurences"] = map(lambda x: float(counter[x]), df.index)
df["frequency"] = map(lambda x: float(counter[x])/float(n), df.index)

df.iloc[0:10,:]

Unnamed: 0,PRP$,VBG,VBD,``,VBN,POS,'',VBP,WDT,JJ,...,NNPS,JJS,JJR,SYM,UH,length,occurences,frequency,digits,~ name
Exact,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,1.0,1e-05,0,False
Nebular,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,7,1.0,1e-05,0,False
Liberty,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,7,5.0,4.8e-05,0,False
ORB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,1.0,1e-05,0,False
Alexy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,1.0,1e-05,0,True
devastating,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,11,1.0,1e-05,0,False
Note,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,3.0,2.9e-05,0,False
Dostoevsky,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,10,1.0,1e-05,0,False
corrections,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,11,3.0,2.9e-05,0,False
Bambach,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,7,1.0,1e-05,0,False


In [225]:
counter["the"]

3591

#### digits

In [226]:
df["digits"] = map(lambda w: sum([l.isdigit() for l in w]), df.index)

#### names

In [227]:
G = nx.read_gexf("influences.gexf")

names = reduce(lambda a,b: a + b, [ n.split("_") for n in G.nodes() ])
print names[0:20]

['Muhammad', 'Mustafa', 'Jauhar', 'Leonardo', 'Polo', 'Abd', 'al-Rahman', 'al-Kawakibi', u'Jaime', u'Guzm\xe1n', 'Jodi', 'Dean', 'Pietro', 'Verri', 'David', 'Dowty', u'Ivo', u'Urban\u010di\u010d', 'Raoul', 'Vaneigem']


In [228]:
df["~ name"] = map(lambda w: int(w in names), df.index)

## logistisc regression

### train a first model

In [229]:
stopwords_base = codecs.open("stopwords.txt", "r", "utf-8").read().split()
print len(stopwords_base)

67


In [230]:
print len( set(df.index.unique()) & set(stopwords_base) )

64


In [231]:
df2 = df.copy()

df2["stopword"] = 0

df2.loc[list(set(df.index.unique()) & set(stopwords_base)), ["stopword"]] = 1

print len(df2[df2["stopword"] == 1])

64


In [232]:
df2.head()

Unnamed: 0,PRP$,VBG,VBD,``,VBN,POS,'',VBP,WDT,JJ,...,JJS,JJR,SYM,UH,length,occurences,frequency,digits,~ name,stopword
Exact,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,1.0,1e-05,0,0,0
Nebular,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,7,1.0,1e-05,0,0,0
Liberty,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,7,5.0,4.8e-05,0,0,0
ORB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,1.0,1e-05,0,0,0
Alexy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,1.0,1e-05,0,1,0


In [233]:
train1 = df2[df2["stopword"] == 1]
train1 = train1.append(df2[df2["stopword"] == 0].iloc[0:128,:])

print len(train1)

192


In [234]:
codecs.open("stopwords.training.txt", "w", "utf-8").write("\n".join(train1.index))

In [235]:
features = [ c for c in train1.columns if c != "stopword" ]

X = train1.loc[:,features]
X_normalized = sklearn.preprocessing.normalize(X, norm='l1')
Y = train1["stopword"].as_matrix()

clf = sklearn.linear_model.LogisticRegression(penalty='l1', tol=1e-6, solver="liblinear")
clf.fit(X_normalized, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [236]:
cv1 = df2.loc[[ index for index in df2.index if index not in train1.index ],:] 
cv1 = cv1.iloc[0:1000,:]

cv1_ypredict = clf.predict(sklearn.preprocessing.normalize(cv1.loc[:, features]))

In [237]:
cv1["stopword_predict"] = cv1_ypredict

print len(cv1[cv1["stopword_predict"] == 1])

45


In [238]:
print cv1[cv1["stopword_predict"] == 1].index

Index([u']', u'René', u'26', u'after', u'work', u'Was', u'used', u'fact', u'O',
       u'most', u'44', u'London', u''', u'1991', u'd', u'la', u'knowledge',
       u'later', u'32', u'Works', u'Plato', u'22', u'These', u'11', u'France',
       u'The', u'G', u'New', u'told', u'45', u'y', u'end', u'”', u'up',
       u'form', u'Hitler', u'Vol', u'much', u'Athens', u'text', u'8', u'2012',
       u'If', u'–', u'argued'],
      dtype='object')


In [187]:
with codecs.open("stopwords.cv.generated.txt", "w", "utf-8") as f:
  f.write("\n".join(cv1[cv1["stopword_predict"] == 1].index))

In [188]:
with codecs.open("stopwords.cv.generated.n.txt", "w", "utf-8") as f:
  f.write("\n".join(cv1[cv1["stopword_predict"] == 0].index))

In [189]:
cv_fp = codecs.open("stopwords.cv.control-fp.txt", "r", "utf-8").read().split()
cv_fn = codecs.open("stopwords.cv.control-fn.txt", "r", "utf-8").read().split()

In [239]:
cv1["stopword"] = cv1["stopword_predict"]

cv1.loc[[ w for w in cv_fp if w in cv1.index],"stopword"] = 0
cv1.loc[[ w for w in cv_fn if w in cv1.index],"stopword"] = 1

In [240]:
train2 = train1.append(cv1)

X2 = train2.loc[:, features]
Y2 = train2["stopword"]

clf2 = sklearn.linear_model.LogisticRegression(penalty='l1', tol=1e-6, solver="liblinear")
clf2.fit(sklearn.preprocessing.normalize(X2), Y2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [241]:
cv2 = df2.loc[[ index for index in df2.index if index not in train2.index ],:] 
cv2 = cv2.iloc[0:1000,:]

cv2_ypredict = clf2.predict(sklearn.preprocessing.normalize(cv2.loc[:, features]))

In [247]:
cv2["stopword_predict"] = cv2_ypredict

print len(cv2[cv2["stopword_predict"] == 1])

77


In [248]:
print cv2[cv2["stopword_predict"] == 1].index

Index([u'good', u'1889-04-26', u'382–384', u'[', u'978-2-84066-557-1', u'984',
       u'6', u'027076164', u'374', u'et', u'de', u'reason', u'1889—1951',
       u'Descartes', u'231', u'9783832926045', u'1682–1746', u'Cambridge',
       u'jn19990009594', u'V', u'119404923', u'1911-1951', u'1904', u'277',
       u'our', u'135–145', u'1635–36', u'1890', u'Retrieved', u'ethics',
       u'ISBN', u'258–289', u'0-87220-349-2', u'1643', u'0-674-92905-5', u'—',
       u'800', u'1-4184-4977-6', u'167', u'19', u'2849', u'313', u'421',
       u'1950–51', u'''', u'because', u'978-1-930972-09-4', u'145–146',
       u'2013', u'0-486-41605-4', u'309', u'500248317', u'0-253-21800-4', u'§',
       u'H', u'0266-9080', u'This', u'2009', u'John', u'1927–1961', u'373–377',
       u'As', u'0-19-508645-7', u'His', u'Socrates', u'293–303', u'must',
       u'law', u'1745–47', u'978-1-930972-79-7', u'02694507X',
       u'0-271-02083-0', u'e', u'1889–1936', u'1936/7', u'ed', u'1750–58'],
      dtype='object')


In [249]:
print set(cv2[cv2["stopword_predict"] == 1].index) & set(cv_fp)

set([u'Socrates', u'good', u'reason', u'law', u'Descartes'])


In [250]:
final = train2.append(cv2)

print len(final)
print len(final[final["stopword"] == 1].index)

2192
142


In [251]:
cv2.loc[list(set(cv2[cv2["stopword_predict"] == 1].index) & set(cv_fp)),"stopword"] = 0

In [254]:
codecs.open("stopwords.final.txt", "w", "utf-8").write("\n".join(final[final["stopword"] == 1].index))

In [253]:
test = df.loc[[ w for w in df.index if w not in final.index], :]

test_ypredict = clf2.predict(sklearn.preprocessing.normalize(test.loc[:, features]))

test["stopword_predict"] = test_ypredict

print len(test[test["stopword_predict"] == 1])
print test[test["stopword_predict"] == 1].index

codecs.open("stopwords.test.p.txt", "w", "utf-8").write("\n".join(test[test["stopword_predict"] == 1].index))

805
Index([u'0-521-64836-X', u'see', u'1748', u'510', u'4025', u'1947–1951',
       u'180', u'und', u'28', u'W',
       ...
       u'1700', u'1615–16', u'0-691-02391-3', u'But', u'1634–35', u'am',
       u'119–24', u'9780674043237', u'1163/156852880x00025', u'135798642'],
      dtype='object', length=805)


14204


In [261]:
df3 = df.copy()

all_predict = clf2.predict(sklearn.preprocessing.normalize(df.loc[:, features]))

df3["stopword_predict"] = all_predict

codecs.open("stopwords.all.p.txt", "w", "utf-8").write("\n".join(df3[(df3["stopword_predict"] == 1) & (df3["~ name"] == False)].index))