In [1]:
import os
from string import ascii_letters

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from modeling import MarkovEntropyClassification, SpectralNodeClustering, CorpusGraph

In [2]:
def clean_documents(corpus, n=None, cutoff=None):
    """Normalise raw text documents to lowercase ASCII letters and single spaces.

    Each document goes through the following steps, in order:

    1. newlines, carriage returns, tabs and the literal ``'>>>>'`` are each
       replaced by a space;
    2. every character that is not an ASCII letter or a space is dropped and
       the remaining letters are lowercased;
    3. every run of consecutive spaces is collapsed to a single space
       (a single leading or trailing space is kept);
    4. the result is truncated to its first ``cutoff`` characters.

    Parameters
    ----------
    corpus : sequence of str
        Raw documents. The input sequence is not modified.
    n : int, optional
        Slice bound: only ``corpus[:n]`` is cleaned. ``None`` keeps every
        document.
    cutoff : int, optional
        Slice bound applied to each cleaned document (``doc[:cutoff]``).
        ``None`` keeps the whole document.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of cleaned strings, in input order. An empty
        corpus yields an empty string-typed array.
    """
    # Build the whitelist once instead of once per character.
    allowed = frozenset(ascii_letters + ' ')

    cleaned = []
    for doc in corpus[:n]:
        for separator in ('\n', '\r', '\t', '>>>>'):
            doc = doc.replace(separator, ' ')

        # Filter first, then lowercase: lowercasing some non-ASCII characters
        # can produce ASCII letters that would otherwise slip past the filter.
        doc = ''.join(ch for ch in doc if ch in allowed).lower()

        # Repeat until no double space is left; a fixed number of passes
        # leaves long runs (e.g. stripped punctuation) only partly collapsed.
        while '  ' in doc:
            doc = doc.replace('  ', ' ')

        cleaned.append(doc[:cutoff])

    return np.array(cleaned, dtype=str)


In [3]:
seed = 42

# Two-class subset of 20 Newsgroups. A four-class alternative would be
# ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'].
categories = ['alt.atheism', 'soc.religion.christian']

# Fetch the train split, then the test split, with identical category
# filtering, shuffling and seeding.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=seed)
    for split in ('train', 'test')
)

In [4]:
# Number of training / test documents used in the experiments below.
tr, te = 1000, 100

# Pass the limits through `n` so that only the documents that are kept get
# cleaned, instead of cleaning the whole split and slicing afterwards.
X_train = clean_documents(twenty_train.data, n=tr)
y_train = twenty_train.target[:tr]

X_test = clean_documents(twenty_test.data, n=te)
y_test = twenty_test.target[:te]

# The raw dataset objects are not referenced after this cell.
del twenty_train, twenty_test

---
---
---
# Classification

--- 
### Markov entropy classifier (MC)

In [5]:
# Instantiate the classifier from the local `modeling` package with its default arguments.
mcc = MarkovEntropyClassification()

In [6]:
# Fit on the cleaned training documents and their labels; the cell displays fit's return value.
mcc.fit(X_train, y_train)

MarkovEntropyClassification()

In [7]:
# Training accuracy: fraction of training documents whose predicted label equals y_train.
y_hat_train = mcc.predict(X_train)
np.mean(y_hat_train==y_train)

1.0

In [8]:
# Test accuracy: fraction of test documents whose predicted label equals y_test.
y_hat_test = mcc.predict(X_test)
np.mean(y_hat_test==y_test)

0.92

---
### Naive Bayes

In [9]:
# Baseline: token counts -> TF-IDF weighting -> multinomial naive Bayes,
# every step constructed with its default arguments.
bayes = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [10]:
# Fit all pipeline steps on the cleaned training documents; the cell displays fit's return value.
bayes.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('bayes', MultinomialNB())])

In [11]:
# Training accuracy of the naive Bayes pipeline (overwrites the previous y_hat_train).
y_hat_train = bayes.predict(X_train)
np.mean(y_hat_train==y_train)

0.956

In [12]:
# Test accuracy of the naive Bayes pipeline (overwrites the previous y_hat_test).
y_hat_test = bayes.predict(X_test)
np.mean(y_hat_test==y_test)

0.75

---
### SVM

In [13]:
# Baseline: token counts -> TF-IDF weighting -> linear support vector classifier,
# every step constructed with its default arguments.
# NOTE(review): LinearSVC gets no random_state although `seed` is defined in an
# earlier cell - confirm whether this fit should be seeded for reproducibility.
svm = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', LinearSVC()),
])

In [14]:
# Fit all pipeline steps on the cleaned training documents; the cell displays fit's return value.
svm.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('svc', LinearSVC())])

In [15]:
# Training accuracy of the SVM pipeline (overwrites the previous y_hat_train).
y_hat_train = svm.predict(X_train)
np.mean(y_hat_train==y_train)

0.999

In [16]:
# Test accuracy of the SVM pipeline (overwrites the previous y_hat_test).
y_hat_test = svm.predict(X_test)
np.mean(y_hat_test==y_test)

0.92

---

In [17]:
# Drop the fitted models and their predictions; only the data splits are needed from here on.
del y_hat_train, y_hat_test, svm, bayes, mcc, 

---
---
---
# Generation

In [18]:
# Instantiate the corpus graph from the local `modeling` package with its default arguments.
graph = CorpusGraph()

In [19]:
# Feed the cleaned training documents into the graph; the cell displays the call's return value.
graph.add_documents(X_train)

<modeling.graph.CorpusGraph at 0x7ff420bbf7c0>

In [20]:
# Draw one synthetic document from the graph, reusing the notebook-wide seed.
# The exact meaning of `stochastic` is defined in `modeling` - not visible here.
synth = graph.sample(stochastic=True, seed=seed)
synth

'from jcopelannyxcsduedu the messenger if angels are jerks for us all sound argument my mouth of him and distribution worldpublic organization sun select geoffarnoldeastsuncom what is possible answer to meet again secondhand and not need to be quite well now is presented as well doing they will not knowing how can be fair god by asserting that at the posting by having completed the unity of time it is this is in western reserve university of georgia usa lines frankdsuucp frank odwyer subject re some of egypt god it can be specific examples can give everyone for the fact believe as'

In [21]:
# Check whether the sampled text is an exact copy of any training or test document.
synth in set(X_train), synth in set(X_test)

(False, False)

---

In [22]:
# Drop the graph and the sampled text before the clustering section.
del graph, synth, 

---
---
---
# Clustering

In [23]:
# Clusterer from the local `modeling` package, configured with max_clusters=10
# (presumably an upper bound on the number of clusters - confirm in `modeling`).
msvd = SpectralNodeClustering(max_clusters=10)

In [24]:
# Fit on the cleaned training documents; the result is used as a mapping below
# (the displayed items pair a token with an integer cluster id).
clusters_dict = msvd.fit_predict(X_train)

In [25]:
# Peek at the first ten (key, value) pairs of the cluster assignment.
list(clusters_dict.items())[:10]

[('from', 6),
 ('nigelallencanremcom', 2),
 ('nigel', 5),
 ('allen', 6),
 ('subject', 6),
 ('library', 0),
 ('of', 1),
 ('congress', 6),
 ('to', 2),
 ('host', 6)]

---

In [26]:
# Drop the clusterer and its assignments now that the section is finished.
del msvd, clusters_dict,

---
---
---