In [None]:
import os
from string import ascii_letters

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from modeling import MarkovEntropyClassification, MarkovSVDClustering, CorpusGraph

In [None]:
def clean_documents(corpus, n=None, cutoff=None):
    """Normalize raw documents to lowercase ASCII words separated by single spaces.

    For every document: newlines, tabs and '>>>>' runs become spaces, every
    character that is not an ASCII letter or a space is removed, the remaining
    text is lower-cased, and runs of whitespace are collapsed to one space
    (leading/trailing whitespace is dropped).

    Parameters
    ----------
    corpus : sequence of str
        Documents to clean (list, tuple or array). The input is not modified.
    n : int, optional
        Keep only the first ``n`` documents. ``None`` keeps all of them.
    cutoff : int, optional
        Truncate each cleaned document to at most ``cutoff`` characters.
        ``None`` keeps the full text.

    Returns
    -------
    numpy.ndarray
        1-D string array holding the cleaned documents, in input order.
    """
    # Built once instead of once per character; set membership is O(1).
    allowed = frozenset(ascii_letters + ' ')

    cleaned = []
    for doc in corpus[:n]:
        # Turn separators into spaces *before* filtering so the words on
        # either side of them do not get glued together.
        for separator in ('\n', '\t', '>>>>'):
            doc = doc.replace(separator, ' ')

        doc = ''.join(ch.lower() for ch in doc if ch in allowed)

        # split() with no argument drops empty fields, so this really collapses
        # whitespace runs (splitting on ' ' and re-joining was a no-op).
        doc = ' '.join(doc.split())

        cleaned.append(doc[:cutoff])

    # dtype=str keeps a string array even when the corpus is empty.
    return np.array(cleaned, dtype=str)


In [None]:
# Fixed seed: used for the dataset shuffle below and reused later in the notebook.
seed = 42
# Alternative four-category setup, currently disabled:
#categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
categories = ['alt.atheism', 'soc.religion.christian']
# Fetch the train and test subsets restricted to `categories`, shuffled with the fixed seed.
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=seed)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=seed)

In [None]:
# Maximum number of training / test documents to keep.
tr, te = 1000, 100

# Let clean_documents truncate through its `n` argument so only the documents
# that are kept get cleaned, instead of cleaning the whole corpus and then
# discarding everything past the first `tr` / `te` entries.
X_train = clean_documents(twenty_train.data, n=tr)
y_train = twenty_train.target[:tr]

X_test = clean_documents(twenty_test.data, n=te)
y_test = twenty_test.target[:te]

# Free the raw dataset objects. Re-running this cell afterwards requires
# re-running the fetch cell first.
del twenty_train, twenty_test

---
---
---
# Classification

--- 
### Markov entropy classifier (MC)

In [None]:
# Instantiate the project's Markov entropy classifier (no constructor arguments).
mcc = MarkovEntropyClassification()

In [None]:
# Fit on the cleaned training documents and their newsgroup labels.
mcc.fit(X_train, y_train)

In [None]:
# Training accuracy: fraction of training documents whose prediction equals the label.
y_hat_train = mcc.predict(X_train)
np.mean(y_hat_train==y_train)

In [None]:
# Test accuracy: fraction of held-out documents whose prediction equals the label.
y_hat_test = mcc.predict(X_test)
np.mean(y_hat_test==y_test)

---
### Naive Bayes

In [None]:
# Baseline: token counts -> TF-IDF weighting -> multinomial naive Bayes,
# all with scikit-learn's default parameters.
bayes = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [None]:
# Fit the whole pipeline (vectorizer, TF-IDF and classifier) on the training split.
bayes.fit(X_train, y_train)

In [None]:
# Training accuracy of the naive Bayes pipeline.
y_hat_train = bayes.predict(X_train)
np.mean(y_hat_train==y_train)

In [None]:
# Test accuracy of the naive Bayes pipeline.
y_hat_test = bayes.predict(X_test)
np.mean(y_hat_test==y_test)

---
### SVM

In [None]:
# Baseline: token counts -> TF-IDF weighting -> linear support vector classifier.
svm = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # LinearSVC's dual coordinate-descent solver shuffles the data; pin its RNG
    # to the notebook seed so the reported accuracies are reproducible.
    ('svc', LinearSVC(random_state=seed)),
])

In [None]:
# Fit the whole pipeline (vectorizer, TF-IDF and classifier) on the training split.
svm.fit(X_train, y_train)

In [None]:
# Training accuracy of the SVM pipeline.
y_hat_train = svm.predict(X_train)
np.mean(y_hat_train==y_train)

In [None]:
# Test accuracy of the SVM pipeline.
y_hat_test = svm.predict(X_test)
np.mean(y_hat_test==y_test)

---

In [None]:
# Drop the classification models and predictions before the next section.
del y_hat_train, y_hat_test, svm, bayes, mcc, 

---
---
---
# Generation

In [None]:
# Instantiate the project's corpus graph (no constructor arguments).
graph = CorpusGraph()

In [None]:
# Feed the cleaned training documents into the graph.
graph.add_documents(X_train)

In [None]:
# Draw a single sample from the graph using the notebook seed and display it.
# NOTE(review): presumably `stochastic=True` selects random rather than
# deterministic sampling — confirm against modeling.CorpusGraph.sample.
synth = graph.sample(n=1, stochastic=True, seed=seed)
synth[0]

In [None]:
# Check whether the sampled text appears verbatim among the training or test documents.
synth[0] in set(X_train), synth[0] in set(X_test)

---

In [None]:
# Drop the generation objects before the next section.
del graph, synth, 

---
---
---
# Clustering

In [None]:
# Instantiate the project's Markov/SVD clustering model.
# NOTE(review): `max_clusters=10` presumably caps the number of clusters —
# confirm against modeling.MarkovSVDClustering.
msvd = MarkovSVDClustering(max_clusters=10)

In [None]:
# Fit on the cleaned training documents only; no labels are passed.
msvd.fit(X_train)

In [None]:
# predict() is called without arguments; the result is consumed as a dict below.
# NOTE(review): presumably it holds cluster assignments for the fitted data — confirm.
clusters_dict = msvd.predict()

In [None]:
# Preview the first ten (key, value) pairs instead of dumping the whole dict.
list(clusters_dict.items())[:10]

---

In [None]:
# Drop the clustering objects at the end of the section.
del msvd, clusters_dict,

---
---
---