In [1]:
import os
from string import ascii_letters

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from modeling.model_api import MarkovChainCorpus

In [2]:
def clean_documents(corpus, n=None, cutoff=None):
    
    corpus = corpus[:n].copy()

    for idx, doc in enumerate(corpus):

        doc = doc.replace('\n', ' ')
        doc = doc.replace('\t', ' ')
        doc = doc.replace('>>>>', ' ')
        
        doc = ''.join(d.lower() for d in doc if d in ascii_letters+' ')
        doc = ' '.join(d.strip() for d in doc.split(' '))
            
        corpus[idx] = doc[:cutoff]    
 
    return np.array(corpus)


---

In [3]:
seed = 42
#categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
categories = ['alt.atheism', 'soc.religion.christian']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=seed)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=seed)

In [4]:
tr, te = 1000, 100
X_train = clean_documents(twenty_train.data)[:tr]
y_train = twenty_train.target[:tr]

X_test = clean_documents(twenty_test.data)[:te]
y_test = twenty_test.target[:te]

del (twenty_train, twenty_test, )

---
### MC

In [5]:
mcc = MarkovChainCorpus()

In [6]:
mcc.fit(X_train, y_train)

MarkovChainCorpus()

In [8]:
y_hat_train = mcc.predict(X_train)
np.mean(y_hat_train==y_train)

1.0

In [9]:
y_hat_test = mcc.predict(X_test)
np.mean(y_hat_test==y_test)

0.93

In [15]:
G = mcc.graphs_[0]
synth = G.sample(n=1, stochastic=True, seed=seed)
synth

['from an omniscience didnt say murder what im saying pretty obvious physical or should not generally held to key after   priest ceasing to the good fights organization case that theyd have       the answer key after jesus would certainly a bit into the quran in these other poor analysis of my faith  the atheists are objective values        all have been some evidence of hell lines   understand your sources if that i see if you must be an if we all of the classical lewis is the flood for and so   contradict the quran is often stated in plan we would do so then you heard this a matter of the executions of scripture be true because the sheets of comparable likelihood of it anyway        well which was to someone for the sign there was he explicitly reject god exists  when committing a french a smoldering wick     comprehensive but is travelling with here but abraham knew either a smile or eggs   on the goal  overview of a cage which it is a different view first i doubt    seems silly que

In [20]:
synth[0] in set(X_train), synth[0] in set(X_test)

(False, False)

---
### Bayes

In [21]:
bayes = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [22]:
bayes.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('bayes', MultinomialNB())])

In [23]:
y_hat_train = bayes.predict(X_train)
np.mean(y_hat_train==y_train)

0.956

In [24]:
y_hat_test = bayes.predict(X_test)
np.mean(y_hat_test==y_test)

0.75

---
### SVM

In [25]:
svm = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', LinearSVC()),
])

In [26]:
svm.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('svc', LinearSVC())])

In [27]:
y_hat_train = svm.predict(X_train)
np.mean(y_hat_train==y_train)

0.999

In [28]:
y_hat_test = svm.predict(X_test)
np.mean(y_hat_test==y_test)

0.92

---