In [1]:
import os
from string import ascii_letters

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from modeling.mc_corpus import CorpusGraph
from modeling.utils import membership

In [2]:
def clean_documents(corpus, n=None, cutoff=None):
    
    corpus = corpus[:n].copy()

    for idx, doc in enumerate(corpus):

        doc = doc.replace('\n', ' ')
        doc = doc.replace('\t', ' ')
        doc = doc.replace('>>>>', ' ')
        
        doc = ''.join(d.lower() for d in doc if d in ascii_letters+' ')
        doc = ' '.join(d.strip() for d in doc.split(' '))
            
        corpus[idx] = doc[:cutoff]    
 
    return np.array(corpus)


In [3]:
def predict(graphs, documents):
    
    return np.argmax(
        [membership(graphs, doc) for doc in documents], 
        axis=1,
    )
    

---

In [4]:
seed = 42
#categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
categories = ['alt.atheism', 'soc.religion.christian']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=seed)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=seed)

In [5]:
X_train = clean_documents(twenty_train.data)
y_train = twenty_train.target

X_test = clean_documents(twenty_test.data)
y_test = twenty_test.target

del (twenty_train, twenty_test, )

---
### MC

In [9]:
G0 = CorpusGraph().add_documents(X_train[y_train==0])
G1 = CorpusGraph().add_documents(X_train[y_train==1])

In [10]:
y_train_pred = predict([G0, G1], X_train)
np.mean(y_train_pred==y_train)

0.9990732159406858

In [11]:
y_test_pred = predict([G0, G1], X_test)
np.mean(y_test_pred==y_test)

0.9316596931659693

In [12]:
G0.sample(n=1, stochastic=True, seed=0)

['from jaegerbuphybuedu gregg jaeger writes you might talk show signs whatever it like little to me wrong and one thing   the stories and therefore received courtesy in general that of the others and yes  however maternity leave on mark goes into a series of any of thinking what have values that it is also my claim to find a    the quran did you will name calling loans than those things which make possible ']

---
### Bayes

In [13]:
pipe = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [14]:
pipe.fit(X_train, y_train);

In [15]:
y_train_pred = pipe.predict(X_train)
np.mean(y_train_pred==y_train)

0.9499536607970342

In [16]:
y_test_pred = pipe.predict(X_test)
np.mean(y_test_pred==y_test)

0.7949790794979079

---
### SVM

In [17]:
pipe = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', LinearSVC()),
])

In [18]:
pipe.fit(X_train, y_train);

In [19]:
y_train_pred = pipe.predict(X_train)
np.mean(y_train_pred==y_train)

0.9990732159406858

In [20]:
y_test_pred = pipe.predict(X_test)
np.mean(y_test_pred==y_test)

0.9288702928870293

---