In [1]:
import os
from string import ascii_letters

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from modeling import MarkovEntropyClassification, SpectralNodeClustering, CorpusGraph

In [2]:
def clean_documents(corpus, n=None, cutoff=None):
    """Normalize raw text documents to lowercase ASCII letters and single spaces.

    Each document is processed independently:
      1. newlines, carriage returns, tabs and '>>>>' are replaced by a space;
      2. every character that is not an ASCII letter or a space is dropped
         and the remaining letters are lowercased;
      3. runs of spaces are collapsed to a single space;
      4. the result is truncated to ``cutoff`` characters.

    Parameters
    ----------
    corpus : sequence of str
        Raw documents. The input is not modified.
    n : int or None
        Only the first ``n`` documents are cleaned (``None`` means all).
    cutoff : int or None
        Maximum number of characters kept per cleaned document
        (``None`` means no truncation).

    Returns
    -------
    numpy.ndarray
        Array holding the cleaned documents, in input order.
    """
    # Build the allowed alphabet once instead of once per character.
    allowed = frozenset(ascii_letters + ' ')

    cleaned = []
    for doc in corpus[:n]:

        for token in ('\n', '\r', '\t', '>>>>'):
            doc = doc.replace(token, ' ')

        doc = ''.join(ch.lower() for ch in doc if ch in allowed)

        # A fixed number of replace passes leaves long runs of spaces behind;
        # repeat until no double space remains.
        while '  ' in doc:
            doc = doc.replace('  ', ' ')

        cleaned.append(doc[:cutoff])

    return np.array(cleaned)


In [3]:
seed = 42
categories = ['alt.atheism', 'soc.religion.christian']

# Fetch the train and test splits with identical options; only `subset` differs.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=seed)
    for split in ('train', 'test')
)

In [4]:
tr, te = 1000, 100

# Pass the subset size as `n` so clean_documents slices before cleaning: only the
# documents that are kept get processed. Each document is cleaned independently,
# so the result equals cleaning everything and slicing afterwards.
X_train = clean_documents(twenty_train.data, n=tr)
y_train = twenty_train.target[:tr]

X_test = clean_documents(twenty_test.data, n=te)
y_test = twenty_test.target[:te]

# The raw dataset objects are not used past this point.
del (twenty_train, twenty_test, )

In [5]:
# Artificially imbalance classes: keep at most 200 class-0 rows and every class-1 row.
idx = np.concatenate((
    np.flatnonzero(y_train == 0)[:200],
    np.flatnonzero(y_train == 1),
))
X_train = X_train[idx]
y_train = y_train[idx]

---
---
---
# Classification, imbalanced

--- 
### MC (MarkovEntropyClassification)

In [6]:
# Instantiate the project's Markov-entropy classifier with no arguments.
mcc = MarkovEntropyClassification()

In [7]:
# Fit on the artificially imbalanced training subset.
mcc.fit(X_train, y_train)

In [8]:
# Training accuracy: fraction of predictions that equal the training labels.
y_hat_train = mcc.predict(X_train)
np.mean(y_hat_train==y_train)

1.0

In [9]:
# Test accuracy: fraction of predictions that equal the held-out labels.
y_hat_test = mcc.predict(X_test)
np.mean(y_hat_test==y_test)

0.89

---
### Bayes

In [10]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
bayes = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [11]:
# Fit the naive Bayes pipeline on the artificially imbalanced training subset.
bayes.fit(X_train, y_train)

In [12]:
# Training accuracy of the naive Bayes pipeline.
y_hat_train = bayes.predict(X_train)
(y_hat_train == y_train).mean()

0.7768924302788844

In [13]:
# Test accuracy of the naive Bayes pipeline.
y_hat_test = bayes.predict(X_test)
(y_hat_test == y_test).mean()

0.57

---
### SVM

In [14]:
# Bag-of-words counts -> TF-IDF weighting -> linear support vector classifier.
svm = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # LinearSVC's solver uses a random number generator; pin it to the notebook
    # seed so a rerun fits the same model.
    ('svc', LinearSVC(random_state=seed)),
])

In [15]:
# Fit the SVM pipeline on the artificially imbalanced training subset.
svm.fit(X_train, y_train)

In [16]:
# Training accuracy of the SVM pipeline.
y_hat_train = svm.predict(X_train)
(y_hat_train == y_train).mean()

0.99867197875166

In [17]:
# Test accuracy of the SVM pipeline.
y_hat_test = svm.predict(X_test)
(y_hat_test == y_test).mean()

0.76

---

In [18]:
# Drop this section's models and predictions; the same names are rebound in the next section.
del mcc, bayes, svm
del y_hat_train, y_hat_test

---
---
---
# Classification, oversampling

---

In [19]:
# Number of training documents per class label after the artificial imbalance.
np.bincount(y_train)

array([200, 553])

In [20]:
# Build a corpus graph from the class-0 training documents only (the smaller class).
graph_tr = CorpusGraph()
graph_tr.add_documents(X_train[y_train==0])

<modeling.graph.CorpusGraph at 0x316f33ac0>

In [21]:
# Sample 200 synthetic documents from the class-0 graph, one per seed value,
# then append them to the training set with label 0.
synth = [
    graph_tr.sample(stochastic=True, seed=s)
    for s in range(200)
]
X_train_aug = np.concatenate((X_train, synth))
y_train_aug = np.concatenate((y_train, np.zeros(len(synth), dtype=int)))

In [22]:
# Number of training documents per class label after appending the synthetic class-0 samples.
np.bincount(y_train_aug)

array([400, 553])

--- 
### MC (MarkovEntropyClassification)

In [23]:
# Fresh classifier instance for the augmented training set.
mcc = MarkovEntropyClassification()

In [24]:
# Fit on the training set augmented with synthetic class-0 documents.
mcc.fit(X_train_aug, y_train_aug)

In [25]:
# Training accuracy on the augmented set (synthetic documents included).
y_hat_train = mcc.predict(X_train_aug)
np.mean(y_hat_train==y_train_aug)

1.0

In [26]:
# Test accuracy: fraction of predictions that equal the held-out labels.
y_hat_test = mcc.predict(X_test)
np.mean(y_hat_test==y_test)

0.87

---
### Bayes

In [27]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
bayes = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('bayes', MultinomialNB()),
])

In [28]:
# Fit the naive Bayes pipeline on the augmented training set.
bayes.fit(X_train_aug, y_train_aug)

In [29]:
# Training accuracy of the naive Bayes pipeline on the augmented set.
y_hat_train = bayes.predict(X_train_aug)
(y_hat_train == y_train_aug).mean()

0.9171038824763903

In [30]:
# Test accuracy of the naive Bayes pipeline.
y_hat_test = bayes.predict(X_test)
(y_hat_test == y_test).mean()

0.64

---
### SVM

In [31]:
# Bag-of-words counts -> TF-IDF weighting -> linear support vector classifier.
svm = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # LinearSVC's solver uses a random number generator; pin it to the notebook
    # seed so a rerun fits the same model.
    ('svc', LinearSVC(random_state=seed)),
])

In [32]:
# Fit the SVM pipeline on the augmented training set.
svm.fit(X_train_aug, y_train_aug)

In [33]:
# Training accuracy of the SVM pipeline on the augmented set.
y_hat_train = svm.predict(X_train_aug)
(y_hat_train == y_train_aug).mean()

1.0

In [34]:
# Test accuracy of the SVM pipeline.
y_hat_test = svm.predict(X_test)
(y_hat_test == y_test).mean()

0.87

---

In [35]:
# Release this section's models and predictions.
del mcc, bayes, svm
del y_hat_train, y_hat_test

---
---
---