In [1]:
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import numpy as np

## 1. Raw Data

In [7]:
# The four newsgroups used as class labels for the classification task.
categories = [
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'misc.forsale',
    'soc.religion.christian',
]

# Fixed seed so the shuffled document order is identical on every
# Restart & Run All (random_state=None reshuffles differently each run).
train_dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test_dataset = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

## 2. Preprocessing

In [3]:
## stop words
import nltk
from string import punctuation
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

# Needs the NLTK 'stopwords' corpus; run nltk.download('stopwords') once if it is missing.
stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')

# Everything to drop from the vocabulary: NLTK + scikit-learn English stop
# words, plus each single punctuation character.
combined_stopwords = set(stop_words_en) | set(punctuation) | set(stop_words_skt)

In [4]:
## Lemmatizer
from nltk import pos_tag
# One-time NLTK resource downloads (uncomment and run if a resource is missing):
# nltk.download('punkt')  # provides "tokenizers/punkt/english.pickle"
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# Shared WordNet lemmatizer instance used by the lemmatize_* helpers below.
# Relies on `nltk` having been imported by an earlier cell.
wnl = nltk.wordnet.WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS letter.

    Only the first two characters of the tag are used, so 'NNS' and 'NNP'
    both map like 'NN'.

    Parameters
    ----------
    penntag : str
        Penn Treebank tag such as 'NN', 'VBD' or 'JJR'.

    Returns
    -------
    str
        'n', 'a', 'v' or 'r'. Unknown tags, and inputs that cannot be sliced
        or hashed (e.g. None), fall back to 'n' (noun).
    """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag prefix not in the table.
        # TypeError: input is not sliceable/hashable (e.g. None).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return 'n'

def lemmatize_sent_demo(text):
    """Tokenize a raw string, POS-tag the tokens, and lemmatize each one.

    Returns a list with one lowercased lemma per token, in token order.
    """
    tagged_tokens = pos_tag(nltk.word_tokenize(text))
    lemmas = []
    for token, penn_tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag)))
    return lemmas
def lemmatize_sent(list_word):
    """POS-tag an already-tokenized sequence of words and lemmatize each one.

    `list_word` is passed straight to `pos_tag`, so it is a sequence of
    tokens (not a raw string). Returns a list with one lowercased lemma per
    token, in token order.
    """
    lemmas = []
    for token, penn_tag in pos_tag(list_word):
        lemmas.append(wnl.lemmatize(token.lower(), pos=penn2morphy(penn_tag)))
    return lemmas

In [5]:
# Build the default CountVectorizer analyzer once; the custom analyzer
# callables defined below call it to split a document into tokens before
# applying their own stemming / lemmatization and filtering.
from sklearn.feature_extraction.text import CountVectorizer
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    """Return a generator of stemmed tokens for `doc`.

    Tokens come from the module-level `analyzer`; each is passed through
    `stemmer.stem`.
    NOTE(review): `stemmer` is not defined in the visible part of this
    notebook - confirm it exists before using this function.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)
def stem_rmv_punc(doc):
    """Analyzer callable: lemmatize `doc` and drop stop words and numbers.

    The document is tokenized with `analyzer` and lemmatized with
    `lemmatize_sent`; the returned generator yields only lemmas that are
    neither in `combined_stopwords` nor accepted by `is_number`.
    """
    lemmas = lemmatize_sent(analyzer(doc))
    return (
        lemma
        for lemma in lemmas
        if not (lemma in combined_stopwords or is_number(lemma))
    )

def is_number(s):
    """Return True if `float(s)` succeeds, False if it raises ValueError.

    Note that `float` also accepts strings such as 'nan' and 'inf', so those
    count as numbers here. Exceptions other than ValueError (for example the
    TypeError raised for None) are not caught.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

## 3. Pipelines

In [15]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF


from sklearn.pipeline import Pipeline

### (1) pipelines

In [31]:
# All six pipelines share the same shape:
#   lemmatizing count vectorizer -> tf-idf -> 50-dim reduction -> classifier
# Only the reducer (NMF vs. LSI/TruncatedSVD) and the classifier differ.

def _build_pipeline(reducer, classifier):
    """Assemble vectorize -> tf-idf -> dimensionality reduction -> classifier.

    `stop_words` is deliberately not passed to CountVectorizer: that option
    only applies when analyzer == 'word', so with a callable analyzer it is
    ignored. Stop-word removal is already done inside `stem_rmv_punc`.

    Parameters
    ----------
    reducer : estimator used for the 'reduce_dim' step.
    classifier : estimator used for the final 'clf' step.

    Returns
    -------
    Pipeline with steps named 'vect', 'tfidf', 'reduce_dim', 'clf'.
    """
    return Pipeline([
        ('vect', CountVectorizer(min_df=3, analyzer=stem_rmv_punc)),
        ('tfidf', TfidfTransformer()),
        ('reduce_dim', reducer),
        ('clf', classifier),
    ])


def _nmf():
    """Return a fresh 50-component NMF reducer (random init, seed 0)."""
    return NMF(n_components=50, init='random', random_state=0)


def _lsi():
    """Return a fresh 50-component TruncatedSVD (LSI) reducer (seed 0)."""
    return TruncatedSVD(n_components=50, random_state=0)


def _svc_ovo():
    """Return a fresh SVC with a one-vs-one decision function."""
    return SVC(C=1, gamma=10, probability=True, decision_function_shape='ovo')


# Each pipeline gets its own estimator instances so fitting one never
# affects another.

# Naive Bayes
pipe_min_df_3_NMF_NB = _build_pipeline(_nmf(), GaussianNB())
pipe_min_df_3_LSI_NB = _build_pipeline(_lsi(), GaussianNB())

# SVM one vs one
pipeline_min_df_3_NMF_SVM_ovo = _build_pipeline(_nmf(), _svc_ovo())
pipeline_min_df_3_LSI_SVM_ovo = _build_pipeline(_lsi(), _svc_ovo())

# SVM one vs rest
pipeline_min_df_3_NMF_SVM_ovr = _build_pipeline(_nmf(), LinearSVC())
pipeline_min_df_3_LSI_SVM_ovr = _build_pipeline(_lsi(), LinearSVC())

### (2) helper functions

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
%matplotlib inline

def plot_roc(fpr, tpr):
    """Draw an ROC curve on a new figure, with its AUC shown in the legend.

    `fpr` and `tpr` are passed unchanged to `auc` and to the line plot
    (x = fpr, y = tpr). Returns nothing.
    """
    _, axes = plt.subplots()

    area = auc(fpr, tpr)
    curve_label = 'area under curve = %0.4f' % area
    axes.plot(fpr, tpr, lw=2, label=curve_label)

    # Dashed light-grey grid; the x range is padded slightly past [0, 1].
    axes.grid(color='0.7', linestyle='--', linewidth=1)
    axes.set_xlim([-0.1, 1.1])
    axes.set_ylim([0.0, 1.05])

    axes.set_xlabel('False Positive Rate', fontsize=15)
    axes.set_ylabel('True Positive Rate', fontsize=15)
    axes.legend(loc="lower right")

    # Match the tick-label size to the axis-label size.
    tick_labels = [*axes.get_xticklabels(), *axes.get_yticklabels()]
    for tick_label in tick_labels:
        tick_label.set_fontsize(15)
        
def fit_predict_and_plot_roc(pipe, train_data, train_label, test_data, test_label):
    """Fit `pipe` on the training set and print test-set metrics.

    Prints the confusion matrix, the accuracy, and the macro-averaged
    recall, precision and F1 score of the predictions on `test_data`.
    Despite the name, this function does not plot an ROC curve. Returns
    nothing; `pipe` is fitted in place.
    """
    pipe.fit(train_data, train_label)
    predictions = pipe.predict(test_data)

    print('confusion_matrix =', '\n', confusion_matrix(test_label, predictions))
    print('accuracy score =', accuracy_score(test_label, predictions))

    # The remaining metrics are all macro-averaged over the classes.
    macro_metrics = (
        ('recall score =', recall_score),
        ('precision score =', precision_score),
        ('F1 score =', f1_score),
    )
    for title, metric in macro_metrics:
        print(title, metric(test_label, predictions, average='macro'))

### (3) results

In [28]:
# NMF features + SVM, one vs one
fit_predict_and_plot_roc(
    pipeline_min_df_3_NMF_SVM_ovo,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[303  64  25   0]
 [ 76 281  27   1]
 [ 47  22 320   1]
 [  9   2   8 379]]
accuracy score = 0.819808306709
recall score = 0.818900860147
precision score = 0.82373131466
F1 score = 0.820584582024


In [33]:
# LSI features + SVM, one vs one
fit_predict_and_plot_roc(
    pipeline_min_df_3_LSI_SVM_ovo,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[326  39  26   1]
 [ 44 314  27   0]
 [ 22  18 348   2]
 [  6   1   1 390]]
accuracy score = 0.880511182109
recall score = 0.87985606461
precision score = 0.880304888224
F1 score = 0.87994709393


In [29]:
# NMF features + linear SVM, one vs rest
fit_predict_and_plot_roc(
    pipeline_min_df_3_NMF_SVM_ovr,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[282  65  43   2]
 [ 68 278  35   4]
 [ 41  19 327   3]
 [  0   1   9 388]]
accuracy score = 0.814696485623
recall score = 0.813700396875
precision score = 0.813563222286
F1 score = 0.813288871546


In [34]:
# LSI features + linear SVM, one vs rest
fit_predict_and_plot_roc(
    pipeline_min_df_3_LSI_SVM_ovr,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[314  53  25   0]
 [ 42 316  24   3]
 [ 20  20 348   2]
 [  4   1   1 392]]
accuracy score = 0.875399361022
recall score = 0.874757986091
precision score = 0.874587400147
F1 score = 0.874591906634


In [35]:
# NMF features + Gaussian Naive Bayes
fit_predict_and_plot_roc(
    pipe_min_df_3_NMF_NB,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[287  50  46   9]
 [121 208  51   5]
 [ 52  45 284   9]
 [  2   1   3 392]]
accuracy score = 0.748242811502
recall score = 0.746383087181
precision score = 0.747396073529
F1 score = 0.743521126236


In [32]:
# LSI features + Gaussian Naive Bayes
fit_predict_and_plot_roc(
    pipe_min_df_3_LSI_NB,
    train_dataset.data, train_dataset.target,
    test_dataset.data, test_dataset.target,
)

confusion_matrix = 
 [[228  34 127   3]
 [ 95 158 130   2]
 [ 48  39 299   4]
 [  0   0  17 381]]
accuracy score = 0.681150159744
recall score = 0.67899384057
precision score = 0.699319006424
F1 score = 0.674652274163
