In [51]:
#import functions
import collections
import nltk
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from string import punctuation
from nltk import pos_tag
from pickle import dump
from nltk.corpus import stopwords

#switches: set a flag to True to run the corresponding part
PART_A = False
PART_B = False
PART_E = False
PART_F = False
PART_G = False
PART_H = False
PART_I = False
PART_J = True
#minimum document frequency passed to the CountVectorizers (2 or 5)
min_dfs = 2

#uncomment whichever NLTK resources still have to be downloaded
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download()
#nltk.download('stopwords' )

#eight-category data set used by every part except part C and the multiclass part
comp_categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
]
rec_categories = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]
binary_categories = comp_categories + rec_categories
combine_train = fetch_20newsgroups(subset='train', categories=binary_categories, shuffle=True, random_state=42)
combine_test = fetch_20newsgroups(subset='test', categories=binary_categories, shuffle=True, random_state=42)

#stop-word set: scikit-learn's English list, NLTK's English list and the
#characters of string.punctuation, merged into one set
stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')
combined_stopwords = set(stop_words_en) | set(punctuation) | set(stop_words_skt)

#tokenizer and lemmatizer shared by the custom analyzers below (part B)
analyzer = CountVectorizer().build_analyzer()
wnl = nltk.wordnet.WordNetLemmatizer()
def stemmed_words(doc):
    """Return a generator of the stemmed tokens of ``doc``.

    ``doc`` is tokenized with the module-level ``analyzer``; each token is
    passed through ``stemmer.stem`` as the generator is consumed.
    """
    # NOTE(review): ``stemmer`` is not defined anywhere in this cell -- it has
    # to exist before the returned generator is iterated. TODO confirm where
    # it is meant to come from.
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)
def stem_rmv_punc(doc):
    """Return a generator of the lemmatized tokens of ``doc`` worth keeping.

    ``doc`` is tokenized with ``analyzer`` and lemmatized with
    ``lemmatize_sent``; lemmas found in ``combined_stopwords`` and lemmas
    made up only of digits are dropped.
    """
    lemmas = lemmatize_sent(analyzer(doc))
    return (
        lemma for lemma in lemmas
        if not (lemma in combined_stopwords or lemma.isdigit())
    )
def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of ``penntag`` are looked up, so for
    example ``'NNS'`` maps like ``'NN'`` and ``'VBD'`` like ``'VB'``.

    Args:
        penntag: Penn Treebank tag string.

    Returns:
        ``'n'``, ``'a'``, ``'v'`` or ``'r'``. ``'n'`` (noun) is also the
        fallback for tags that are not in the table and for input that
        cannot be sliced or hashed (e.g. ``None``).
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag family not in the table.
        # TypeError: input that cannot be sliced or used as a dict key.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'n'
def lemmatize_sent(list_word):
    """Lemmatize a list of tokens.

    The tokens are POS-tagged with ``pos_tag``; every token is lower-cased
    and lemmatized by ``wnl`` using the WordNet POS that ``penn2morphy``
    derives from its tag.

    Returns:
        A list with one lemma per (token, tag) pair produced by ``pos_tag``.
    """
    lemmas = []
    for token, penn_tag in pos_tag(list_word):
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas

#plotting function
%matplotlib inline
def plot_roc(fpr, tpr):
    """Plot a ROC curve on a new figure.

    ``fpr`` and ``tpr`` are the false/true positive rates to draw; the area
    under the curve, computed with ``auc``, is shown in the legend.
    """
    font_size = 15

    figure, axes = plt.subplots()
    area = auc(fpr, tpr)

    axes.plot(fpr, tpr, lw=2, label='area under curve = %0.4f' % area)
    axes.grid(color='0.7', linestyle='--', linewidth=1)

    # leave a small margin around the unit square
    axes.set_xlim([-0.1, 1.1])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate', fontsize=font_size)
    axes.set_ylabel('True Positive Rate', fontsize=font_size)

    axes.legend(loc="lower right")

    tick_labels = axes.get_xticklabels() + axes.get_yticklabels()
    for tick_label in tick_labels:
        tick_label.set_fontsize(font_size)

#fit, predict, print the metrics and plot the ROC curve (used by most parts)
def fit_predict_and_plot_roc(pipe, train_data, train_label, test_data, test_label):
    """Fit ``pipe``, print its test-set metrics and plot its ROC curve.

    Prints the confusion matrix, accuracy, recall and precision of the
    predictions on ``test_data``, then draws the ROC curve from column 1 of
    ``pipe.predict_proba(test_data)``.

    Recall and precision are computed with the scorers' default settings.
    NOTE(review): presumably this is only meant for two-class labels --
    confirm against the callers.

    Returns:
        The fitted ``pipe``.
    """
    pipe.fit(train_data, train_label)
    predictions = pipe.predict(test_data)

    #confusion matrix
    matrix = confusion_matrix(test_label, predictions)
    print("confusion matrix:")
    print(matrix)

    # each heading is printed before its score is computed
    scorers = (
        ("accuracy:", accuracy_score),
        ("recall:", recall_score),
        ("precision:", precision_score),
    )
    for heading, scorer in scorers:
        print(heading)
        print(scorer(test_label, predictions))

    class_probabilities = pipe.predict_proba(test_data)
    fpr, tpr, _ = roc_curve(test_label, class_probabilities[:, 1])
    plot_roc(fpr, tpr)
    return pipe

#fit, predict and print the metrics for the multiclass part (no ROC plot)
def fit_predict_and_plot_roc_multi(pipe, train_data, train_label, test_data, test_label):
    """Fit ``pipe`` and print its multiclass test-set metrics.

    Prints the confusion matrix, accuracy, and micro-averaged recall and
    precision of the predictions on ``test_data``. Despite its name, this
    function does not plot a ROC curve.

    Args:
        pipe: estimator/pipeline exposing ``fit`` and ``predict``.
        train_data, train_label: training samples and their labels.
        test_data, test_label: evaluation samples and their labels.

    Returns:
        The fitted ``pipe``.
    """
    pipe.fit(train_data, train_label)
    predicted_l = pipe.predict(test_data)
    #confusion matrix
    confusion_l = confusion_matrix(test_label, predicted_l)
    print("confusion matrix:")
    print(confusion_l)
    print("accuracy:")
    acc = accuracy_score(test_label, predicted_l)
    print(acc)
    print("recall:")
    rec = recall_score(test_label, predicted_l, average = 'micro')
    print(rec)
    print("precision:")
    pre = precision_score(test_label, predicted_l, average = 'micro')
    print(pre)
    # The class probabilities were previously computed here and then
    # discarded; that unused predict_proba call has been removed.

    return pipe

#predict and report only accuracy for k-fold
def fit_predict_acc(pipe, train_data, train_label, test_data, test_label):
    """Fit ``pipe`` on the training split and return its test accuracy.

    Nothing is printed; the value of ``accuracy_score`` for the predictions
    on ``test_data`` is returned.
    """
    pipe.fit(train_data, train_label)
    return accuracy_score(test_label, pipe.predict(test_data))

class SparseToDenseArray(BaseEstimator, TransformerMixin):
    """Pipeline step that turns its input into a dense array when possible.

    Input exposing a ``toarray`` method is converted by calling it; any
    other input is passed through unchanged.
    """

    def __init__(self):
        # no parameters to store
        pass

    def fit(self, *_):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X, *_):
        """Return ``X.toarray()`` if ``X`` has that method, otherwise ``X``."""
        if not hasattr(X, 'toarray'):
            return X
        return X.toarray()


#usual count_vect, shared by the pipelines of the later parts
#(use CountVectorizer(min_df=min_dfs,analyzer=stem_rmv_punc) for the lemmatizing analyzer)
count_vect = CountVectorizer(min_df=min_dfs,stop_words = 'english')
#part A: number of training documents per class
if PART_A:
    print("--------PART_A BEGINs--------")
    #class count taken from the data set instead of a hard-coded 8
    n_classes = len(combine_train.target_names)
    #one counter per class label; tolist() keeps the printed form a plain list
    histy = np.bincount(combine_train.target, minlength=n_classes).tolist()
    print ("histogram data:")
    print (histy)
    x = list(range(n_classes))
    plt.bar(x, histy)
    plt.title('Histogram of the number of training docs per class')
    plt.show()
    for i, topic_name in enumerate(combine_train.target_names):
        print("Topic %d is %s" %(i, topic_name))

    print("--------PART_A ENDs--------")

#part B: TF-IDF matrix of the training set built with the custom analyzer
if PART_B:
    print("--------PART_B BEGINs--------")
    #stem_rmv_punc lemmatizes and drops stop words and all-digit tokens
    count_vect1 = CountVectorizer(min_df=min_dfs, analyzer=stem_rmv_punc)
    #created here: no tfidf_transformer is defined earlier in this cell, so
    #the fit_transform call below used to fail with a NameError
    tfidf_transformer = TfidfTransformer()
    count_train1_count = count_vect1.fit_transform(combine_train.data)
    count_trina1_tfidf = tfidf_transformer.fit_transform(count_train1_count)
    print("min_df is: %d"%min_dfs)
    print("shape of the final matrix:")
    print(count_trina1_tfidf.shape)
    print("--------PART_B ENDs--------")
    
#part C in another program

# Relabel the data for all the remaining parts except multiclass:
#   original labels 0-3 -> 0 ('Computer Technology')
#   original labels 4-7 -> 1 ('Recreational Activity')
# NOTE(review): like the index-based loops this replaces, this relies on the
# comp.* groups holding the lowest label indices -- confirm against the
# fetch_20newsgroups label ordering.
n_comp = len(comp_categories)
for dataset in (combine_train, combine_test):
    # In-place write keeps the existing label array; the boolean mask is
    # stored as 0/1. The same threshold is applied to train and test.
    dataset.target[:] = dataset.target >= n_comp
    dataset.target_names[:n_comp] = ['Computer Technology'] * n_comp
    dataset.target_names[n_comp:] = ['Recreational Activity'] * len(rec_categories)

#part E
#manually change which pipeline is passed to fit_predict_and_plot_roc
#(LSI or NMF reduction, hard or soft margin)
if PART_E :
    print("--------PART_E BEGINs--------")

    def _part_e_pipeline(reducer, margin_c):
        #count vectors -> tf-idf -> dimensionality reduction -> SVC with probability estimates
        return Pipeline([
            ('vect', count_vect),
            ('tfidf', TfidfTransformer()),
            ('reduce_dim', reducer),
            ('clf', SVC(C=margin_c, probability=True)),
        ])

    #Hard SVC LSI
    pipeline1 = _part_e_pipeline(TruncatedSVD(n_components=50, random_state=0), 1000)
    #Soft SVC LSI
    pipeline2 = _part_e_pipeline(TruncatedSVD(n_components=50, random_state=0), 0.001)
    #Hard SVC NMF
    pipeline3 = _part_e_pipeline(NMF(n_components=50, init='random', random_state=0), 1000)
    #Soft SVC NMF
    pipeline4 = _part_e_pipeline(NMF(n_components=50, init='random', random_state=0), 0.001)

    print("min_df is: %d"%min_dfs)
    fit_predict_and_plot_roc(pipeline4, combine_train.data, combine_train.target, combine_test.data, combine_test.target)
    print("--------PART_E ENDs--------")

    
#PART_F: 5-fold cross-validated accuracy of the SVC for C = 10^-3 ... 10^3
#manually change the pipeline for LSI and NMF
if PART_F :
    print("--------PART_F BEGINs--------")
    print("min_df is: %d"%min_dfs)
    kf = KFold(n_splits = 5)
    #divisor for the average comes from the splitter, not a second literal 5
    n_folds = kf.get_n_splits()
    #convert once, outside the loops, so the fold index arrays can select
    #documents and labels directly
    all_kf_data = np.array(combine_train.data)
    all_kf_label = np.array(combine_train.target)
    for i in range(-3, 4):
        newC = math.pow(10, i)
        #pipeline for LSI
        pipeline5 = Pipeline([
            ('vect', count_vect),
            ('tfidf', TfidfTransformer()),
            ('reduce_dim', TruncatedSVD(n_components=50, random_state=0)),
            ('toarr', SparseToDenseArray()),
            ('clf',SVC(C = newC, probability=True)),
            ])
        #pipeline for NMF
        pipeline6 = Pipeline([
            ('vect', count_vect),
            ('tfidf', TfidfTransformer()),
            ('reduce_dim', NMF(n_components=50, init='random', random_state=0)),
            ('toarr', SparseToDenseArray()),
            ('clf',SVC(C = newC, probability=True)),
            ])
        #running total kept under its own name so the builtin sum is not shadowed
        acc_total = 0
        for train, test in kf.split(all_kf_data):
            train_kf_data = all_kf_data[train]
            test_kf_data = all_kf_data[test]
            train_kf_label = all_kf_label[train]
            test_kf_label = all_kf_label[test]
            acc_total = acc_total+fit_predict_acc(pipeline5, train_kf_data, train_kf_label, test_kf_data, test_kf_label) #change the pipeline for different task
        average_acc = acc_total/n_folds
        print("average_acc for C = %f:"%newC)
        print(average_acc)
    print("--------PART_F ENDs--------")
    
#PART_G: multinomial naive Bayes on the NMF-reduced features
#manually change the class_prior
if PART_G :
    print("--------PART_G BEGINs--------")
    print("min_df is: %d"%min_dfs)
    nb_steps = [
        ('vect', count_vect),
        ('tfidf', TfidfTransformer()),
        ('reduce_dim', NMF(n_components=50, init='random', random_state=0)),
        ('toarr', SparseToDenseArray()),
        #change the threshold here
        ('clf', MultinomialNB(class_prior=[.4, .6])),
    ]
    pipeline7 = Pipeline(nb_steps)
    fit_predict_and_plot_roc(pipeline7, combine_train.data, combine_train.target, combine_test.data, combine_test.target)
    print("--------PART_G ENDs--------")
    
#PART_H: class-weighted logistic regression
#manually change the weight and the pipeline passed to fit_predict_and_plot_roc (LSI or NMF)
if PART_H :
    print("--------PART_H BEGINs--------")
    print("min_df is: %d"%min_dfs)

    def _part_h_pipeline(reducer):
        #count vectors -> tf-idf -> dimensionality reduction -> dense array -> logistic regression
        return Pipeline([
            ('vect', count_vect),
            ('tfidf', TfidfTransformer()),
            ('reduce_dim', reducer),
            ('toarr', SparseToDenseArray()),
            #change the weight here; for now it is 4 : 1
            ('clf', LogisticRegression(class_weight={0: 0.8, 1: 0.2})),
        ])

    # LSI logistic regression
    pipeline8 = _part_h_pipeline(TruncatedSVD(n_components=50, random_state=0))

    # NMF logistic regression
    pipeline9 = _part_h_pipeline(NMF(n_components=50, init='random', random_state=0))

    fit_predict_and_plot_roc(pipeline8, combine_train.data, combine_train.target, combine_test.data, combine_test.target)
    print("--------PART_H ENDs--------")
        
#PART_I: regularized logistic regression, test accuracy for several values of C
#manually change the pipeline and penalty for different task
if PART_I :
    print("--------PART_I BEGINs--------")
    for i in range(-2, 3, 2): #C takes the values 0.01, 1 and 100
        print("min_df is: %d"%min_dfs)
        newC = math.pow(10, i)
        #solver is pinned to liblinear because it accepts both the l1 and the
        #l2 penalty; scikit-learn releases whose default solver is lbfgs
        #reject penalty='l1'
        # LSI logistic regression
        pipeline10 = Pipeline([
        ('vect', count_vect),
        ('tfidf', TfidfTransformer()),
        ('reduce_dim',TruncatedSVD(n_components=50, random_state=0)),
        ('toarr', SparseToDenseArray()),
        ('clf',LogisticRegression(penalty = 'l1', C = newC, solver = 'liblinear')), #change to be l1 or l2
        ])
        # NMF logistic regression
        pipeline11 = Pipeline([
        ('vect', count_vect),
        ('tfidf', TfidfTransformer()),
        ('reduce_dim',NMF(n_components=50, init='random', random_state=0)),
        ('toarr', SparseToDenseArray()),
        ('clf',LogisticRegression(penalty = 'l1', C = newC, solver = 'liblinear')), #change to be l1 or l2
        ])
        acc = fit_predict_acc(pipeline10, combine_train.data, combine_train.target, combine_test.data, combine_test.target)
        print("accuracy for C = %f:"%newC)
        print(acc)
    print("--------PART_I ENDs--------")
        
#multiclass PART_J: four-category classification with naive Bayes and SVC
if PART_J :
    print("--------PART_J BEGINs--------")
    print("min_df is: %d"%min_dfs)
    multi_categories = [
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'misc.forsale',
        'soc.religion.christian',
    ]
    multi_train = fetch_20newsgroups(subset='train', categories=multi_categories, shuffle=True, random_state=42)
    multi_test = fetch_20newsgroups(subset='test', categories=multi_categories, shuffle=True, random_state=42)

    def _part_j_pipeline(reducer, classifier):
        #count vectors -> tf-idf -> dimensionality reduction -> dense array -> classifier
        return Pipeline([
            ('vect', count_vect),
            ('tfidf', TfidfTransformer()),
            ('reduce_dim', reducer),
            ('toarr', SparseToDenseArray()),
            ('clf', classifier),
        ])

    #multiclass NB pipeline
    pipeline12 = _part_j_pipeline(
        NMF(n_components=50, init='random', random_state=0),
        MultinomialNB(),
    )

    #multiclass SVC NMF one vs one pipeline
    pipeline13 = _part_j_pipeline(
        NMF(n_components=50, init='random', random_state=0),
        SVC(C=1000, probability=True),
    )

    #multiclass SVC LSI one vs one pipeline
    pipeline14 = _part_j_pipeline(
        TruncatedSVD(n_components=50, random_state=0),
        SVC(C=1000, probability=True),
    )

    #multiclass SVC NMF one vs rest pipeline
    pipeline15 = _part_j_pipeline(
        NMF(n_components=50, init='random', random_state=0),
        OneVsRestClassifier(SVC(C=1000, probability=True)),
    )

    #multiclass SVC LSI one vs rest pipeline
    pipeline16 = _part_j_pipeline(
        TruncatedSVD(n_components=50, random_state=0),
        OneVsRestClassifier(SVC(C=1000, probability=True)),
    )

    #each pipeline is announced, then fitted and evaluated, in this order
    multi_runs = (
        ("Multiclass MultiNomialNB result:", pipeline12),
        ("multiclass SVC NMF one vs one result:", pipeline13),
        ("multiclass SVC LSI one vs one result:", pipeline14),
        ("multiclass SVC NMF one vs rest result:", pipeline15),
        ("multiclass SVC LSI one vs rest result:", pipeline16),
    )
    for run_title, run_pipeline in multi_runs:
        print(run_title)
        fit_predict_and_plot_roc_multi(run_pipeline, multi_train.data, multi_train.target, multi_test.data, multi_test.target)

    print("--------PART_J ENDs--------")


--------PART_J BEGINs--------
min_df is: 2
Multiclass MultiNomialNB result:
confusion matrix:
[[319  19  48   6]
 [106 216  54   9]
 [ 37   7 340   6]
 [  3   0   5 390]]
accuracy:
0.808306709265
recall:
0.808306709265
precision:
0.808306709265
multiclass SVC NMF one vs one result:
confusion matrix:
[[327  45  20   0]
 [ 73 292  19   1]
 [ 36  16 337   1]
 [ 13   3   4 378]]
accuracy:
0.852396166134
recall:
0.852396166134
precision:
0.852396166134
multiclass SVC LSI one vs one result:
confusion matrix:
[[324  45  21   2]
 [ 40 322  22   1]
 [ 24  13 351   2]
 [  4   2   0 392]]
accuracy:
0.887539936102
recall:
0.887539936102
precision:
0.887539936102
multiclass SVC NMF one vs rest result:
confusion matrix:
[[312  45  31   4]
 [ 54 300  26   5]
 [ 20  12 354   4]
 [  2   2   2 392]]
accuracy:
0.867731629393
recall:
0.867731629393
precision:
0.867731629393
multiclass SVC LSI one vs rest result:
confusion matrix:
[[320  49  23   0]
 [ 33 323  29   0]
 [ 16  16 355   3]
 [  3   1   1 393]]