In [65]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature selection classes
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# metrics
from sklearn import metrics

def showMetrics(targets, predictions, names):
    """Print a per-class report followed by overall summary scores.

    targets     -- reference labels the predictions are scored against
    predictions -- predicted labels, in the same order as ``targets``
    names       -- class names forwarded as ``target_names`` to the report
    """
    report = metrics.classification_report(targets, predictions, target_names=names)
    print(report)

    accuracy = metrics.accuracy_score(targets, predictions)
    print("Accuracy = %6.4f " % accuracy)

    # (output template, scoring function, averaging mode), printed in order.
    averaged_scores = (
        ("Avg recall, micro = %6.4f", metrics.recall_score, 'micro'),
        ("Avg recall, macro = %6.4f", metrics.recall_score, 'macro'),
        ("Avg precision, macro=%6.4f", metrics.precision_score, 'macro'),
    )
    for template, scorer, averaging in averaged_scores:
        print(template % scorer(targets, predictions, average=averaging))

# Newsgroups used for the two-class experiment.
categories = ['talk.religion.misc', 'soc.religion.christian']

# Fetch the train split first, then the test split, with identical options:
# headers, footers and quoted replies are stripped, and the documents are
# shuffled with a fixed seed so runs are repeatable.
train, test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=42,
    )
    for split in ('train', 'test')
)

def classifier(train, test, use_stop_words, *, k=100, alpha=0.01):
    """Fit a TF-IDF + chi-squared + Multinomial Naive Bayes model and predict.

    The vectorizer, the feature selector and the model are fitted on
    ``train`` only; ``test`` is merely transformed and then classified.

    Parameters
    ----------
    train : object exposing ``data`` (documents) and ``target`` (labels)
        Data used to fit the whole pipeline.
    test : object exposing ``data``
        Documents to classify (may be the same object as ``train``).
    use_stop_words : bool
        True keeps stop words: a default ``TfidfVectorizer`` is used.
        False removes English stop words. Note that this branch ALSO prunes
        the vocabulary with ``max_df=0.95`` and ``min_df=3``, so the two
        settings differ by more than the stop-word list alone.
    k : int, keyword-only, default 100
        Number of top chi-squared features to keep. It is clamped to the
        vocabulary size so that a small corpus does not break selection.
    alpha : float, keyword-only, default 0.01
        Additive smoothing parameter passed to ``MultinomialNB``.

    Returns
    -------
    The labels predicted for ``test.data``.
    """
    if use_stop_words:
        vectorizer = TfidfVectorizer()
    else:
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=3, analyzer='word', stop_words="english")

    # Learn the vocabulary and IDF weights from the training documents only.
    train_tfidf = vectorizer.fit_transform(train.data)

    # Never request more features than the vocabulary holds; depending on the
    # scikit-learn version an oversized k raises an error or emits a warning.
    n_selected = min(k, train_tfidf.shape[1])
    selector = SelectKBest(chi2, k=n_selected)
    train_selected = selector.fit_transform(train_tfidf, train.target)

    # Named ``model`` so the local does not shadow this function's own name.
    model = MultinomialNB(alpha=alpha)
    model.fit(train_selected, train.target)

    # Project the evaluation documents into the same reduced feature space.
    test_selected = selector.transform(vectorizer.transform(test.data))
    return model.predict(test_selected)


In [66]:
# Experiment 1: score the model on the test split, then on the very data it
# was trained on, to compare the two sets of metrics.
print(
    "-------------------------------",
    "Classifying over TEST dataset.",
    "------------------------------",
    sep="\n",
)

predicted = classifier(train, test, False)
showMetrics(test.target, predicted, test.target_names)

print("\n")
print(
    "-------------------------------",
    "Classifying over TRAIN dataset.",
    "-------------------------------",
    sep="\n",
)

# Passing ``train`` as both arguments evaluates on the training documents.
predicted = classifier(train, train, False)
showMetrics(train.target, predicted, train.target_names)

# ------------------------------------------------------------------
# Switch to a three-class problem: rebind categories, train and test.
# ------------------------------------------------------------------
categories = ['rec.sport.baseball', 'sci.electronics', 'comp.graphics']

# Train split is fetched first, then the test split, both with the same
# clean-up options and the same shuffle seed.
train, test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=42,
    )
    for split in ('train', 'test')
)

# Experiment 2: same train/test data, evaluated once with stop words kept
# (flag True) and once with English stop words removed (flag False).
print("\n")
print(
    "-----------------------------",
    "Classifying using STOP WORDS.",
    "-----------------------------",
    sep="\n",
)
print("\n")

predicted = classifier(train, test, True)
showMetrics(test.target, predicted, test.target_names)

print("\n")
print(
    "-------------------------------",
    "Classifying WITHOUT STOP WORDS ",
    "-------------------------------",
    sep="\n",
)

predicted = classifier(train, test, False)
showMetrics(test.target, predicted, test.target_names)



-------------------------------
Classifying over TEST dataset.
------------------------------
                        precision    recall  f1-score   support

soc.religion.christian       0.69      0.96      0.80       398
    talk.religion.misc       0.84      0.31      0.45       251

           avg / total       0.75      0.71      0.67       649

Accuracy = 0.7088 
Avg recall, micro = 0.7088
Avg recall, macro = 0.6345
Avg precision, macro=0.7623


-------------------------------
Classifying over TRAIN dataset.
-------------------------------
                        precision    recall  f1-score   support

soc.religion.christian       0.77      1.00      0.87       599
    talk.religion.misc       1.00      0.53      0.69       377

           avg / total       0.86      0.82      0.80       976

Accuracy = 0.8176 
Avg recall, micro = 0.8176
Avg recall, macro = 0.7639
Avg precision, macro=0.8855


-----------------------------
Classifying using STOP WORDS.
--------------------------

### Answers

TASK1.

The sample is unbalanced. There are more documents in the "soc.religion.christian" category (599 train / 398 test) than in "talk.religion.misc" (377 train / 251 test).

The precision (and overall performance) is better when we classify over the TRAIN dataset. That makes sense, because we have trained the classifier on this dataset.

A mix of precision, accuracy and recall should be considered to assess the algorithm's performance.

TASK2.

Classifying without stop words gives better overall performance, according to precision. Specifically, it improves the precision for sci.electronics.
