# TASK1: Naive Bayes on three categories, keeping headers, footers, quotes and stop words

In [18]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np


# Restrict the corpus to three newsgroups, making this a 3-class problem.
categories = ['rec.sport.baseball', 'sci.electronics', 'comp.graphics']

# Load the training split for those categories. No `remove` argument is
# passed, so headers, footers and quoted replies stay in the text (as the
# task title requires). shuffle=True with a fixed random_state gives the
# same document order on every run.
twentyTrain = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# A bare expression is only echoed when it is the last statement of a
# notebook cell, so print explicitly to actually see the class names.
print(twentyTrain.target_names)

# Bag-of-words term counts with the default settings. Pass `analyzer` and
# `ngram_range` here to switch to, e.g., character n-grams.
count_vect = CountVectorizer()
# Print explicitly: a bare `count_vect` expression in the middle of a cell
# displays nothing. get_params() lists every parameter, defaults included.
print(count_vect.get_params())

# Learn the vocabulary from the training documents and build the
# term-document matrix (one row per document, one column per term).
tdm = count_vect.fit_transform(twentyTrain.data)

# Re-weights raw counts; its parameters allow tf vs tf-idf and l1 vs l2
# normalisation.
transformer = TfidfTransformer()
print(transformer.get_params())

# Fit the idf weights on the training counts and transform them.
tdm_tfidf = transformer.fit_transform(tdm)
print(tdm_tfidf.shape)


# Train multinomial Naive Bayes on the tf-idf features and class labels.
clf = MultinomialNB().fit(tdm_tfidf, twentyTrain.target)

# Two hand-written documents used as a quick smoke test of the classifier.
# NOTE(review): neither sentence is obviously about any of the three
# selected categories, so the predicted labels are not meaningful in
# themselves - confirm whether more relevant examples were intended.
docs_test = ['I am sick', 'No more gun control']

# Transform the new documents exactly like the training data. Only
# transform() is called (never fit) because the vocabulary and idf weights
# must be the ones learned from the training set.
test_counts = count_vect.transform(docs_test)
test_tfidf = transformer.transform(test_counts)

predicted = clf.predict(test_tfidf)

# Show the result: `predicted` is reassigned later in the script, so
# without this the smoke-test predictions would never be seen.
for doc, label in zip(docs_test, predicted):
    print('%r => %s' % (doc, twentyTrain.target_names[label]))


from sklearn.pipeline import Pipeline

# Chain the same three stages used manually earlier in the script
# (term counts -> tf-idf weighting -> multinomial Naive Bayes) into a single
# estimator, so raw text can be passed straight to fit() and predict().
# fit() returns the pipeline itself, so the fitted object is bound directly.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
).fit(twentyTrain.data, twentyTrain.target)

# Load the held-out test split for the same three categories, with the same
# loading options as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# The pipeline vectorises and tf-idf-weights the raw text internally.
predicted = text_clf.predict(docs_test)

# Overall accuracy: the fraction of test documents whose predicted label
# equals the true label. It is printed explicitly because a bare expression
# followed by another statement is silently discarded.
accuracy = np.mean(predicted == twenty_test.target)
print('accuracy: %.3f' % accuracy)

# Per-class precision / recall / F1 and support.
print(metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
))

                    precision    recall  f1-score   support

     comp.graphics       0.94      0.87      0.91       389
rec.sport.baseball       0.94      0.99      0.97       397
   sci.electronics       0.91      0.93      0.92       393

       avg / total       0.93      0.93      0.93      1179

