In [73]:
# playing with The UCI ML News Aggregator Dataset http://archive.ics.uci.edu/ml/datasets/News+Aggregator
import numpy as np
import pandas as pd

In [74]:
#import data and add header row
news = pd.read_csv("NewsAggregatorDataset/newsCorpora.csv", sep="\t",
                  names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"])

In [75]:
#first six rows
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [76]:
#lowercase text and removing punctuation in title to have smaller size of model in future
import string

def normalize(s):
    s = s.lower()
    s = s.translate(str.maketrans('','', string.punctuation))
    return s

news['TITLE'] = [normalize(s) for s in news['TITLE']]
    

In [77]:
#first six rows
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,fed official says weak data caused by weather ...,http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,feds charles plosser sees high bar for change ...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,us open stocks fall after fed official hints a...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,fed risks falling behind the curve charles plo...,http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,feds plosser nasty weather has curbed job growth,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [78]:
#import vectorizers and transformer
#use CountVectorizer + TfidfTransformer | TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

###### USE WITHOUT PIPELINE ########
#vectorize text in title
#vectorizer = CountVectorizer(stop_words = 'english')
#x_data = vectorizer.fit_transform(news['TITLE'])

In [79]:
# taken from Ritchie Ng post "Vectorization, Multinomial Naive Bayes Classifier and Evaluation"
# http://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/

# Why are we splitting into training and testing sets before vectorizing?

# If we vectorize then we train/test split, our document-term matrix would contain every single feature (word) 
# in the test and training sets
# What we want is to simulate the real world
# We would always see words we have not seen before
# so this method is not realistic and we cannot properly evaluate our models
# Split then vectorize (correct way)

# We do the train/test split before the CountVectorizer to properly simulate the real world
# where our future data contains words we have not seen before
# After you train your data and chose the best model,
# you would then train on all of your data before predicting actual future data to maximize learning.

In [80]:
#split data into train and test subsets
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(x_data, news['CATEGORY'], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(news['TITLE'], news['CATEGORY'], random_state=42)

print(X_train.shape)
print(X_test.shape)
#(observations - labels(unique word), occurences - number of unique words in all corpus of headlines)

(316814,)
(105605,)


In [81]:
#building a classifier
from sklearn.naive_bayes import MultinomialNB

###### USE WITHOUT PIPELINE ########
#clf_mnb = MultinomialNB(alpha=.01).fit(X_train, y_train)

In [9]:
###### USE WITHOUT PIPELINE ########
#testing on test data
#predicted = clf_mnb.predict(X_test)
#np.mean(predicted == y_test)

0.9293025898394962

In [10]:
###### USE WITHOUT PIPELINE ########
#alternative to above & np
#clf_mnb.score(X_test, y_test)

0.9293025898394962

In [11]:
#predicted

array(['e', 'b', 'e', ..., 'e', 'e', 't'], dtype='<U1')

In [82]:
#to check all out metricts
from sklearn import metrics

#metrics.f1_score(y_test, predicted, average='macro')

In [13]:
from sklearn.metrics import classification_report
#print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          b       0.91      0.91      0.91     29028
          e       0.96      0.97      0.97     37799
          m       0.92      0.92      0.92     11365
          t       0.91      0.90      0.90     27413

avg / total       0.93      0.93      0.93    105605



In [83]:
#pipeline for Naive Bayse classifier

from sklearn.pipeline import Pipeline
clf_mnb = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        #('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=.01)),
    ])


In [84]:
#.fit(train) - learns the vocabulary of the training data
clf_mnb = clf_mnb.fit(X_train, y_train)

predicted = clf_mnb.predict(X_test)
np.mean(predicted == y_test)

0.9295109133090289

In [85]:
#tune params in GridSearchCV

from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf_mnb = GridSearchCV(clf_mnb, parameters, n_jobs=-1)
gs_clf_mnb = gs_clf_mnb.fit(X_train, y_train)

In [86]:
#check out best score with tuned params
gs_clf_mnb.best_score_

0.9490142481077225

In [87]:
#best params for clf
gs_clf_mnb.best_params_

{'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)}

In [88]:
#apply GridSearch params on test data
predicted = gs_clf_mnb.predict(X_test)
np.mean(predicted == y_test)

0.9521897637422471

In [89]:
from sklearn.metrics import classification_report

#classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2)
#y_true = Ground truth (correct) target values.
#y_pred = Estimated targets as returned by a classifier.
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          b       0.94      0.93      0.93     29117
          e       0.98      0.98      0.98     37988
          m       0.95      0.95      0.95     11319
          t       0.93      0.94      0.94     27181

avg / total       0.95      0.95      0.95    105605



In [90]:
#confusion matrix for gs_clf_mnb
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predicted))

[[27155   345   298  1319]
 [  333 37157   129   369]
 [  279   207 10712   121]
 [ 1225   309   115 25532]]


In [91]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy score: ", accuracy_score(y_test, predicted))
print("Recall score: ", recall_score(y_test, predicted, average = 'weighted'))
print("Precision score: ", precision_score(y_test, predicted, average = 'weighted'))
print("F1 score: ", f1_score(y_test, predicted, average = 'weighted'))

Accuracy score:  0.9521897637422471
Recall score:  0.9521897637422471
Precision score:  0.9521917994708027
F1 score:  0.9521866148926127


In [108]:
#check false negatives
#false_n = X_test[predicted < y_test]
#false_n

In [118]:
# Pipeline for SVM Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
clf_svm = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('clf', SGDClassifier(loss='hinge', alpha=1e-3, max_iter=5, random_state=42)),
    ])

In [119]:
clf_svm = clf_svm.fit(X_train, y_train)
predicted = clf_svm.predict(X_test)
np.mean(predicted == y_test)

0.8933762605937219

In [130]:
# GridSearch for SVM

parameters = {'vect__ngram_range': [(1, 3)],
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(clf_svm, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)
gs_clf_svm.best_score_

0.9111308212389604

In [131]:
gs_clf_svm.best_params_

{'clf__alpha': 0.001, 'vect__ngram_range': (1, 3)}

In [132]:
# apply GridSearch params svm on test data
predicted = gs_clf_svm.predict(X_test)
np.mean(predicted == y_test)

0.9107996780455471

In [133]:
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          b       0.90      0.89      0.89     29117
          e       0.91      0.98      0.94     37988
          m       0.94      0.81      0.87     11319
          t       0.91      0.88      0.90     27181

avg / total       0.91      0.91      0.91    105605



In [134]:
# confusion matrix for gs_clf_svm
print(confusion_matrix(y_test, predicted))

[[26014  1114   259  1730]
 [  516 37127   102   243]
 [  784  1092  9191   252]
 [ 1747  1373   208 23853]]
