In [1]:
#playing with The UCI ML News Aggregator Dataset http://archive.ics.uci.edu/ml/datasets/News+Aggregator

import numpy as np
import pandas as pd



In [21]:
#import data and add header row
news = pd.read_csv("NewsAggregatorDataset/newsCorpora.csv", sep="\t",
                  names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"])

In [22]:
#first six rows
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [23]:
#lowercase text and removing punctuation in title to have smaller size of model in future
import string

def normalize(s):
    s = s.lower()
    s = s.translate(str.maketrans('','', string.punctuation))
    return s

news['TITLE'] = [normalize(s) for s in news['TITLE']]
    

In [24]:
#first six rows
news.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,fed official says weak data caused by weather ...,http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,feds charles plosser sees high bar for change ...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,us open stocks fall after fed official hints a...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,fed risks falling behind the curve charles plo...,http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,feds plosser nasty weather has curbed job growth,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [25]:
#import vectorizers and transformer
#use CountVectorizer + TfidfTransformer | TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

###### USE WITHOUT PIPELINE ########
#vectorize text in title
#vectorizer = CountVectorizer(stop_words = 'english')
#x_data = vectorizer.fit_transform(news['TITLE'])

In [26]:
#split data into train and test subsets
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(x_data, news['CATEGORY'], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(news['TITLE'], news['CATEGORY'], random_state=42)

print(X_train.shape)
print(X_test.shape)
#(observations - labels(unique word), occurences - number of unique words in all corpus of headlines)

(316814,)
(105605,)


In [8]:
#building a classifier
from sklearn.naive_bayes import MultinomialNB

###### USE WITHOUT PIPELINE ########
#clf_mnb = MultinomialNB(alpha=.01).fit(X_train, y_train)

In [9]:
###### USE WITHOUT PIPELINE ########
#testing on test data
#predicted = clf_mnb.predict(X_test)
#np.mean(predicted == y_test)

0.9293025898394962

In [10]:
###### USE WITHOUT PIPELINE ########
#alternative to above & np
#clf_mnb.score(X_test, y_test)

0.9293025898394962

In [11]:
#predicted

array(['e', 'b', 'e', ..., 'e', 'e', 't'], dtype='<U1')

In [12]:
#to check all out metricts
from sklearn import metrics

#metrics.f1_score(y_test, predicted, average='macro')

0.925104016842546

In [13]:
from sklearn.metrics import classification_report
#print(classification_report(predicted, y_test))

             precision    recall  f1-score   support

          b       0.91      0.91      0.91     29028
          e       0.96      0.97      0.97     37799
          m       0.92      0.92      0.92     11365
          t       0.91      0.90      0.90     27413

avg / total       0.93      0.93      0.93    105605



In [48]:
#pipeline for naive bayse

from sklearn.pipeline import Pipeline
clf_mnb = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        #('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=.01)),
    ])


In [49]:
clf_mnb = clf_mnb.fit(X_train, y_train)
predicted = clf_mnb.predict(X_test)
np.mean(predicted == y_test)

0.9295109133090289

In [50]:
#tune params in GridSearchCV

from sklearn.model_selection import GridSearchCV

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              #'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf_mnb = GridSearchCV(clf_mnb, parameters, n_jobs=-1)
gs_clf_mnb = gs_clf_mnb.fit(X_train, y_train)

In [51]:
#check out best score with tuned params
gs_clf_mnb.best_score_

0.9490142481077225

In [52]:
#best params for clf
gs_clf_mnb.best_params_

{'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)}

In [53]:
#apply GridSearch params on test data
predicted = gs_clf_mnb.predict(X_test)
np.mean(predicted == y_test)

0.9521897637422471

In [54]:
from sklearn.metrics import classification_report
print(classification_report(predicted, y_test))

             precision    recall  f1-score   support

          b       0.93      0.94      0.93     28992
          e       0.98      0.98      0.98     38018
          m       0.95      0.95      0.95     11254
          t       0.94      0.93      0.94     27341

avg / total       0.95      0.95      0.95    105605

