In [1]:
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Toy records mixing a categorical field ('city') with a numeric one
# ('temperature'); used below to demonstrate DictVectorizer.
# NOTE(review): 'San Fransisco' is misspelled (San Francisco); it is kept as-is
# because the recorded feature names in the output depend on it.
measurements = [
    {'city': 'Dubai', 'temperature': 33.0},
    {'city': 'London', 'temperature': 12.0},
    {'city': 'San Fransisco', 'temperature': 18.0},
]

In [3]:
# DictVectorizer with default settings; it is fitted on `measurements` in the next cell.
vec = DictVectorizer()

In [6]:
# Fit the vectorizer on the records and show the dense feature matrix:
# one indicator column per distinct city plus the numeric temperature column.
print(vec.fit_transform(measurements).toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back on older releases; list() keeps
# the printed form identical to the old list-of-strings output.
if hasattr(vec, 'get_feature_names_out'):
    print(list(vec.get_feature_names_out()))
else:
    print(vec.get_feature_names())

[[ 1.  0.  0. 33.]
 [ 0.  1.  0. 12.]
 [ 0.  0.  1. 18.]]
['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']


In [7]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups corpus with subset='all'.
# NOTE(review): data_home is a hard-coded absolute Windows path, so this cell
# only runs unchanged on a machine that has a d: drive; consider a configurable
# relative data directory.
news = fetch_20newsgroups(data_home='d:/Data/Python2Kaggle/20news_home/', subset='all')

In [9]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 25% of the posts for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Count-based vectorizer with default settings (no stop-word filtering).
vec_count = CountVectorizer()

In [13]:
# Fit the vectorizer on the training documents only and vectorize them.
X_count_train = vec_count.fit_transform(X_train)

In [14]:
# Vectorize the test documents with the already-fitted vectorizer
# (transform, not fit_transform), so the test set is never fitted on.
X_count_test = vec_count.transform(X_test)

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Multinomial Naive Bayes classifier with default hyperparameters.
mnb_count = MultinomialNB()

In [18]:
# Train the classifier on the count features of the training split.
mnb_count.fit(X_count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Report the count-based model's score on the held-out test split.
print(
    'The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords):',
    mnb_count.score(X_count_test, y_test),
)

The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords): 0.8397707979626485


In [20]:
# Predictions for the test split; consumed by the classification report below.
y_count_pred = mnb_count.predict(X_count_test)

In [21]:
from sklearn.metrics import classification_report

In [22]:
# Per-class precision / recall / F1 / support, labelled with the newsgroup names.
print(classification_report(y_test, y_count_pred, target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
         

# TfidfVectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# TF-IDF vectorizer with default settings (no stop-word filtering).
vec_tfidf = TfidfVectorizer()

In [25]:
# Fit the TF-IDF vectorizer on the training documents only and vectorize them.
X_tfidf_train = vec_tfidf.fit_transform(X_train)

In [26]:
# Vectorize the test documents with the already-fitted TF-IDF vectorizer
# (transform, not fit_transform).
X_tfidf_test = vec_tfidf.transform(X_test)

In [28]:
# Separate Multinomial Naive Bayes instance (default hyperparameters) for the TF-IDF features.
mnb_tfidf = MultinomialNB()

In [29]:
# Train the classifier on the TF-IDF features of the training split.
mnb_tfidf.fit(X_tfidf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Report the TF-IDF-based model's score on the held-out test split.
print(
    'The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer without filtering stopwords):',
    mnb_tfidf.score(X_tfidf_test, y_test),
)

The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer without filtering stopwords): 0.8463497453310697


In [32]:
# Predictions for the test split; consumed by the classification report below.
y_tfidf_pred = mnb_tfidf.predict(X_tfidf_test)

In [33]:
# Per-class precision / recall / F1 / support for the TF-IDF model.
print(classification_report(y_test, y_tfidf_pred, target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.67      0.75       201
           comp.graphics       0.85      0.74      0.79       250
 comp.os.ms-windows.misc       0.82      0.85      0.83       248
comp.sys.ibm.pc.hardware       0.76      0.88      0.82       240
   comp.sys.mac.hardware       0.94      0.84      0.89       242
          comp.windows.x       0.96      0.84      0.89       263
            misc.forsale       0.93      0.69      0.79       257
               rec.autos       0.84      0.92      0.88       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.96      0.91      0.94       251
        rec.sport.hockey       0.88      0.99      0.93       233
               sci.crypt       0.73      0.98      0.83       238
         sci.electronics       0.91      0.83      0.87       249
                 sci.med       0.97      0.92      0.95       245
         

# CountVectorizer and TfidfVectorizer with stop-word filtering

In [34]:
# Same two vectorizers as above, but tokenizing on words and dropping
# English stop words via the built-in 'english' stop-word list.
vec_filter_count = CountVectorizer(analyzer='word', stop_words='english')
vec_filter_tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [35]:
# Fit each stop-word-filtering vectorizer on the training documents only
# and vectorize them.
X_filter_count_train = vec_filter_count.fit_transform(X_train)
X_filter_tfidf_train = vec_filter_tfidf.fit_transform(X_train)

In [36]:
# Vectorize the test documents with the already-fitted vectorizers
# (transform, not fit_transform).
X_filter_count_test = vec_filter_count.transform(X_test)
X_filter_tfidf_test = vec_filter_tfidf.transform(X_test)

In [50]:
# Naive Bayes on the stop-word-filtered count features: train, then report the
# score and the per-class metrics on the held-out test split.
# fit() returns the estimator itself, so construction and training can be chained.
mnb_filter_count = MultinomialNB().fit(X_filter_count_train, y_train)

print(
    'The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer with filtering stopwords):',
    mnb_filter_count.score(X_filter_count_test, y_test),
)

y_filter_count_pred = mnb_filter_count.predict(X_filter_count_test)
print(
    classification_report(
        y_test,
        y_filter_count_pred,
        target_names=news.target_names,
    )
)

The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer with filtering stopwords): 0.8637521222410866
                          precision    recall  f1-score   support

             alt.atheism       0.85      0.89      0.87       201
           comp.graphics       0.62      0.88      0.73       250
 comp.os.ms-windows.misc       0.93      0.22      0.36       248
comp.sys.ibm.pc.hardware       0.62      0.88      0.73       240
   comp.sys.mac.hardware       0.93      0.85      0.89       242
          comp.windows.x       0.82      0.85      0.84       263
            misc.forsale       0.90      0.79      0.84       257
               rec.autos       0.91      0.91      0.91       238
         rec.motorcycles       0.98      0.94      0.96       276
      rec.sport.baseball       0.98      0.92      0.95       251
        rec.sport.hockey       0.92      0.99      0.95       233
               sci.crypt       0.91      0.97      0.93       238
         sci.electr

In [52]:
# Naive Bayes on the stop-word-filtered TF-IDF features: train, then report the
# score and the per-class metrics on the held-out test split.
# fit() returns the estimator itself, so construction and training can be chained.
mnb_filter_tfidf = MultinomialNB().fit(X_filter_tfidf_train, y_train)

print(
    'The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer with filtering stopwords):',
    mnb_filter_tfidf.score(X_filter_tfidf_test, y_test),
)

y_filter_tfidf_pred = mnb_filter_tfidf.predict(X_filter_tfidf_test)
print(
    classification_report(
        y_test,
        y_filter_tfidf_pred,
        target_names=news.target_names,
    )
)

The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer with filtering stopwords): 0.8826400679117148
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.81      0.83       201
           comp.graphics       0.85      0.81      0.83       250
 comp.os.ms-windows.misc       0.84      0.87      0.86       248
comp.sys.ibm.pc.hardware       0.78      0.88      0.83       240
   comp.sys.mac.hardware       0.92      0.90      0.91       242
          comp.windows.x       0.95      0.88      0.91       263
            misc.forsale       0.90      0.80      0.85       257
               rec.autos       0.89      0.92      0.90       238
         rec.motorcycles       0.98      0.94      0.96       276
      rec.sport.baseball       0.97      0.93      0.95       251
        rec.sport.hockey       0.88      0.99      0.93       233
               sci.crypt       0.85      0.98      0.91       238
         sci.electr