In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

### Source URL to the original dataset

http://qwone.com/~jason/20Newsgroups/

In [30]:
newsgroups_df = pd.read_csv("datasets/newsgroups_sampled.csv")

The unique target values are

- 'alt.atheism',
- 'comp.graphics',
- 'comp.os.ms-windows.misc',
- 'comp.sys.ibm.pc.hardware',
- 'comp.sys.mac.hardware',
- 'comp.windows.x',
- 'misc.forsale',
- 'rec.autos',
- 'rec.motorcycles',
- 'rec.sport.baseball',
- 'rec.sport.hockey',
- 'sci.crypt',
- 'sci.electronics',
- 'sci.med',
- 'sci.space',
- 'soc.religion.christian',
- 'talk.politics.guns',
- 'talk.politics.mideast',
- 'talk.politics.misc',
- 'talk.religion.misc'

In [31]:
newsgroups_df.shape

(3000, 2)

In [32]:
sorted(newsgroups_df["target"].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [33]:
x = newsgroups_df["text"]

y = newsgroups_df["target"]

In [34]:
x.head()

0    from: hades@coos.dartmouth.edu (brian v. hughe...
1    from: steveth@netcom.com (steve thomas)\nsubje...
2    from: lmp8913@rigel.tamu.edu (preston, lisa m)...
3    from: rene@hardy.u.washington.edu (rene magrit...
4    from: wtm@uhura.neoucom.edu (bill mayhew)\nsub...
Name: text, dtype: object

In [35]:
y.head()

0     4
1    18
2     1
3     2
4    12
Name: target, dtype: int64

### Evaluating a classification model

Accuracy: Proportion of correctly predicted records

In [36]:
def classification_details(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)

    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)


### Build and train a model using word frequencies

In [37]:
count_vectorizer = CountVectorizer()

feature_vector = count_vectorizer.fit_transform(x)

feature_vector.shape

(3000, 57739)

In [17]:
print(feature_vector[0])

  (0, 24888)	1
  (0, 26908)	3
  (0, 17816)	1
  (0, 18977)	3
  (0, 21688)	2
  (0, 13984)	1
  (0, 28387)	1
  (0, 49768)	1
  (0, 43879)	1
  (0, 32799)	4
  (0, 28918)	3
  (0, 38592)	4
  (0, 15221)	1
  (0, 44648)	1
  (0, 51770)	3
  (0, 39513)	1
  (0, 16861)	1
  (0, 27093)	1
  (0, 38000)	1
  (0, 20309)	1
  (0, 40877)	1
  (0, 43940)	1
  (0, 20801)	2
  (0, 15308)	1
  (0, 55705)	1
  :	:
  (0, 15375)	1
  (0, 51296)	1
  (0, 20737)	1
  (0, 51418)	1
  (0, 35234)	1
  (0, 29255)	1
  (0, 39052)	2
  (0, 15376)	1
  (0, 13545)	1
  (0, 57150)	1
  (0, 51266)	1
  (0, 22940)	1
  (0, 55619)	1
  (0, 30358)	1
  (0, 35240)	1
  (0, 51272)	1
  (0, 23569)	1
  (0, 34339)	1
  (0, 15145)	1
  (0, 39308)	1
  (0, 53787)	2
  (0, 15307)	1
  (0, 11168)	1
  (0, 38413)	1
  (0, 9226)	1


In [19]:
x_dense = feature_vector.todense()

x_dense.shape

(3000, 57739)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x_dense, y, test_size = 0.25)

In [21]:
x_train.shape, x_test.shape

((2250, 57739), (750, 57739))

In [22]:
y_train.shape, y_test.shape

((2250,), (750,))

In [24]:
clf = GaussianNB().fit(x_train, y_train)

In [38]:
y_pred = clf.predict(x_test)

y_pred[0:10]

array([10,  4,  7,  9, 14,  7, 16, 19,  3,  2])

In [39]:
classification_details(y_test.values, y_pred)

Length of testing data:  750
accuracy_count :  538
accuracy_score :  0.7173333333333334


### Build and train a model using TF-IDF scores

In [46]:
tfidf_vectorizer = TfidfVectorizer()

feature_vector = tfidf_vectorizer.fit_transform(x)

feature_vector.shape

(3000, 57739)

In [47]:
x_dense = feature_vector.todense()

x_dense.shape

(3000, 57739)

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x_dense, y, test_size = 0.25)

In [49]:
clf = GaussianNB().fit(x_train, y_train)

In [50]:
y_pred = clf.predict(x_test)

y_pred[0:10]

array([12,  2,  7,  9, 13, 16, 10, 13,  2, 12])

In [51]:
classification_details(y_test.values, y_pred)

Length of testing data:  750
accuracy_count :  541
accuracy_score :  0.7213333333333334


### Build and train a model using n-gram frequencies

In [52]:
ngram_vectorizer = CountVectorizer(ngram_range = (2, 2))

feature_vector = ngram_vectorizer.fit_transform(x)

feature_vector.shape

(3000, 400938)

In [53]:
x_dense = feature_vector.todense()

x_dense.shape

(3000, 400938)

In [54]:
x_train, x_test, y_train, y_test = train_test_split(x_dense, y, test_size = 0.25)

In [55]:
clf = GaussianNB().fit(x_train, y_train)

In [56]:
y_pred = clf.predict(x_test)

y_pred[0:10]

array([16,  8,  4,  7,  3, 16,  0,  6,  6, 12])

In [57]:
classification_details(y_test.values, y_pred)

Length of testing data:  750
accuracy_count :  561
accuracy_score :  0.748
