# Naive Bayes

## Importing the libraries

In [30]:
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import GaussianNB

## Importing the dataset (predefined Training set and Test set subsets)

In [32]:
# Both subsets are fetched with the same options: no category filter, a seeded
# shuffle, and the 'headers', 'footers' and 'quotes' parts passed to `remove`.
fetch_options = dict(
    categories=None,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# Documents (.data) and their class labels (.target) for each subset.
X_train, y_train = data_train.data, data_train.target
X_test, y_test = data_test.data, data_test.target

## Vectorizing the text (bag-of-words token counts)

In [38]:
# Fit the vectorizer on the training documents only, then apply the same fitted
# vectorizer to the test documents so both matrices share the same columns.
#
# The raw text is read from data_train / data_test rather than from X_train /
# X_test: this cell rebinds X_train / X_test to count matrices, so reading the
# text back from those names would hand already-vectorized data to the
# vectorizer whenever the cell is executed a second time.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

In [43]:
# Inspect the vectorized training data. The printed output lists each stored
# entry as a (row, column) index pair followed by its value.
print(X_train)

  (0, 95844)	4
  (0, 97181)	1
  (0, 48754)	2
  (0, 18915)	2
  (0, 68847)	1
  (0, 88638)	1
  (0, 30074)	1
  (0, 37335)	1
  (0, 60560)	1
  (0, 68080)	2
  (0, 88767)	4
  (0, 25775)	4
  (0, 80623)	1
  (0, 88532)	6
  (0, 68781)	1
  (0, 31990)	1
  (0, 51326)	2
  (0, 34809)	1
  (0, 84538)	1
  (0, 57390)	1
  (0, 89360)	1
  (0, 21987)	1
  (0, 41715)	2
  (0, 55746)	1
  (0, 9843)	1
  :	:
  (11313, 40387)	1
  (11313, 81792)	1
  (11313, 81742)	1
  (11313, 96497)	1
  (11313, 89804)	1
  (11313, 23302)	1
  (11313, 82660)	1
  (11313, 85524)	1
  (11313, 70066)	1
  (11313, 21258)	1
  (11313, 62086)	1
  (11313, 71992)	1
  (11313, 87730)	1
  (11313, 84605)	1
  (11313, 61975)	1
  (11313, 26205)	1
  (11313, 71786)	1
  (11313, 78365)	1
  (11313, 89465)	1
  (11313, 56719)	1
  (11313, 54033)	1
  (11313, 26208)	1
  (11313, 52230)	1
  (11313, 4486)	1
  (11313, 96707)	1


## Training the Naive Bayes model on the Training set

In [44]:
# Fit a Gaussian naive Bayes model on a dense copy of the training matrix.
# NOTE(review): .toarray() materialises every (document, term) cell, which is
# memory-hungry for a vocabulary this large; a model that works directly on
# sparse count features (e.g. MultinomialNB) may be a better fit — confirm.
classifier = GaussianNB().fit(X_train.toarray(), y_train)
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [45]:
# Predict a class for every test document. The test matrix is converted to a
# dense array here, mirroring the dense array the classifier was fitted on.
y_pred = classifier.predict(X_test.toarray())

## Making the Confusion Matrix

In [46]:
# Cross-tabulate the true test labels (first argument) against the predictions
# (second argument). Following scikit-learn's documented convention, entry
# (i, j) counts the test documents of true class i that were predicted as class j.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[120   2   0   5  18   2   2   5   4   4   4   4   2   2   9  62   8  16
   19  31]
 [  1 222   4  23  35  33   4   1   4   2   0  22   9   9  15   2   1   1
    1   0]
 [  3  63  27  95  64  52  15   1   1   4   2  16   9  16  14   5   1   0
    5   1]
 [  0  24   7 217  75  11  14   1   5   1   2   9  19   4   2   0   0   0
    1   0]
 [  0  25   2  37 230   7  15   7   3   6   3   7  20   7  10   0   3   0
    1   2]
 [  0  58   3  12  25 254   2   1   4   6   0  11   2   8   7   0   1   1
    0   0]
 [  1  15   0  39  52   8 199  15  12   5   4   5   9   9  10   0   1   4
    2   0]
 [  2   3   0   1  38   0  11 243  40   8   0   6  14   2  10   6   3   1
    8   0]
 [  8   1   0   1  27   1  10  55 232   8   4   4  10   2   4   1  10   6
    7   7]
 [  5   1   0   0  24   1   2   1   5 297  16   2   1   5   6   3   9   2
   15   2]
 [  5   2   0   1  15   0   5   5   2  14 327   2   2   2   2   1   3   6
    3   2]
 [ 11   4   0   5  29   3   2   5  14   4   4 248  11   5  11   3

## Visualising the results

In [None]:
# Draw the confusion matrix that was already computed in `cm` instead of asking
# the classifier to predict again on another dense copy of the test matrix
# (which is what plot_confusion_matrix(classifier, X_test.toarray(), ...) did).
# ConfusionMatrixDisplay is the display class behind that deprecated helper.
#
# The tick labels come from the dataset itself (data_test.target_names) rather
# than from a hand-maintained list, so they cannot drift out of sync with the
# label indices used in y_test / y_pred.
fig, ax = plt.subplots(figsize=(12, 12))
display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                 display_labels=data_test.target_names)
# Vertical x tick labels keep the 20 long newsgroup names from overlapping.
display.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation='vertical')
ax.set_title('Naive Bayes on 20 Newsgroups: confusion matrix (test set)')
plt.show()