<a href="https://colab.research.google.com/github/yashwanth-kokkanti/machinelearning_practise/blob/master/bow_sklearnPractise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This notebook demonstrates scikit-learn's CountVectorizer (bag of words) and tf-idf features for text classification.

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups so the demo stays small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only; shuffling with a fixed seed keeps the document order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# Newsgroup names; the integer values in twenty_train.target index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# Number of files in the training split.
len(twenty_train.filenames)

2257

In [5]:
##twenty_train

In [6]:
## The text data is large, so instead of dumping it, look up the newsgroup of the first document.

first_target = twenty_train.target[0]
print(twenty_train.target_names[first_target])

comp.graphics


In [7]:
## Bag of words with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training posts and build the
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)

# Shape of the resulting count matrix.
x_train_counts.shape

(2257, 35788)

In [8]:
## Term frequencies (tf) only

from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off the inverse-document-frequency weighting,
# so the counts are only rescaled per document.
tf_transformer = TfidfTransformer(use_idf=False)
x_train_tf = tf_transformer.fit_transform(x_train_counts)

x_train_tf.shape


(2257, 35788)

In [9]:
## Term frequency times inverse document frequency (tf-idf)

# use_idf=True is the default; it is spelled out here to contrast with
# the tf-only transformer built with use_idf=False.
tfidf_transformer = TfidfTransformer(use_idf=True)
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

x_train_tfidf.shape


(2257, 35788)

In [10]:
## Train a multinomial Naive Bayes classifier on the tf-idf features

from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so the fitted model can be bound directly.
clf = MultinomialNB().fit(X=x_train_tfidf, y=twenty_train.target)

In [11]:
# Unseen snippets to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'my name is Yashwanth',
    'Good Morning',
]

# Reuse the already-fitted vectorizer and tf-idf transformer: transform only,
# no refitting, so the features line up with what the classifier was trained on.
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)
predicted = clf.predict(x_new_tfidf)



In [12]:
# Predicted class indices for docs_new; each value is a position in twenty_train.target_names.
predicted

array([3, 1, 3, 3])

In [13]:
# Pair each new document with the name of its predicted newsgroup.
for doc, category in zip(docs_new, predicted):
    newsgroup = twenty_train.target_names[category]
    print('%r => %s' % (doc, newsgroup))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'my name is Yashwanth' => soc.religion.christian
'Good Morning' => soc.religion.christian


In [14]:
## Build a pipeline and evaluate its performance

import numpy as np
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so raw text can be passed straight in.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Test split, restricted to the same four categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label matches the true one.
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [15]:
## With Multinomial Naive Bayes we achieved about 83.5 % accuracy.

In [16]:
## Let's try a linear SVM.

from sklearn.linear_model import SGDClassifier

# Same vectorizer and tf-idf steps; only the final estimator changes.
# SGDClassifier with hinge loss is a linear SVM trained by stochastic gradient descent.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)

# Accuracy on the test split.
np.mean(predicted == twenty_test.target)

0.9101198402130493

In [17]:
## With the linear SVM (SGDClassifier with hinge loss), accuracy is around 91 %.

In [18]:
## Logistic regression on the same features

from sklearn.linear_model import LogisticRegression

# Same vectorizer and tf-idf steps; only the final estimator changes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(penalty='l2', random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)

# Accuracy on the test split.
np.mean(predicted == twenty_test.target)

0.8974700399467377

In [19]:
## With Logistic Regression accuracy is 89.7 %, slightly below the SVM (91.0 %).

In [20]:
## Decision tree on the same features

from sklearn.tree import DecisionTreeClassifier

# Same vectorizer and tf-idf steps; the tree uses default hyperparameters with a fixed seed.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)

# Accuracy on the test split.
np.mean(predicted == twenty_test.target)

0.7210386151797603

In [21]:
## This suggests that decision trees (at least with default settings) perform poorly at text classification.

In [22]:
from sklearn import metrics

# `predicted` holds the output of the most recently fitted pipeline (the
# decision tree), so this report describes that model only.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.66      0.65      0.66       319
         comp.graphics       0.72      0.80      0.76       389
               sci.med       0.68      0.59      0.63       396
soc.religion.christian       0.81      0.83      0.82       398

              accuracy                           0.72      1502
             macro avg       0.72      0.72      0.72      1502
          weighted avg       0.72      0.72      0.72      1502



In [23]:
# Confusion matrix for the same `predicted` array (decision-tree pipeline).
# Rows are true classes and columns are predicted classes, in target_names order.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[206,  22,  46,  45],
       [ 21, 311,  47,  10],
       [ 58,  78, 235,  25],
       [ 25,  22,  20, 331]])