# Bag of Words NLP Approach

In [35]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

## Import 20 Newsgroups dataset from scikit-learn
Focusing on 4 of the news categories:  
- Gun Politics
- Christian
- Graphics
- Science Medical

In [36]:
# Restrict the corpus to four newsgroups; the fixed random_state keeps the
# shuffled order the same on every run.
categories = [
    'talk.politics.guns',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [37]:
# Sanity-check the load: the news categories in use, then the number of
# documents and the number of source files (one count per line).
print(twenty_train.target_names)
for n_items in (len(twenty_train.data), len(twenty_train.filenames)):
    print(n_items)

['comp.graphics', 'sci.med', 'soc.religion.christian', 'talk.politics.guns']
2323
2323


#### Sample Input from Imported Data

In [38]:
# Show one complete raw post (headers and body) exactly as stored in the dataset.
# Splitting on "\n" and re-joining with "\n" reproduces the same string, so the
# post is printed directly. To preview only the first few lines instead, split
# on "\n", slice the resulting list, and join it back together.
print(twenty_train.data[2])

From: Petch@gvg47.gvg.tek.com (Chuck Petch)
Subject: Daily Verse
Organization: Grass Valley Group, Grass Valley, CA
Lines: 4

For whoever does the will of my Father in heaven is my brother and sister
and mother." 

Matthew 12:50



#### Sample Label from Imported Data

In [39]:
# Each entry of `target` is an integer index into `target_names`; look up the
# readable category name for the sample post shown above.
sample_label_index = twenty_train.target[2]
print(twenty_train.target_names[sample_label_index])

soc.religion.christian


## Build tf_idf Document Term Matrix
CountVectorizer: counts how often each vocabulary term occurs in each document; strips accents and other non-ASCII characters, lowercases all characters, removes common English stop words, and includes n-gram features of up to 3 words  
TfidfTransformer: transforms the count matrix into a normalized TF-IDF representation  

In [40]:
# Step 1: learn the vocabulary from the training posts and build the raw
# term-count matrix.
count_vect = CountVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
)
x_train_counts = count_vect.fit_transform(twenty_train.data)

# Step 2: fit the TF-IDF weighting on those counts and apply it.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

#### Shape of Document Term Matrix
Try changing the ngram_range and see how the number of columns in the document term matrix changes.

In [41]:
# One row per training document, one column per vocabulary term (n-gram).
n_documents, n_terms = x_train_tfidf.shape
print((n_documents, n_terms))

(2323, 580346)


## Fit Logistic Regression Model to predict news category

In [42]:
# fit() returns the fitted estimator itself, so construction and training are chained.
text_clf = LogisticRegression().fit(x_train_tfidf, twenty_train.target)
text_clf  # echo the fitted estimator so its hyper-parameters are displayed

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#### Test our logistic regression model on sample texts
Try putting some other string texts into "docs_new" and see which category our model predicts.

In [43]:
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Gun violence is a real problem',
    'Doctors cure critical brain tumor in patient',
    'Space Radiation Doesnt Seem to Be Causing Astronauts to Die from Cancer, Study Finds',
]

# The vectorizer and the TF-IDF transformer are already fitted to the training
# data, so new text only goes through transform().
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = text_clf.predict(X_new_tfidf)

# Map each predicted integer label to its category name before printing.
predicted_names = [twenty_train.target_names[label] for label in predicted]
for doc, category_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Gun violence is a real problem' => talk.politics.guns
'Doctors cure critical brain tumor in patient' => sci.med
'Space Radiation Doesnt Seem to Be Causing Astronauts to Die from Cancer, Study Finds' => sci.med


#### Evaluate our logistic regression model on test data

In [44]:
# Load the held-out test split for the same four categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Reuse the vectorizer and transformer fitted on the training data (transform only).
test_counts = count_vect.transform(docs_test)
test_tfidf = tfidf_transformer.transform(test_counts)
predicted = text_clf.predict(test_tfidf)

# Accuracy: the fraction of test posts whose predicted label equals the true label.
is_correct = predicted == twenty_test.target
is_correct.mean()

0.936005171299289

## Fit Gradient Boosted Decision Tree to predict news category

In [11]:
# TODO (exercise): instead of using Logistic Regression, train a "GradientBoostingClassifier"
# with parameter n_estimators=10 and assign it to "text_clf".
# Look up sklearn.ensemble.GradientBoostingClassifier (already imported in the first cell).
## text_clf = ...(n_estimators=...)


# TODO (exercise): fit the text classifier to the training data; pass in the document term
# matrix "x_train_tfidf" and the corresponding labels "twenty_train.target".
## text_clf.fit(..., ...)

# NOTE: if the notebook is run top to bottom without completing both TODOs, "text_clf"
# still refers to the logistic regression model fitted earlier, so the prediction and
# evaluation cells below will report that model's results instead.



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

#### Test our gradient boosted classifier model on sample texts
Try putting some other string texts into "docs_new" and see which category our model predicts.

In [12]:
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Gun violence is a real problem',
    'Doctors cure critical brain tumor in patient',
    'Space Radiation Doesnt Seem to Be Causing Astronauts to Die from Cancer, Study Finds',
]

# Both transformers are already fitted to the training data, so the new texts
# are only passed through transform().
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = text_clf.predict(X_new_tfidf)

# Predictions are integer labels; translate them to category names for display.
label_names = twenty_train.target_names
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, label_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => sci.med
'Gun violence is a real problem' => talk.politics.guns
'Doctors cure critical brain tumor in patient' => sci.med
'Space Radiation Doesnt Seem to Be Causing Astronauts to Die from Cancer, Study Finds' => sci.med


#### Evaluate our gradient boosted classifier on test data

In [13]:
# Load the held-out test split for the same four categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Vectorize with the already-fitted transformers, then predict with the current model.
test_counts = count_vect.transform(docs_test)
test_tfidf = tfidf_transformer.transform(test_counts)
predicted = text_clf.predict(test_tfidf)

# Accuracy: the mean of the element-wise "prediction equals true label" comparison.
accuracy = np.mean(predicted == twenty_test.target)
accuracy

0.8332255979314803

## Trying out other news groups:
We only used 4 of the newsgroups! To train our models on different ones, change the `categories` list in the import step at the beginning of this notebook.  
Here are all 20 of the available news groups from the dataset.  
<img src="./screenshots/20news_groups.png" alt="drawing" width="500"/>

## Trying out more models:
Here are a few other models you can try out in your spare time (all are available in scikit-learn except XGBoost, which comes from the separate `xgboost` package):  
- Naive Bayes
- Decision Tree
- Random Forest
- XGBoost