# Basic Text Classification

A tutorial I'm following from [this website](https://nlpforhackers.io/text-classification/). Uses scikit-learn.

Let's see how it turns out.

In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

import string

Using sklearn's inbuilt 20_newsgroups dataset for simplicity. It contains a train, and a test subset, that we'll not be using. For now, we explore the entire dataset as it is to figure out what to classify.

In [2]:
news = fetch_20newsgroups(subset='all')
# news

print("Total data points available:" , len(news.data))
print("\nTarget Names\n", news.target_names, " in ", len(news.target_names), " classes.")
print("\nA small sample:")

for text, num_label in zip(news.data[:10], news.target[:10]):
    print('[{0}]:\t\t "{1} ..."'.format(news.target_names[num_label], text[:100].split('\n')[0]))

Total data points available: 18846

Target Names
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']  in  20  classes.

A small sample:
[rec.sport.hockey]:		 "From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu> ..."
[comp.sys.ibm.pc.hardware]:		 "From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson) ..."
[talk.politics.mideast]:		 "From: hilmi-er@dsv.su.se (Hilmi Eren) ..."
[comp.sys.ibm.pc.hardware]:		 "From: guyd@austin.ibm.com (Guy Dawson) ..."
[comp.sys.mac.hardware]:		 "From: Alexander Samuel McDiarmid <am2o+@andrew.cmu.edu> ..."
[sci.electronics]:		 "From: tell@cs.unc.edu (Stephen Tell) ..."
[comp.sys.mac.hardware]:		 "From: lpa8921@tamuts.

Used to train and test the accuracy of the classifier. Uses train_test_split. because why not lmao

In [3]:
def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    
    classifier.fit(X_train, y_train)
    print("Accuracy: ", classifier.score(X_test, y_test))
    return classifier

## Multinomial Naive Bayes classifier

### Trial 1
The first pipeline uses all text. The classifier pipeline has a TF-IDF vectorizer and then a Naive Bayes classifier.

In [4]:
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

train(trial1, news.data, news.target)

Accuracy:  0.8463497453310697


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

### Trial 2
The next pipeline uses the notion of stopwords: eliminates all stopwords at the TF-IDF phase of the pipeline using NLTK's stopwords, imported earlier.

In [5]:
trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB()),
])

train(trial2, news.data, news.target)

Accuracy:  0.8777589134125636


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

### Trial 3

Alongside the optimizations of `Trial 2`, we work with the `alpha` parameter of the Naive-Bayes classifier.

In [6]:
trial3 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=0.05)),
])

train(trial3, news.data, news.target)

Accuracy:  0.9102292020373515


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...     vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])

### Trial 4

Full preprocessing suite: `tokenizing`, `stemming`, `stopwords`, `punctuation` 

In [7]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_tokenize(text)]

trial5 = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                                  stop_words=(stopwords.words('english')+list(string.punctuation)))),
    ('classifier', MultinomialNB(alpha=0.05))
])

train(trial5, news.data, news.target)

  'stop_words.' % sorted(inconsistent))


Accuracy:  0.9110780984719864


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...rue, vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])

#### Note:
The last trial (trial 4) seemed to take much longer than anticipated. Wonder why.