In [19]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import text_normalize as tn
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import nltk
from collections import Counter
from IPython.core.debugger import set_trace
# Model building
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
# %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Retrieval and Preprocessing

We will strip HTML markup, expand contractions (e.g. can't into cannot), remove accented characters, lowercase all words, lemmatize, and remove special characters and stopwords from our text corpus.

In [20]:
# Fetch every 20 Newsgroups post (subset='all') with headers, footers and quoted replies stripped.
data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)
# Integer class id -> newsgroup name lookup
data_labels_map = {class_id: class_name for class_id, class_name in enumerate(data.target_names)}

In [21]:
# Raw article text, numeric class ids, and the matching newsgroup name for each article
corpus = data.data
target_labels = data.target
target_names = [data_labels_map[label] for label in data.target]

# One row per article
data_df = pd.DataFrame({
    'Article': corpus,
    'Target Label': target_labels,
    'Target Name': target_names,
})
print(data_df.shape)
data_df.head(10)

(18846, 3)


Unnamed: 0,Article,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,19,talk.religion.misc


In [22]:
# Count articles that are empty once surrounding whitespace is stripped
total_nulls = int(data_df.Article.str.strip().eq('').sum())
print("Empty documents:", total_nulls)

Empty documents: 515


In [23]:
# Drop articles that are empty or whitespace-only (these are empty strings, not NaN).
# .copy() makes the result an independent frame, so adding columns to it later
# does not operate on a slice of the original (SettingWithCopyWarning).
data_df = data_df[~(data_df.Article.str.strip() == '')].copy()
data_df.shape

(18331, 3)

In [24]:
# English stopwords, minus the negations 'no' and 'not' (so they are not treated as stopwords)
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)

### Single Core

The preprocessing step takes about 20 minutes using one core.

In [9]:
# Normalize every article with the text_normalize helper (single-core path) and
# store the result next to the raw text in a new 'Clean Article1' column.
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'])
data_df['Clean Article1'] = norm_corpus

We save the new clean data into column Clean Article1. Below is what the data look like after processing.

In [10]:
# Reorder columns so the cleaned text sits next to the raw article, then preview
data_df = data_df.loc[:, ['Article', 'Clean Article1', 'Target Label', 'Target Name']]
data_df.head(10)

Unnamed: 0,Article,Clean Article1,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,sure basher pen fan pretty confused lack kind ...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,brother market high performance video card sup...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,&#9; finally say dream mediterranean new &#9; ...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,think scsi card dma transfer disk scsi card dm...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,old jasmine drive use new system understanding...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,back high school work lab assistant bunch expe...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,ae dalla try tech support may line one get start,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",[ stuff delete ] ok solution problem move cana...,10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",yeah second one believe price try get good loo...,10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,christian mean someone believe divinity jesus ...,19,talk.religion.misc


### Parallel Preprocessing

Processing the text on multiple cores is significantly faster than on a single core.

In [25]:
# Multi-process variant of the normalization step, run with 6 processes.
# Note that the raw NumPy values (not the Series) are passed in here.
# Presumably applies the same cleaning as tn.normalize_corpus — TODO confirm.
norm_corpus = tn.parallel_normalize_corpus(n_processes=6,corpus=data_df['Article'].values)
data_df['Clean Article'] = norm_corpus

Took 535.3095 seconds with 6 process(es).


In [33]:
# Keep only the columns needed downstream and cache them on disk so the
# expensive normalization step does not have to be repeated.
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
display(data_df.head(10))
# index=False (the documented boolean) leaves the row index out of the file.
data_df.to_csv('Clean_data.csv', index=False, header=True)

Unnamed: 0,Article,Clean Article,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,sure basher pen fan pretty confused lack kind ...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,brother market high performance video card sup...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,&#9; finally say dream mediterranean new &#9; ...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,think scsi card dma transfer disk scsi card dm...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,old jasmine drive use new system understanding...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,back high school work lab assistant bunch expe...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,ae dalla try tech support may line one get start,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",[ stuff delete ] ok solution problem move cana...,10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",yeah second one believe price try get good loo...,10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,christian mean someone believe divinity jesus ...,19,talk.religion.misc


In [27]:
# Reload the cached clean data written by the preprocessing step
data_df = pd.read_csv('Clean_data.csv')
# Treat empty / whitespace-only cells as missing values.
# `^\s*$` matches exactly the strings the earlier `^(\s?)+$` did, without the
# nested quantifier (an optional group repeated inside `+`).
data_df = data_df.replace(r'^\s*$', np.nan, regex=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18331 entries, 0 to 18330
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18331 non-null  object
 1   Clean Article  18305 non-null  object
 2   Target Label   18331 non-null  int64 
 3   Target Name    18331 non-null  object
dtypes: int64(1), object(3)
memory usage: 573.0+ KB


In [28]:
# Remove every row that has a missing value in any column, then renumber the rows from 0
data_df = data_df.dropna(axis=0, how='any').reset_index(drop=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18305 entries, 0 to 18304
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18305 non-null  object
 1   Clean Article  18305 non-null  object
 2   Target Label   18305 non-null  int64 
 3   Target Name    18305 non-null  object
dtypes: int64(1), object(3)
memory usage: 572.2+ KB


In [29]:
# Split into training and test sets (33% held out, fixed seed).
# Text, numeric labels and label names are split together so they stay aligned.
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    data_df['Clean Article'].to_numpy(),
    data_df['Target Label'].to_numpy(),
    data_df['Target Name'].to_numpy(),
    test_size=0.33,
    random_state=42,
)
train_corpus.shape, test_corpus.shape

((12264,), (6041,))

In [30]:
# Per-class document counts in the train and test splits after preprocessing.
# The Counters are kept as Counters (not converted to plain dicts) so a class that
# is absent from one split counts as 0 instead of raising KeyError.
trd = Counter(train_label_names)
tsd = Counter(test_label_names)
# Rows follow the order classes first appear in the train split; classes seen only
# in the test split are appended so they are not silently left out.
# NOTE: the 'Target Label' column holds the newsgroup names, not the numeric ids.
(pd.DataFrame([[key, trd[key], tsd[key]]
               for key in [*trd, *(name for name in tsd if name not in trd)]],
              columns=['Target Label', 'Train Count', 'Test Count'])
 .sort_values(by=['Train Count', 'Test Count'],
              ascending=False))

Unnamed: 0,Target Label,Train Count,Test Count
12,rec.sport.hockey,686,288
11,sci.crypt,665,297
7,soc.religion.christian,664,310
2,comp.graphics,658,295
0,rec.motorcycles,653,316
14,comp.windows.x,649,331
18,rec.autos,642,293
3,rec.sport.baseball,636,315
17,comp.sys.ibm.pc.hardware,634,329
1,sci.electronics,634,322


## Feature Engineering

### Bag of Words

In [31]:
# Learn the bag-of-words vocabulary from the training articles only and count term occurrences
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

# Encode the test articles with the vocabulary learned from the training split
cv_test_features = cv.transform(test_corpus)

print(
    'BOW model:> Train features shape:', cv_train_features.shape,
    ' Test features shape:', cv_test_features.shape,
)

BOW model:> Train features shape: (12264, 72430)  Test features shape: (6041, 72430)


There are 72430 unique words in the training corpus vocabulary.

In [34]:
# Multinomial Naive Bayes on bag-of-words counts
mnb = MultinomialNB(alpha=1)

# 5-fold cross-validation on the training split (cross_val_score fits clones of `mnb`)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = mnb_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
mnb.fit(cv_train_features, train_label_names)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

CV Accuracy (5-fold): [0.66517676 0.65527066 0.67576375 0.66149449 0.67567568]
Mean CV Accuracy: 0.6666762646724255
Test Accuracy: 0.6823373613640126


In [35]:
# L2-regularised logistic regression on bag-of-words counts
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits clones of `lr`)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = lr_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
lr.fit(cv_train_features, train_label_names)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.69402682 0.68416768 0.71242363 0.69252756 0.6981982 ]
Mean CV Accuracy: 0.6962687776514593
Test Accuracy: 0.710643933123655


In [36]:
# Linear support vector classifier on bag-of-words counts
svm = LinearSVC(penalty='l2', C=1, random_state=42)

# 5-fold cross-validation on the training split (cross_val_score fits clones of `svm`)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = svm_bow_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
svm.fit(cv_train_features, train_label_names)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.63348232 0.63003663 0.65580448 0.64679461 0.63963964]
Mean CV Accuracy: 0.6411515369262697
Test Accuracy: 0.6553550736633008


In [37]:
# Linear SVM (hinge loss) trained with stochastic gradient descent on bag-of-words counts.
# SGDClassifier is already imported in the notebook's import cell, so the redundant
# in-cell re-import has been dropped.
svm_sgd = SGDClassifier(loss='hinge', penalty="l2", max_iter=5, random_state=42)
# Fit on the whole training split (used for the test score below)
svm_sgd.fit(cv_train_features, train_label_names)
# 5-fold cross-validation on the training split (cross_val_score fits clones of `svm_sgd`)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
# Accuracy on the held-out test split
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.62454287 0.62108262 0.64154786 0.64189465 0.63472563]
Mean CV Accuracy: 0.6327587273891667
Test Accuracy: 0.6571759642443304


In [38]:
# Random forest (10 trees) on bag-of-words counts.
# RandomForestClassifier is already imported in the notebook's import cell, so the
# redundant in-cell re-import has been dropped.
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
# Fit on the whole training split (used for the test score below)
rfc.fit(cv_train_features, train_label_names)
# 5-fold cross-validation on the training split (cross_val_score fits clones of `rfc`)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
# Accuracy on the held-out test split
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.52783421 0.50834351 0.5299389  0.50918742 0.53931204]
Mean CV Accuracy: 0.5229232170063212
Test Accuracy: 0.5326932627048502


In [39]:
# Gradient boosting (10 boosting stages) on bag-of-words counts.
# GradientBoostingClassifier is already imported in the notebook's import cell, so
# the redundant in-cell re-import has been dropped.
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
# Fit on the whole training split (used for the test score below)
gbc.fit(cv_train_features, train_label_names)
# 5-fold cross-validation on the training split (cross_val_score fits clones of `gbc`)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
# Accuracy on the held-out test split
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

CV Accuracy (5-fold): [0.54530679 0.55677656 0.54949084 0.54920376 0.54709255]
Mean CV Accuracy: 0.5495740962788843
Test Accuracy: 0.5515643105446119


### Tf-Idf 

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and IDF weights from the training articles only
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# Encode the test articles with the vocabulary / weights learned from the training split
tv_test_features = tv.transform(test_corpus)

print(
    'TFIDF model:> Train features shape:', tv_train_features.shape,
    ' Test features shape:', tv_test_features.shape,
)

TFIDF model:> Train features shape: (12264, 72430)  Test features shape: (6041, 72430)


Again, there are 72430 unique tf-idf features, one per word in the training corpus vocabulary.

In [41]:
# Multinomial Naive Bayes on TF-IDF features
mnb = MultinomialNB(alpha=1)

# 5-fold cross-validation on the training split (cross_val_score fits clones of `mnb`)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = mnb_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.70174726 0.6955637  0.73197556 0.70681911 0.70966421]
Mean CV Accuracy: 0.7091539664725277
Test Accuracy: 0.7194173150140705


In [42]:
# L2-regularised logistic regression on TF-IDF features
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)

# 5-fold cross-validation on the training split, folds evaluated by 6 parallel jobs
lr_tfidf_cv_scores = cross_val_score(
    lr, tv_train_features, train_label_names, cv=5, n_jobs=6,
)
lr_tfidf_cv_mean_score = lr_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
lr.fit(tv_train_features, train_label_names)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.73587972 0.72364672 0.75356415 0.74642711 0.74160524]
Mean CV Accuracy: 0.7402245913670127
Test Accuracy: 0.7505379903989405


In [43]:
# Linear support vector classifier on TF-IDF features
svm = LinearSVC(penalty='l2', C=1, random_state=42)

# 5-fold cross-validation on the training split, folds evaluated by 6 parallel jobs
svm_tfidf_cv_scores = cross_val_score(
    svm, tv_train_features, train_label_names, cv=5, n_jobs=6,
)
svm_tfidf_cv_mean_score = svm_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
svm.fit(tv_train_features, train_label_names)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.74766355 0.73992674 0.77759674 0.76112699 0.75962326]
Mean CV Accuracy: 0.7571874565808953
Test Accuracy: 0.7664293991061083


In [44]:
# Linear SVM (hinge loss) trained with stochastic gradient descent on TF-IDF features
svm_sgd = SGDClassifier(loss='hinge', penalty="l2", max_iter=5, random_state=42)

# 5-fold cross-validation on the training split, folds evaluated by 6 parallel jobs
svmsgd_tfidf_cv_scores = cross_val_score(
    svm_sgd, tv_train_features, train_label_names, cv=5, n_jobs=6,
)
svmsgd_tfidf_cv_mean_score = svmsgd_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.75497765 0.74277574 0.77230143 0.76847693 0.75675676]
Mean CV Accuracy: 0.7590577011829144
Test Accuracy: 0.7639463664956133


In [45]:
# Random forest (10 trees) on TF-IDF features
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split, folds evaluated by 6 parallel jobs
rfc_tfidf_cv_scores = cross_val_score(
    rfc, tv_train_features, train_label_names, cv=5, n_jobs=6,
)
rfc_tfidf_cv_mean_score = rfc_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.52580252 0.55270655 0.53564155 0.5320539  0.51924652]
Mean CV Accuracy: 0.5330902077333027
Test Accuracy: 0.5437841416983943


In [46]:
# Gradient boosting (10 boosting stages) on TF-IDF features
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)

# 5-fold cross-validation on the training split, folds evaluated by 6 parallel jobs
gbc_tfidf_cv_scores = cross_val_score(
    gbc, tv_train_features, train_label_names, cv=5, n_jobs=6,
)
gbc_tfidf_cv_mean_score = gbc_tfidf_cv_scores.mean()
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)

# Fit on the whole training split, then score once on the held-out test split
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.54815116 0.55311355 0.55885947 0.55042875 0.55159705]
Mean CV Accuracy: 0.5524299959343953
Test Accuracy: 0.5542128786624732


### Text Classification with Keras

In [29]:
# Preview the raw (un-normalized) article text column
data_df.Article

0        \n\nI am sure some bashers of Pens fans are pr...
1        My brother is in the market for a high-perform...
2        \n\n\n\n\tFinally you said what you dream abou...
3        \nThink!\n\nIt's the SCSI card doing the DMA t...
4        1)    I have an old Jasmine drive which I cann...
                               ...                        
18300    DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...
18301    \nNot in isolated ground recepticles (usually ...
18302    I just installed a DX2-66 CPU in a clone mothe...
18303    \nWouldn't this require a hyper-sphere.  In 3-...
18304    After a tip from Gary Crum (crum@fcom.cc.utah....
Name: Article, Length: 18305, dtype: object

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Settings for the Keras text pipeline (not referenced by any cell visible here).
maxlen = 100                 # presumably the max number of tokens kept per document — TODO confirm
training_samples = 200       # presumably the number of training documents — TODO confirm
validation_samples = 10000   # presumably the number of validation documents — TODO confirm
max_words = 10000            # presumably the tokenizer vocabulary size cap — TODO confirm

### Not now

In [32]:
def document_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector feature row per tokenized document.

    Parameters
    ----------
    corpus : iterable of iterable of str
        Tokenized documents (each document is a sequence of tokens).
    model : object exposing ``model.wv``
        ``model.wv[word]`` must return the vector for ``word``, and ``model.wv``
        must expose its vocabulary as ``index_to_key`` (gensim >= 4) or
        ``index2word`` (gensim < 4).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_documents, num_features)``. A document with
        no in-vocabulary token yields an all-zero row; an empty corpus yields an
        array of shape ``(0, num_features)``.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; accept whichever exists.
    vocab_words = getattr(keyed_vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    vocabulary = set(vocab_words)

    def average_word_vectors(words, model, vocabulary, num_features):
        # Sum the vectors of in-vocabulary tokens, then divide by how many were found.
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        # Skip the division when nothing matched so the row stays all zeros.
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    if not features:
        # Keep the result 2-D even when there are no documents.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(features)

In [35]:
# Tokenize the cleaned train / test articles with the project tokenizer
tokenized_train = list(map(tn.tokenizer.tokenize, train_corpus))
tokenized_test = list(map(tn.tokenizer.tokenize, test_corpus))

import gensim

# Train a skip-gram (sg=1) Word2Vec model on the training tokens only.
# NOTE(review): `size` and `iter` are the gensim < 4 keyword names — confirm the
# installed gensim version before running.
w2v_num_features = 1000
w2v_model = gensim.models.Word2Vec(
    tokenized_train,
    size=w2v_num_features,
    window=100,
    min_count=2,
    sample=1e-3,
    sg=1,
    iter=5,
    workers=10,
)

# Document-level features: average the word vectors of each document.
# Only embeddings learned from the training split are used, so the test
# split remains unseen data.
avg_wv_train_features = document_vectorizer(
    corpus=tokenized_train, model=w2v_model, num_features=w2v_num_features)
avg_wv_test_features = document_vectorizer(
    corpus=tokenized_test, model=w2v_model, num_features=w2v_num_features)
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
      ' Test features shape:', avg_wv_test_features.shape)

DistributionNotFound: The 'pyasn1-modules>=0.2.1' distribution was not found and is required by google-auth