In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
from collections import Counter 
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
import text_normalizer as tn
import model_evaluation_utils as meu
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas chained-assignment and scikit-learn convergence
# or deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')
%matplotlib inline

**Loading**

In [3]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies removed.
# NOTE: bare expressions in the middle of this cell (e.g. data_df.head(10)) are
# evaluated but not rendered; only print()/info() output appears for this cell.
data = fetch_20newsgroups(subset='all', shuffle=True, remove=('headers', 'footers', 'quotes'))
data_labels_map = dict(enumerate(data.target_names))  # label number -> newsgroup name

corpus, target_labels, target_names = (data.data, data.target, [data_labels_map[label] for label in data.target])
data_df = pd.DataFrame({'Article': corpus, 'Target Label': target_labels, 'Target Name': target_names})
print(data_df.shape)
data_df.head(10)

# Articles that are empty once surrounding whitespace is stripped.
is_blank = data_df['Article'].str.strip() == ''
total_nulls = int(is_blank.sum())
print("Empty documents:", total_nulls)

# .copy() makes the filtered frame independent, so adding the 'Clean Article'
# column below does not write into a slice of the original frame
# (the situation pandas reports as SettingWithCopyWarning).
data_df = data_df[~is_blank].copy()
data_df.shape

stopword_list = nltk.corpus.stopwords.words('english')
# just to keep negation if any in bi-grams
stopword_list.remove('no')
stopword_list.remove('not')

# normalize our corpus
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'], html_stripping=True, contraction_expansion=True,
                                  accented_char_removal=True, text_lower_case=True, text_lemmatization=True,
                                  text_stemming=False, special_char_removal=True, remove_digits=True,
                                  stopword_removal=True, stopwords=stopword_list)
data_df['Clean Article'] = norm_corpus

data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
data_df.head(10)

# Cells that are empty or whitespace-only after normalization become NaN so
# that dropna() below removes those rows.
data_df = data_df.replace(r'^\s*$', np.nan, regex=True)
data_df.info()

data_df = data_df.dropna().reset_index(drop=True)
data_df.info()

# Persist the cleaned corpus, then continue from the saved file.
data_df.to_csv('clean_newsgroups.csv', index=False)

data_df = pd.read_csv('clean_newsgroups.csv')

# 67/33 split of the cleaned text, numeric labels and label names, kept aligned
# by passing all three arrays to a single train_test_split call.
train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

# Per-class document counts in each split.
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

# .get(key, 0): a class that happens to be absent from the test split is
# reported as 0 instead of raising KeyError.
(pd.DataFrame([[key, trd[key], tsd.get(key, 0)] for key in trd], columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'], ascending=False))

# build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

(18846, 3)
Empty documents: 515
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18331 entries, 0 to 18845
Data columns (total 4 columns):
Article          18331 non-null object
Clean Article    18301 non-null object
Target Label     18331 non-null int64
Target Name      18331 non-null object
dtypes: int64(1), object(3)
memory usage: 716.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301 entries, 0 to 18300
Data columns (total 4 columns):
Article          18301 non-null object
Clean Article    18301 non-null object
Target Label     18301 non-null int64
Target Name      18301 non-null object
dtypes: int64(1), object(3)
memory usage: 572.0+ KB


In [4]:
# build BOW features on train articles
# (fits the vocabulary on the training split only; this repeats the last two
# statements of the preceding cell)
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

In [6]:
# transform test articles into features
# (transform only, so the test split reuses the vocabulary fitted on the training split)
cv_test_features = cv.transform(test_corpus)

In [7]:
# Multinomial Naive Bayes on bag-of-words counts: fit on the training split,
# 5-fold cross-validate on the same split, then score the held-out test split.
mnb = MultinomialNB(alpha=1).fit(cv_train_features, train_label_names)

mnb_bow_cv_scores = cross_val_score(estimator=mnb, X=cv_train_features, y=train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
print('Test Accuracy:', mnb_bow_test_score)

CV Accuracy (5-fold): [0.68346201 0.64343928 0.67890657 0.67292773 0.66721177]
Mean CV Accuracy: 0.6691894717585705
Test Accuracy: 0.6829470198675497


In [8]:
# L2-regularised logistic regression on bag-of-words counts.
lr = LogisticRegression(C=1, penalty='l2', max_iter=100, random_state=42)
lr = lr.fit(cv_train_features, train_label_names)

lr_bow_cv_scores = cross_val_score(estimator=lr, X=cv_train_features, y=train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.69687119 0.67807661 0.70624235 0.70436913 0.69133279]
Mean CV Accuracy: 0.6953784137468023
Test Accuracy: 0.7092715231788079


In [9]:
# Linear SVM (L2 penalty) on bag-of-words counts.
svm = LinearSVC(C=1, penalty='l2', random_state=42).fit(cv_train_features, train_label_names)

svm_bow_cv_scores = cross_val_score(estimator=svm, X=cv_train_features, y=train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.62941894 0.63284434 0.65524276 0.64434463 0.63941128]
Mean CV Accuracy: 0.6402523886836645
Test Accuracy: 0.6582781456953642


In [10]:
# Hinge-loss SGD classifier (a linear SVM trained by stochastic gradient
# descent, capped at 5 passes over the data) on bag-of-words counts.
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', random_state=42, max_iter=5)
svm_sgd = svm_sgd.fit(cv_train_features, train_label_names)

svmsgd_bow_cv_scores = cross_val_score(estimator=svm_sgd, X=cv_train_features, y=train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.63266965 0.61572942 0.65442676 0.63699469 0.63614064]
Mean CV Accuracy: 0.6351922323821617
Test Accuracy: 0.6514900662251656


In [11]:
# Random forest with 10 trees on bag-of-words counts.
rfc = RandomForestClassifier(random_state=42, n_estimators=10).fit(cv_train_features, train_label_names)

rfc_bow_cv_scores = cross_val_score(estimator=rfc, X=cv_train_features, y=train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.50589191 0.50407498 0.54222766 0.51490404 0.51798855]
Mean CV Accuracy: 0.5170174301730878
Test Accuracy: 0.5399006622516557


In [12]:
# Gradient boosting with 10 boosting stages on bag-of-words counts.
gbc = GradientBoostingClassifier(random_state=42, n_estimators=10).fit(cv_train_features, train_label_names)

gbc_bow_cv_scores = cross_val_score(estimator=gbc, X=cv_train_features, y=train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)

print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
print('Test Accuracy:', gbc_bow_test_score)

CV Accuracy (5-fold): [0.54896384 0.52893236 0.55079559 0.56390363 0.56091578]
Mean CV Accuracy: 0.5507022399631334
Test Accuracy: 0.5521523178807947


In [13]:
# build TF-IDF features on train articles
# (vocabulary and IDF weights are fitted on the training split only)
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

In [14]:
# transform test articles into TF-IDF features using the vectorizer fitted on the training split
tv_test_features = tv.transform(test_corpus)

In [15]:
# Sanity check: both matrices must have the same number of columns (one per vocabulary term).
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (12261, 73688)  Test features shape: (6040, 73688)


In [16]:
# Multinomial Naive Bayes on TF-IDF features: fit on the training split,
# 5-fold cross-validate on the same split, then score the held-out test split.
mnb = MultinomialNB(alpha=1).fit(tv_train_features, train_label_names)

mnb_tfidf_cv_scores = cross_val_score(estimator=mnb, X=tv_train_features, y=train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.71556278 0.68500407 0.71521828 0.71661903 0.70809485]
Mean CV Accuracy: 0.7080998018997546
Test Accuracy: 0.7115894039735099


In [17]:
# L2-regularised logistic regression on TF-IDF features.
lr = LogisticRegression(C=1, penalty='l2', max_iter=100, random_state=42)
lr = lr.fit(tv_train_features, train_label_names)

lr_tfidf_cv_scores = cross_val_score(estimator=lr, X=tv_train_features, y=train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.74197481 0.71638142 0.75112199 0.74887709 0.73998365]
Mean CV Accuracy: 0.7396677911134283
Test Accuracy: 0.7504966887417218


In [18]:
# Linear SVM (L2 penalty) on TF-IDF features.
svm = LinearSVC(C=1, penalty='l2', random_state=42).fit(tv_train_features, train_label_names)

svm_tfidf_cv_scores = cross_val_score(estimator=svm, X=tv_train_features, y=train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.75782202 0.73390383 0.7629539  0.76847693 0.75183974]
Mean CV Accuracy: 0.7549992836249155
Test Accuracy: 0.7693708609271523


In [19]:
# Hinge-loss SGD classifier (a linear SVM trained by stochastic gradient
# descent, capped at 5 passes over the data) on TF-IDF features.
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', random_state=42, max_iter=5)
svm_sgd = svm_sgd.fit(tv_train_features, train_label_names)

svmsgd_tfidf_cv_scores = cross_val_score(estimator=svm_sgd, X=tv_train_features, y=train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.7586347  0.73879381 0.76784986 0.77501021 0.7526574 ]
Mean CV Accuracy: 0.7585891945315526
Test Accuracy: 0.7672185430463576


In [20]:
# Random forest with 10 trees on TF-IDF features.
rfc = RandomForestClassifier(random_state=42, n_estimators=10).fit(tv_train_features, train_label_names)

rfc_tfidf_cv_scores = cross_val_score(estimator=rfc, X=tv_train_features, y=train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.50832995 0.5199674  0.53896369 0.519804   0.52902698]
Mean CV Accuracy: 0.5232184040183732
Test Accuracy: 0.5443708609271524


In [21]:
# Gradient boosting with 10 boosting stages on TF-IDF features.
gbc = GradientBoostingClassifier(random_state=42, n_estimators=10).fit(tv_train_features, train_label_names)

gbc_tfidf_cv_scores = cross_val_score(estimator=gbc, X=tv_train_features, y=train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)

print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.54611946 0.53096985 0.55854753 0.57329522 0.5625511 ]
Mean CV Accuracy: 0.5542966333572024
Test Accuracy: 0.5572847682119205


In [22]:
# Side-by-side comparison of every baseline model: mean CV accuracy and test
# accuracy on term-count (TF) features versus TF-IDF features. The frame is
# transposed so that each model becomes a column.
summary_columns = ['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)']
summary_rows = [
    ['Naive Bayes',
     mnb_bow_cv_mean_score, mnb_bow_test_score, mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
    ['Logistic Regression',
     lr_bow_cv_mean_score, lr_bow_test_score, lr_tfidf_cv_mean_score, lr_tfidf_test_score],
    ['Linear SVM',
     svm_bow_cv_mean_score, svm_bow_test_score, svm_tfidf_cv_mean_score, svm_tfidf_test_score],
    ['Linear SVM (SGD)',
     svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
    ['Random Forest',
     rfc_bow_cv_mean_score, rfc_bow_test_score, rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
    ['Gradient Boosted Machines',
     gbc_bow_cv_mean_score, gbc_bow_test_score, gbc_tfidf_cv_mean_score, gbc_tfidf_test_score],
]
pd.DataFrame(summary_rows, columns=summary_columns).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.669189,0.695378,0.640252,0.635192,0.517017,0.550702
Test Score (TF),0.682947,0.709272,0.658278,0.65149,0.539901,0.552152
CV Score (TF-IDF),0.7081,0.739668,0.754999,0.758589,0.523218,0.554297
Test Score (TF-IDF),0.711589,0.750497,0.769371,0.767219,0.544371,0.557285


**Model Tuning**

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer and Naive Bayes chained in one pipeline, so the vectorizer
# is re-fitted inside every cross-validation fold of the search.
mnb_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
])

# 2 n-gram ranges x 5 smoothing values = 10 candidates.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1],
}

gs_mnb = GridSearchCV(estimator=mnb_pipeline, param_grid=param_grid, cv=5, verbose=2)
gs_mnb = gs_mnb.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   3.2s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s remaining:    0.0s


[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   3.1s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   3.0s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   3.8s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   4.0s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=  12.4s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=  11.3s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=  12.1s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  9.4min finished


In [25]:
# Inspect the full parameter set of the best Naive Bayes pipeline found by the grid search.
gs_mnb.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('mnb', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=No

In [26]:
# Widen the column display so the 'params' dictionaries are not truncated.
pd.set_option('display.max_colwidth', 100)

# One row per grid-search candidate, best rank first.
cv_results = gs_mnb.cv_results_
result_columns = {
    'rank': 'rank_test_score',
    'params': 'params',
    'cv score (mean)': 'mean_test_score',
    'cv score (std)': 'std_test_score',
}
results_df = pd.DataFrame({column: cv_results[key] for column, key in result_columns.items()})
results_df = results_df.sort_values(by=['rank'], ascending=True)
results_df

Unnamed: 0,rank,params,cv score (mean),cv score (std)
5,1,"{'mnb__alpha': 0.01, 'tfidf__ngram_range': (1, 2)}",0.76829,0.009847
4,2,"{'mnb__alpha': 0.01, 'tfidf__ngram_range': (1, 1)}",0.76519,0.010087
6,3,"{'mnb__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}",0.755077,0.012249
7,4,"{'mnb__alpha': 0.1, 'tfidf__ngram_range': (1, 2)}",0.752141,0.010325
3,5,"{'mnb__alpha': 0.0001, 'tfidf__ngram_range': (1, 2)}",0.751325,0.009224
1,6,"{'mnb__alpha': 1e-05, 'tfidf__ngram_range': (1, 2)}",0.741294,0.009413
2,7,"{'mnb__alpha': 0.0001, 'tfidf__ngram_range': (1, 1)}",0.73901,0.013688
0,8,"{'mnb__alpha': 1e-05, 'tfidf__ngram_range': (1, 1)}",0.726123,0.014524
8,9,"{'mnb__alpha': 1, 'tfidf__ngram_range': (1, 1)}",0.709159,0.011452
9,10,"{'mnb__alpha': 1, 'tfidf__ngram_range': (1, 2)}",0.702308,0.010658


In [27]:
# Score the grid search's best Naive Bayes pipeline on the held-out test split.
best_mnb_test_score = gs_mnb.score(test_corpus, test_label_names)
print('Test Accuracy :', best_mnb_test_score)

Test Accuracy : 0.7793046357615894


In [29]:
# 

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# TF-IDF vectorizer and logistic regression chained in one pipeline, so the
# vectorizer is re-fitted inside every cross-validation fold of the search.
lr_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(penalty='l2', max_iter=100, random_state=42)),
])

# 2 n-gram ranges x 3 values of C = 6 candidates.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [1, 5, 10],
}

gs_lr = GridSearchCV(estimator=lr_pipeline, param_grid=param_grid, cv=5, verbose=2)
gs_lr = gs_lr.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   8.8s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.8s remaining:    0.0s


[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   9.4s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   9.2s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   8.8s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=   9.1s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total=  34.5s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total=  32.2s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total=  33.2s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.1min finished


In [31]:
# Show the best logistic-regression pipeline selected by the grid search.
gs_lr.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,..., penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [32]:
# Score the grid search's best logistic-regression pipeline on the held-out test split.
best_lr_test_score = gs_lr.score(test_corpus, test_label_names)
print('Test Accuracy :', best_lr_test_score)

Test Accuracy : 0.7695364238410596


In [33]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# TF-IDF vectorizer and linear SVM chained in one pipeline, so the vectorizer
# is re-fitted inside every cross-validation fold of the search.
svm_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC(random_state=42)),
])

# 2 n-gram ranges x 4 values of C = 8 candidates.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'svm__C': [0.01, 0.1, 1, 5],
}

gs_svm = GridSearchCV(estimator=svm_pipeline, param_grid=param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   4.4s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s remaining:    0.0s


[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   4.3s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   4.2s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   4.3s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 1) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 1), total=   4.3s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=  15.4s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=  15.0s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] ........... svm__C=0.01, tfidf__ngram_range=(1, 2), total=  15.4s
[CV] svm__C=0.01, tfidf__ngram_range=(1, 2) ..........................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 11.7min finished


In [34]:
# Inspect the full parameter set of the best linear-SVM pipeline found by the grid search.
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svm', LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
        verbose=0))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 2), norm='l2'

In [35]:
# Score the grid search's best linear-SVM pipeline on the held-out test split.
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)

Test Accuracy : 0.7839403973509934


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# TF-IDF vectorizer and SGD classifier chained in one pipeline, so the
# vectorizer is re-fitted inside every cross-validation fold of the search.
sgd_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('sgd', SGDClassifier(random_state=42)),
])

# 2 n-gram ranges x 4 regularisation strengths = 8 candidates.
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'sgd__alpha': [1e-7, 1e-6, 1e-5, 1e-4],
}

gs_sgd = GridSearchCV(estimator=sgd_pipeline, param_grid=param_grid, cv=5, verbose=2)
gs_sgd = gs_sgd.fit(train_corpus, train_label_names)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s


[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   3.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   3.3s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   3.4s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 1) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 1), total=   3.8s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=  13.7s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=  12.5s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] ...... sgd__alpha=1e-07, tfidf__ngram_range=(1, 2), total=  13.2s
[CV] sgd__alpha=1e-07, tfidf__ngram_range=(1, 2) .....................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  7.9min finished


In [37]:
# Inspect the full parameter set of the best SGD pipeline found by the grid search.
gs_sgd.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
          early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
          l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
          n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
          power_t=0.5, random_state=42, shuffle=True, tol=None,
          validation_fraction=0.1, verbose=0, warm_start=False))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_e

In [38]:
# Score the grid search's best SGD pipeline on the held-out test split.
best_sgd_test_score = gs_sgd.score(test_corpus, test_label_names)
print('Test Accuracy :', best_sgd_test_score)

Test Accuracy : 0.7759933774834437


In [40]:
# Predict the test split with the tuned Naive Bayes pipeline and report
# overall accuracy / precision / recall / F1.
mnb_predictions = gs_mnb.predict(test_corpus)
# Sorted so the class list is deterministic: iterating a set of strings has no
# stable order, which made the per-class report rows change order between runs.
unique_classes = sorted(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)

Accuracy: 0.7793
Precision: 0.7892
Recall: 0.7793
F1 Score: 0.7761


In [41]:
# Per-class precision / recall / F1 for the tuned Naive Bayes predictions;
# `classes` supplies the newsgroup names to report on.
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=mnb_predictions, classes=unique_classes)

                          precision    recall  f1-score   support

 comp.os.ms-windows.misc       0.71      0.69      0.70       304
           comp.graphics       0.66      0.75      0.70       292
         rec.motorcycles       0.83      0.81      0.82       311
   comp.sys.mac.hardware       0.79      0.75      0.77       296
               sci.space       0.84      0.80      0.82       335
      talk.religion.misc       0.80      0.28      0.42       193
      rec.sport.baseball       0.94      0.89      0.91       310
comp.sys.ibm.pc.hardware       0.73      0.71      0.72       343
            misc.forsale       0.85      0.76      0.80       322
               sci.crypt       0.74      0.91      0.82       287
        rec.sport.hockey       0.90      0.92      0.91       318
  soc.religion.christian       0.59      0.93      0.72       318
         sci.electronics       0.78      0.77      0.77       311
      talk.politics.misc       0.73      0.70      0.71       260
         

In [42]:
# Invert the number -> name mapping into name -> number, and tabulate it for lookup.
label_data_map = {name: number for number, name in data_labels_map.items()}
label_pairs = list(label_data_map.items())
label_map_df = pd.DataFrame(label_pairs, columns=['Label Name', 'Label Number'])
label_map_df

Unnamed: 0,Label Name,Label Number
0,alt.atheism,0
1,comp.graphics,1
2,comp.os.ms-windows.misc,2
3,comp.sys.ibm.pc.hardware,3
4,comp.sys.mac.hardware,4
5,comp.windows.x,5
6,misc.forsale,6
7,rec.autos,7
8,rec.motorcycles,8
9,rec.sport.baseball,9


In [43]:
# Confusion matrix indexed by numeric label: predicted newsgroup names are
# first translated to their label numbers via label_data_map.
mnb_prediction_class_nums = [label_data_map[predicted_name] for predicted_name in mnb_predictions]
unique_class_nums = label_map_df['Label Number'].values
meu.display_confusion_matrix_pretty(
    true_labels=test_label_nums,
    predicted_labels=mnb_prediction_class_nums,
    classes=unique_class_nums,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Actual:,0,161,3,2,1,0,1,0,3,3,0,4,6,2,1,3,60,11,14,6,9
Actual:,1,3,218,11,12,4,17,2,0,1,2,1,9,1,1,7,1,2,0,0,0
Actual:,2,1,20,210,27,12,20,0,0,0,0,0,5,3,1,2,1,0,0,2,0
Actual:,3,1,15,27,245,24,3,7,3,0,0,0,5,12,0,1,0,0,0,0,0
Actual:,4,0,14,9,14,223,3,9,2,1,1,0,7,7,0,2,1,1,0,2,0
Actual:,5,1,20,12,4,2,258,1,0,0,0,0,4,4,0,2,2,0,0,1,0
Actual:,6,0,3,4,17,7,0,244,12,3,0,2,9,7,2,5,3,1,2,1,0
Actual:,7,0,3,3,0,0,0,8,241,19,1,0,2,9,1,0,3,9,0,4,0
Actual:,8,1,3,0,1,0,0,4,13,252,3,4,1,4,4,4,8,4,1,4,0
Actual:,9,1,4,0,1,0,1,0,1,1,275,9,4,1,1,0,4,5,0,2,0


In [44]:
# Same confusion matrix, indexed by newsgroup name instead of label number.
# NOTE: this rebinds `unique_classes` to the label-name column of label_map_df.
unique_classes = label_map_df['Label Name'].values
meu.display_confusion_matrix_pretty(true_labels=test_label_names, 
                                    predicted_labels=mnb_predictions, classes=unique_classes)

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
Actual:,alt.atheism,161,3,2,1,0,1,0,3,3,0,4,6,2,1,3,60,11,14,6,9
Actual:,comp.graphics,3,218,11,12,4,17,2,0,1,2,1,9,1,1,7,1,2,0,0,0
Actual:,comp.os.ms-windows.misc,1,20,210,27,12,20,0,0,0,0,0,5,3,1,2,1,0,0,2,0
Actual:,comp.sys.ibm.pc.hardware,1,15,27,245,24,3,7,3,0,0,0,5,12,0,1,0,0,0,0,0
Actual:,comp.sys.mac.hardware,0,14,9,14,223,3,9,2,1,1,0,7,7,0,2,1,1,0,2,0
Actual:,comp.windows.x,1,20,12,4,2,258,1,0,0,0,0,4,4,0,2,2,0,0,1,0
Actual:,misc.forsale,0,3,4,17,7,0,244,12,3,0,2,9,7,2,5,3,1,2,1,0
Actual:,rec.autos,0,3,3,0,0,0,8,241,19,1,0,2,9,1,0,3,9,0,4,0
Actual:,rec.motorcycles,1,3,0,1,0,0,4,13,252,3,4,1,4,4,4,8,4,1,4,0
Actual:,rec.sport.baseball,1,4,0,1,0,1,0,1,1,275,9,4,1,1,0,4,5,0,2,0


In [45]:
# Look up the newsgroup names behind label numbers 0, 15 and 19.
label_map_df[label_map_df['Label Number'].isin([0, 15, 19])]

Unnamed: 0,Label Name,Label Number
0,alt.atheism,0
15,soc.religion.christian,15
19,talk.religion.misc,19


In [46]:
# Split the row positions 0..len-1 of data_df into train/test position arrays.
# NOTE(review): presumably test_size/random_state mirror the split that produced
# test_corpus, so these positions line up with the predictions -- confirm.
train_idx, test_idx = train_test_split(
    np.arange(len(data_df['Article'])),
    test_size=0.33,
    random_state=42,
)
test_idx

array([ 4097,  8528,  7621, ...,  4772,  7800, 14579])

In [47]:
# Confidence of each prediction: the maximum along axis 1 of gs_mnb's
# predict_proba output for the test corpus (one value per row).
predict_probas = gs_mnb.predict_proba(test_corpus).max(axis=1)

# Select the test rows by position and take an explicit copy, so the new
# columns are written to an independent frame rather than to a selection of
# data_df (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
test_df = data_df.iloc[test_idx].copy()
test_df['Predicted Name'] = mnb_predictions
test_df['Predicted Confidence'] = predict_probas
test_df.head()

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
4097,one way to get the system going with one floppy drive and one hard\ndisk on a 63 watt power supp...,one way get system go one floppy drive one hard disk watt power supply first disconnect power fl...,3,comp.sys.ibm.pc.hardware,comp.sys.ibm.pc.hardware,0.615344
8528,"\n\n\n\nI've already written a 5000 char commentary (from my MCI MAIL account, so I\ncan't be ac...",already write char commentary mci mail account not accuse hacker,11,sci.crypt,sci.crypt,0.939236
7621,\n\n\tWhat alternative would you suggest be taken to safeguard the\nlives of Israeli citizens?\n...,alternative would suggest take safeguard life israeli citizen adam adam shostack adamdas harvard...,17,talk.politics.mideast,talk.politics.mideast,0.999789
4754,I have already purchased 72-pin SIMMs for a Quadra 800 from\nMemory Direct (on March 9). How ca...,already purchase pin simms quadra memory direct march tell whether not composite simms rule memo...,4,comp.sys.mac.hardware,comp.sys.mac.hardware,0.978512
15905,I'd say the subject line about covers it. I need a MC68881 Floating point\n Co-processor for ...,would say subject line cover need mc float point co processor sun system please reply via email ...,6,misc.forsale,misc.forsale,0.414476


In [48]:
# Widen column display so more of each article's text is visible.
pd.set_option('display.max_colwidth', 200)

# Five most confident cases where a 'talk.religion.misc' article was
# predicted as 'soc.religion.christian'.
res_df = (
    test_df
    .loc[lambda df: (df['Target Name'] == 'talk.religion.misc')
                    & (df['Predicted Name'] == 'soc.religion.christian')]
    .sort_values(by=['Predicted Confidence'], ascending=False)
    .head(5)
)
res_df

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
14514,"iank@microsoft.com (Ian Kennedy) writes...\n\n\nMore along the lines of Hebrews 12:25-29, I reckon...\n\n\tSee that you refuse not him that speaks. For if they\n\tescaped not who refused him that ...",iankmicrosoft com ian kennedy write along line hebrews reckon see refuse not speak escape not refuse spake earth much shall not escape turn away speak heaven whose voice shake earth promise saying...,19,talk.religion.misc,soc.religion.christian,0.999972
4367,:\n (lots of stuff about the Nicene Creed deleted which can be read in the\n original basenote. I will also leave it up to other LDS netters to\n take Mr. Weiss to task on using Mormon Doctrine...,lot stuff nicene creed delete read original basenote also leave lds netter take mr weiss task use mormon doctrine declare difinitive word lds church teach doctrine hopefully lds netter amiable exp...,19,talk.religion.misc,soc.religion.christian,0.998592
4186,"\nRick, I think we can safely say, 1) Robert is not the only person\nwho understands the Bible, and 2), the leadership of the LDS church\nhistoricly never has. Let's consider some ""personal inter...",rick think safely say robert not person understand bible leadership lds church historicly never let consider personal interpretation see much trust put orthodox mormonism could never confuse ortho...,19,talk.religion.misc,soc.religion.christian,0.99816
8968,"\nZoroaster is far older than Daniel. If anything, one could claim that,\nin a sense, Daniel is a descendant of Zoroaster; as Daniel, though being\nHebrew, has assimilated into Zoroastrianism and ...",zoroaster far old daniel anything one could claim sense daniel descendant zoroaster daniel though hebrew assimilate zoroastrianism successfully introduce religion tanakh judaism however majority b...,19,talk.religion.misc,soc.religion.christian,0.998123
16679,"\nJesus did not say that he was the fulfillment of the Law, and, unless\nI'm mistaken, heaven and earth have not yet passed away. Am I mistaken?\nAnd, even assuming that one can just gloss over th...",jesus not say fulfillment law unless mistaken heaven earth not yet pass away mistaken even assume one gloss portion word jesus really think accomplish not jesus say jew annul v say jesus record wo...,19,talk.religion.misc,soc.religion.christian,0.998123


In [49]:
# Widen column display so more of each article's text is visible.
pd.set_option('display.max_colwidth', 200)

# Five most confident cases where a 'talk.religion.misc' article was
# predicted as 'alt.atheism'.
res_df = (
    test_df
    .loc[lambda df: (df['Target Name'] == 'talk.religion.misc')
                    & (df['Predicted Name'] == 'alt.atheism')]
    .sort_values(by=['Predicted Confidence'], ascending=False)
    .head(5)
)
res_df

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
2467,"Why is the NT tossed out as info on Jesus. I realize it is normally tossed\nout because it contains miracles, but what are the other reasons?\n\nMAC\n--\n*****************************************...",nt toss info jesus realize normally toss contain miracle reason mac michael cobb not raise taxis middle university illinois class pay program champaign urbana bill clinton rd debate cobbalexia lis...,19,talk.religion.misc,alt.atheism,0.99998
3429,"\n\n\nIs there any difference in saying \n\n""Absolute Truth exists, but some people think its a lie""\n\nand\n\n""Truth is relative"" ?\n\nI think there is: in both examples, the first statement is ...",difference say absolute truth exist people think lie truth relative think example first statement fundamental disagreement least two people second statement agree upon put another way someone say ...,19,talk.religion.misc,alt.atheism,0.854971
1675,"\n[...stuff deleted...]\n\nAndy-- I think we do agree, given your clarification of how we were \neach using the terms fact and theory. I'll only add that I\nthink perhaps I feel more strongly abo...",stuff delete andy think agree give clarification use term fact theory add think perhaps feel strongly separate though usage quite valid add falsification rejection not way reduce current usefulnes...,19,talk.religion.misc,alt.atheism,0.853153
12384,"\nAh, you taking everything as literal quotation. No wonder you're confused.\n\nFirst, can I ask that we decide on a definition of ""objective""?\n\n\nAnd?\n\n\nI'd guess that it might be.\n\n\nIt ...",ah take everything literal quotation no wonder confused first ask decide definition objective would guess might may case people unable evaluate complex moral issue rather leave behave immorally mi...,19,talk.religion.misc,alt.atheism,0.807862
14774,[ deleted ]\n[ deleted ]\n\n Evolution is both fact and theory. The THEORY of evolution represents the\nscientific attempt to explain the FACT of evolution. The theory of evolution\ndoes not pr...,delete delete evolution fact theory theory evolution represent scientific attempt explain fact evolution theory evolution not provide fact explain fact safely assume scientific theory neither prov...,19,talk.religion.misc,alt.atheism,0.743446


**Application**

Text classification and categorization are used in several real-world scenarios and applications. Some of them are as follows:
    <br>
    1. News Categorization
    2. Spam Filtering
    3. Music or movie genre categorization
    4. Sentiment analysis
    5. Language detection