## Text Categorization

The dataset comes from sklearn fetch_20newsgroups datasets (Make more modification later)

In [3]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
# import text_normalize as tn
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

warnings.filterwarnings('ignore')
%matplotlib inline

data_df = pd.read_csv('clean_newsgroups.csv')
# Split train and test sets
train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(np.array(data_df['Clean Article']),
                                         np.array(data_df['Target Label']),
                                         np.array(data_df['Target Name']),
                                         test_size=0.33, random_state=42)
print(train_corpus.shape, test_corpus.shape)

# Distributions of articles

from collections import Counter
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

print('The distribution of topics from the dataset is:')
(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
             columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

The distribution of topics from the dataset is:


Unnamed: 0,Target Label,Train Count,Test Count
12,rec.sport.hockey,686,288
11,sci.crypt,665,297
7,soc.religion.christian,664,310
2,comp.graphics,658,295
0,rec.motorcycles,653,316
14,comp.windows.x,649,331
18,rec.autos,642,293
3,rec.sport.baseball,636,315
17,comp.sys.ibm.pc.hardware,634,329
1,sci.electronics,634,322


### Feature Engineering with Bag of Words (BOW)

In [4]:

# build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)
# transform test articles into features
cv_test_features = cv.transform(test_corpus)
print('BOW model:> Train features shape:', cv_train_features.shape,
      ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (12264, 73402)  Test features shape: (6041, 73402)


#### Training

##### Multinomial Naive Bayes

In [16]:
mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

CV Accuracy (5-fold): [0.6708655  0.65404965 0.67576375 0.66190282 0.67117117]
Mean CV Accuracy: 0.6667505783960094
Test Accuracy: 0.68200629034928


##### Logistic Regression

In [17]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.68508736 0.67969068 0.71120163 0.6957942  0.69164619]
Mean CV Accuracy: 0.692684013048077
Test Accuracy: 0.7103128621089224


##### Support Vector Machines

In [18]:

svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.63023161 0.62189662 0.65539715 0.6439363  0.63963964]
Mean CV Accuracy: 0.6382202647817283
Test Accuracy: 0.6580036417811621


##### SVM with Stochastic Gradient Descent

In [19]:

svm_sgd = SGDClassifier(loss='hinge', penalty="l2", max_iter=5, random_state=42)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.6363267  0.62962963 0.64928717 0.64271131 0.63554464]
Mean CV Accuracy: 0.6386998882841927
Test Accuracy: 0.6351597417646085


##### Random Forest

In [20]:

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.51889476 0.52340252 0.50753564 0.51000408 0.52129402]
Mean CV Accuracy: 0.5162262055544148
Test Accuracy: 0.5307068366164542


##### Gradient Boosting Machines

In [21]:

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

CV Accuracy (5-fold): [0.5558716  0.55555556 0.54582485 0.55206207 0.54340704]
Mean CV Accuracy: 0.5505442218548763
Test Accuracy: 0.5505710975004139


### Feature learning with TF-Idf

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)
# transform test articles into features
tv_test_features = tv.transform(test_corpus)
print('TFIDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (12264, 73402)  Test features shape: (6041, 73402)


#### Multinomial Naïve Bayes

In [23]:
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.70134092 0.69434269 0.7311609  0.70436913 0.71089271]
Mean CV Accuracy: 0.7084212699897765
Test Accuracy: 0.7204105280582686


#### Logistic Regression

In [25]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.74075579 0.72649573 0.75315682 0.74846876 0.73669124]
Mean CV Accuracy: 0.7411136678173997
Test Accuracy: 0.7518622744578712


#### Support Vector Machines

In [26]:
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.74928891 0.74074074 0.77148676 0.76194365 0.75266175]
Mean CV Accuracy: 0.7552243625062525
Test Accuracy: 0.7659327925840093


#### SVM with Stochastic Gradient Descent

In [27]:
svm_sgd = SGDClassifier(loss='hinge', penalty="l2", max_iter=5, random_state=42)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.75700935 0.74277574 0.77230143 0.7639853  0.75184275]
Mean CV Accuracy: 0.7575829132394601
Test Accuracy: 0.7659327925840093


#### Random Forest

In [28]:
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.50873629 0.51933252 0.51446029 0.53736219 0.52538903]
Mean CV Accuracy: 0.5210560609129864
Test Accuracy: 0.5386525409700381


In [42]:
rfc.predict(tv_test_features)

array(['soc.religion.christian', 'misc.forsale', 'soc.religion.christian',
       ..., 'comp.os.ms-windows.misc', 'rec.autos',
       'talk.politics.mideast'], dtype=object)

#### Gradient Boosting

In [30]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.55343356 0.55555556 0.55885947 0.55410372 0.54217854]
Mean CV Accuracy: 0.5528261695193868
Test Accuracy: 0.5530541301109088


#### Summary

In [67]:
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score,
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score,
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score,
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
              columns=['Model', 'CV Score (BOW)', 'Test Score (BOW)',
                      'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (BOW),0.666751,0.692684,0.63822,0.6387,0.516226,0.550544
Test Score (BOW),0.682006,0.710313,0.658004,0.63516,0.530707,0.550571
CV Score (TF-IDF),0.708421,0.741114,0.755224,0.757583,0.521056,0.552826
Test Score (TF-IDF),0.720411,0.751862,0.765933,0.765933,0.538653,0.553054


### Model Tuning

#### Tuning Multinomial Naïve Bayes model

In [32]:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('mnb', MultinomialNB())
                       ])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}

gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
gs_mnb = gs_mnb.fit(train_corpus, train_label_names)

cv_results = gs_mnb.cv_results_
try:
    print(cv_results.keys())
except:
    None
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                           'params': cv_results['params'],
                           'cv score (mean)': cv_results['mean_test_score'],
                           'cv score (std)': cv_results['std_test_score']}
              )
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)

# Test cv score on the test set
best_mnb_test_score = gs_mnb.score(test_corpus, test_label_names)
print('Test Accuracy :', best_mnb_test_score)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished


dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_mnb__alpha', 'param_tfidf__ngram_range', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
Test Accuracy : 0.7790100976659493


#### Tuning Logistic Regression model

In [None]:

lr_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('lr', LogisticRegression(penalty='l2', max_iter=100, random_state=42))
                       ])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'lr__C': [1, 5, 10]
}
gs_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=2)
gs_lr = gs_lr.fit(train_corpus, train_label_names)

# evaluate best tuned model on the test dataset
best_lr_test_score = gs_lr.score(test_corpus, test_label_names)
print('Test Accuracy :', best_lr_test_score)

#### Tuning the Linear SVM model

In [34]:
# Tuning the Linear SVM model
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=42))
                       ])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [0.01, 0.1, 1, 5]
}
gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
gs_svm = gs_svm.fit(train_corpus, train_label_names)
# evaluating best tuned model on the test dataset
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.2min finished


Test Accuracy : 0.7801688462175137


### Model Evaluation

In [97]:
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.metrics import roc_curve, auc
from typing import List, TypeVar, Any
from matplotlib.pyplot import figure
Estimator = TypeVar('SklearnEstimator')
class Evaluation:
    def get_metrics(self,true_labels: List[str], predicted_labels: List[str]) -> str:
        """Take the true labels and predicted labels as input and 
        print out the performance metrics for the model"""
        print('Accuracy:', np.round(
            metrics.accuracy_score(true_labels,
                                   predicted_labels),
            4))
        print('Precision:', np.round(
            metrics.precision_score(true_labels,
                                    predicted_labels,
                                    average='weighted'),
            4))
        print('Recall:', np.round(
            metrics.recall_score(true_labels,
                                 predicted_labels,
                                 average='weighted'),
            4))
        print('F1 Score:', np.round(
            metrics.f1_score(true_labels,
                             predicted_labels,
                             average='weighted'),
            4))
    
    def train_predict_model(self,classifier: Estimator ,
                            train_features: List[List[float]], train_labels: List[str],
                            test_features: List[List[float]]) -> List[str]:
        """Train the given model and output predictions based on input features"""
        # build model
        classifier.fit(train_features, train_labels)
        # predict using model
        predictions = classifier.predict(test_features)
        return predictions
    
    def display_confusion_matrix(self,true_labels: List[str], predicted_labels: List[str], classes=[1, 0]) -> pd.DataFrame:
        """Build a confusion matrix using pandas dataframe from the given inputs and output it """
        total_classes = len(classes)
        level_labels = [total_classes * [0], list(range(total_classes))]

        cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
                                      labels=classes)
        cm_frame = pd.DataFrame(data=cm,
                                columns=pd.MultiIndex(levels=[['Predicted:'], classes],
                                                      labels=level_labels),
                                index=pd.MultiIndex(levels=[['Actual:'], classes],
                                                    labels=level_labels))
        print(cm_frame)
        
    def display_classification_report(self,true_labels: List[str], predicted_labels: List[str], classes=[1, 0]) -> str:
        """Take true labels and predicted labels as inputs and output a string of classification report"""
        report = metrics.classification_report(y_true=true_labels,
                                               y_pred=predicted_labels,
                                               labels=classes)
        print(report)
    def display_model_performance_metrics(self,true_labels: List[str], predicted_labels: List[str], classes=[1, 0]) -> Any:
        print('Model Performance metrics:')
        print('-' * 30)
        get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
        print('\nModel Classification report:')
        print('-' * 30)
        display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
                                      classes=classes)
        print('\nPrediction Confusion Matrix:')
        print('-' * 30)
        display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
                                 classes=classes)
    def plot_model_decision_surface(self,clf: Estimator, train_features: List[List[float]], train_labels: List[str],
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None) -> figure:
        if train_features.shape[1] != 2:
            raise ValueError("X_train should have exactly 2 columnns!")

        x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
        y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
        # Create a grid using x, y values
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))

        clf_est = clone(clf)
        clf_est.fit(train_features, train_labels)
        # Concatnate the results along the columns
        if hasattr(clf_est, 'predict_proba'):
            Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        else:
            Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        cs = plt.contourf(xx, yy, Z, cmap=cmap)
        
        # Encode the labels from the training set
        le = LabelEncoder()
        y_enc = le.fit_transform(train_labels)
        n_classes = len(le.classes_)
        plot_colors = ''.join(colors) if colors else [None] * n_classes
        label_names = le.classes_
        markers = markers if markers else [None] * n_classes
        alphas = alphas if alphas else [None] * n_classes
        # Plot each label with colors, markers and alphas if specified
        for i, color in zip(range(n_classes), plot_colors):
            idx = np.where(y_enc == i)
            plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                        label=label_names[i], cmap=cmap, edgecolors='black',
                        marker=markers[i], alpha=alphas[i])
        plt.legend()
        plt.show()
    
    
    def plot_model_roc_curve(self,clf, features: List[List[float]], true_labels: List[str], label_encoder=None, class_names=None) -> figure:
        ## Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        if hasattr(clf, 'classes_'):
            class_labels = clf.classes_
        elif label_encoder:
            class_labels = label_encoder.classes_
        elif class_names:
            class_labels = class_names
        else:
            raise ValueError('Unable to derive prediction classes, please specify class_names!')
        n_classes = len(class_labels)
        y_test = label_binarize(true_labels, classes=class_labels)
        if n_classes == 2:
            if hasattr(clf, 'predict_proba'):
                prob = clf.predict_proba(features)
                y_score = prob[:, prob.shape[1] - 1]
            elif hasattr(clf, 'decision_function'):
                prob = clf.decision_function(features)
                y_score = prob[:, prob.shape[1] - 1]
            else:
                raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

            fpr, tpr, _ = roc_curve(y_test, y_score)
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
                                     ''.format(roc_auc),
                     linewidth=2.5)

        elif n_classes > 2:
            if hasattr(clf, 'predict_proba'):
                y_score = clf.predict_proba(features)
            elif hasattr(clf, 'decision_function'):
                y_score = clf.decision_function(features)
            else:
                raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

            for i in range(n_classes):
                fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])

            ## Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

            ## Compute macro-average ROC curve and ROC area
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in range(n_classes):
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])
            # Finally average it and compute AUC
            mean_tpr /= n_classes
            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

            ## Plot ROC curves
            plt.figure(figsize=(6, 4))
            plt.plot(fpr["micro"], tpr["micro"],
                     label='micro-average ROC curve (area = {0:0.2f})'
                           ''.format(roc_auc["micro"]), linewidth=3)

            plt.plot(fpr["macro"], tpr["macro"],
                     label='macro-average ROC curve (area = {0:0.2f})'
                           ''.format(roc_auc["macro"]), linewidth=3)

            for i, label in enumerate(class_labels):
                plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                               ''.format(label, roc_auc[i]),
                         linewidth=2, linestyle=':')
        else:
            raise ValueError('Number of classes should be atleast 2 or more')

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.show()

In [104]:
def summary(clf: Estimator, test_labels: List[str], test_data: List[str], model: str) -> str:
    """Print out the performance of a model for 4 metrics"""
    meu = Evaluation()
    predictions = clf.predict(test_data)
    unique_classes = list(set(test_labels))
    print('4 summary metrics for {} are'.format(model))
    meu.get_metrics(true_labels=test_labels, predicted_labels=predictions)

In [98]:
meu = Evaluation()

In [95]:
mnb_predictions  = gs_mnb.predict(test_corpus)
unique_classes = list(set(test_label_names))
model = 'Multinomial Naive Bayes'
print('4 summary metrics for {} are'.format(model))
meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)

4 summary metrics for Multinomial Naive Bayes are
Accuracy: 0.779
Precision: 0.7807
Recall: 0.779
F1 Score: 0.776


In [99]:
meu.display_classification_report(true_labels=test_label_names,
                                  predicted_labels=mnb_predictions,
                                  classes=unique_classes)

                          precision    recall  f1-score   support

         sci.electronics       0.79      0.72      0.75       322
         rec.motorcycles       0.83      0.81      0.82       316
        rec.sport.hockey       0.91      0.91      0.91       288
            misc.forsale       0.84      0.69      0.75       326
  soc.religion.christian       0.67      0.90      0.77       310
      talk.politics.misc       0.64      0.67      0.66       252
      talk.religion.misc       0.61      0.31      0.42       191
comp.sys.ibm.pc.hardware       0.68      0.77      0.72       329
   comp.sys.mac.hardware       0.75      0.72      0.74       296
                 sci.med       0.89      0.88      0.89       336
      talk.politics.guns       0.72      0.79      0.75       305
               sci.space       0.83      0.84      0.83       330
           comp.graphics       0.66      0.78      0.71       295
               rec.autos       0.80      0.83      0.82       293
      rec

In [100]:
svm_predictions = gs_svm.predict(test_corpus)
unique_classes = list(set(test_label_names))
model = 'Support Vector Machines'
print('4 summary metrics for {} are'.format(model))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

4 summary metrics for Support Vector Machines are
Accuracy: 0.7802
Precision: 0.7784
Recall: 0.7802
F1 Score: 0.7779


In [102]:
# metrics performance for each topic
meu.display_classification_report(true_labels=test_label_names,
                                  predicted_labels=svm_predictions,
                                  classes=unique_classes)

                          precision    recall  f1-score   support

         sci.electronics       0.74      0.75      0.74       322
         rec.motorcycles       0.84      0.79      0.82       316
        rec.sport.hockey       0.92      0.92      0.92       288
            misc.forsale       0.78      0.81      0.80       326
  soc.religion.christian       0.74      0.88      0.80       310
      talk.politics.misc       0.67      0.67      0.67       252
      talk.religion.misc       0.54      0.36      0.43       191
comp.sys.ibm.pc.hardware       0.73      0.73      0.73       329
   comp.sys.mac.hardware       0.76      0.75      0.75       296
                 sci.med       0.81      0.88      0.84       336
      talk.politics.guns       0.74      0.75      0.75       305
               sci.space       0.86      0.78      0.82       330
           comp.graphics       0.69      0.75      0.72       295
               rec.autos       0.78      0.81      0.79       293
      rec

In [None]:
summary(clf = gs_lr, test_labels = test_label_names, test_data = test_corpus, model = 'Logistic Regression')