In [13]:
import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import random

In [27]:
# Load the train and test dataset from Sklearn in random order without the 
# headers, footers, and quotes
dataset_train = fetch_20newsgroups(subset='train', remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)
dataset_test = fetch_20newsgroups(subset='test', remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)

In [28]:
# Perform preprocessing on the train and test set
X_train = []
for x in dataset_train.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_train.append(x)

y_train = []
for y in dataset_train.target:
    y_train.append(y)

X_test = []
for x in dataset_test.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_test.append(x)

y_test = []
for y in dataset_test.target:
    y_test.append(y)

In [29]:
# Use the sklearn CountVectorizer to create a bag-of-words encoding of the
# input. By default, the CountVectorizer will lowercase the input
# strings. We manually specify that we only want the 10,000 most frequently
# used words to appear in the output.
cv = CountVectorizer(max_features = 10000)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [30]:
# Use the sklearn TfidfTransformer to compute the Tf-idf scores for evaluation.
tf = TfidfTransformer()
X_train_tf = tf.fit_transform(X_train_cv)
X_test_tf = tf.transform(X_test_cv)

In [31]:
# Use the grid search to find the optimal hyperparameters for Logistic
# Regression. We manually set max_iter to 1000 to ensure the logistic regressor
# converges.
lr_clf = LogisticRegression(max_iter = 1000)

lr_grid = {
    'C' : (50, 40, 30, 20, 10, 5, 1, 0.5, 0.1, 0.05, 0.01)
}

lr_grid_search = GridSearchCV(estimator = lr_clf, param_grid = lr_grid, cv = 2)
lr_grid_search.fit(X_train_tf, y_train)
print(lr_grid_search.best_params_) # Output: {'C': 10}

{'C': 10}


In [32]:
# Try the Logistic Regression with the optimal hyperparameters on the 
# bag-of-words encoded inputs. We should get an accuracy of approximately
# 0.65 for Logistic Regression. 
lr_model = LogisticRegression(C = 10, max_iter = 1000)
lr_model.fit(X_train_tf, y_train)
lr_pred = lr_model.predict(X_test_tf)
print(metrics.classification_report(y_test, lr_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.49      0.47      0.48       319
           comp.graphics       0.61      0.66      0.63       389
 comp.os.ms-windows.misc       0.60      0.59      0.59       394
comp.sys.ibm.pc.hardware       0.59      0.60      0.59       392
   comp.sys.mac.hardware       0.70      0.64      0.67       385
          comp.windows.x       0.80      0.65      0.72       395
            misc.forsale       0.75      0.77      0.76       390
               rec.autos       0.68      0.67      0.67       396
         rec.motorcycles       0.73      0.73      0.73       398
      rec.sport.baseball       0.51      0.79      0.62       397
        rec.sport.hockey       0.87      0.84      0.86       399
               sci.crypt       0.81      0.67      0.74       396
         sci.electronics       0.54      0.54      0.54       393
                 sci.med       0.74      0.71      0.72       396
         

In [33]:
# Use the grid search to find the optimal hyperparameters for MultinomialNB.
mnb_clf = MultinomialNB()

mnb_grid = {
    'alpha': (10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001)
}

mnb_grid_search = GridSearchCV(estimator = mnb_clf, param_grid = mnb_grid, cv = 2)
mnb_grid_search.fit(X_train_tf, y_train)
print(mnb_grid_search.best_params_) # Output: {'alpha': 0.05}

{'alpha': 0.05}


In [34]:
# Try the MultinomialNB with the optimal hyperparameters on the bag-of-words
# encoded inputs. We should get an accuracy of approximately 0.68 for
# MultinomialNB. 
mnb_model = MultinomialNB(alpha = 0.05)
mnb_model.fit(X_train_tf, y_train)
mnb_pred = mnb_model.predict(X_test_tf)
print(metrics.classification_report(y_test, mnb_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.47      0.44      0.46       319
           comp.graphics       0.60      0.70      0.65       389
 comp.os.ms-windows.misc       0.64      0.56      0.60       394
comp.sys.ibm.pc.hardware       0.59      0.66      0.62       392
   comp.sys.mac.hardware       0.69      0.63      0.66       385
          comp.windows.x       0.79      0.74      0.76       395
            misc.forsale       0.80      0.75      0.77       390
               rec.autos       0.73      0.70      0.72       396
         rec.motorcycles       0.73      0.73      0.73       398
      rec.sport.baseball       0.88      0.79      0.83       397
        rec.sport.hockey       0.58      0.90      0.71       399
               sci.crypt       0.79      0.71      0.75       396
         sci.electronics       0.64      0.54      0.59       393
                 sci.med       0.82      0.76      0.79       396
         

Using dataset that is not been process by the TfidfTransformer

In [36]:
# Use the grid search to find the optimal hyperparameters for Logistic
# Regression with dataset that is not been process by the TfidfTransformer.

lr_clf = LogisticRegression(max_iter = 1000)

lr_grid = {
    'C' : (50, 40, 30, 20, 10, 5, 1, 0.5, 0.1, 0.05, 0.01)
}

lr_grid_search = GridSearchCV(estimator = lr_clf, param_grid = lr_grid, cv = 2)
lr_grid_search.fit(X_train_cv, y_train)
print(lr_grid_search.best_params_) # Output: {'C': 0.1}

{'C': 0.1}


In [37]:
# We should get an accuracy of approximately 0.59. 
lr_model = LogisticRegression(C = 0.1,max_iter = 1000)
lr_model.fit(X_train_cv, y_train)
lr_pred = lr_model.predict(X_test_cv)
print(metrics.classification_report(y_test, lr_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.41      0.40      0.41       319
           comp.graphics       0.55      0.62      0.59       389
 comp.os.ms-windows.misc       0.61      0.55      0.58       394
comp.sys.ibm.pc.hardware       0.59      0.53      0.56       392
   comp.sys.mac.hardware       0.65      0.59      0.62       385
          comp.windows.x       0.76      0.60      0.67       395
            misc.forsale       0.72      0.77      0.74       390
               rec.autos       0.58      0.61      0.60       396
         rec.motorcycles       0.56      0.67      0.61       398
      rec.sport.baseball       0.39      0.74      0.51       397
        rec.sport.hockey       0.83      0.79      0.81       399
               sci.crypt       0.79      0.59      0.67       396
         sci.electronics       0.48      0.50      0.49       393
                 sci.med       0.67      0.59      0.63       396
         

In [38]:
# Use the grid search to find the optimal hyperparameters for MultinomialNB with
# dataset that is not been process by the TfidfTransformer.
mnb_clf = MultinomialNB()

mnb_grid = {
    'alpha': (10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001)
}

mnb_grid_search = GridSearchCV(estimator = mnb_clf, param_grid = mnb_grid, cv = 2)
mnb_grid_search.fit(X_train_cv, y_train)
print(mnb_grid_search.best_params_) # Output: {'alpha': 0.05}

{'alpha': 0.05}


In [39]:
# We should get an accuracy of approximately 0.61. 
mnb_model = MultinomialNB(alpha = 0.05)
mnb_model.fit(X_train_cv, y_train)
mnb_pred = mnb_model.predict(X_test_cv)
print(metrics.classification_report(y_test, mnb_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.41      0.51      0.45       319
           comp.graphics       0.50      0.69      0.58       389
 comp.os.ms-windows.misc       0.33      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.48      0.66      0.56       392
   comp.sys.mac.hardware       0.53      0.64      0.58       385
          comp.windows.x       0.75      0.65      0.70       395
            misc.forsale       0.82      0.73      0.77       390
               rec.autos       0.65      0.68      0.67       396
         rec.motorcycles       0.64      0.73      0.68       398
      rec.sport.baseball       0.82      0.76      0.79       397
        rec.sport.hockey       0.57      0.78      0.66       399
               sci.crypt       0.76      0.69      0.72       396
         sci.electronics       0.60      0.52      0.56       393
                 sci.med       0.77      0.72      0.75       396
         

Using dataset with 10 ranodom classes

In [96]:
# Picking 10 random classes for text classification
category_list = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc']

category = random.choices(category_list, k = 10)
print(category)

['sci.electronics', 'soc.religion.christian', 'rec.sport.hockey', 'talk.religion.misc', 'comp.sys.ibm.pc.hardware', 'alt.atheism', 'misc.forsale', 'talk.politics.misc', 'rec.sport.baseball', 'rec.motorcycles']


In [97]:
# Load the train and test dataset with the 10 random classes from Sklearn in
# random order without the headers, footers, and quotes
dataset_train = fetch_20newsgroups(subset='train', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)
dataset_test = fetch_20newsgroups(subset='test', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)

In [98]:
# Perform preprocessing on the train and test set
X_train = []
for x in dataset_train.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_train.append(x)

y_train = []
for y in dataset_train.target:
    y_train.append(y)

X_test = []
for x in dataset_test.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_test.append(x)

y_test = []
for y in dataset_test.target:
    y_test.append(y)

In [99]:
cv = CountVectorizer(max_features=10000)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [100]:
tf = TfidfTransformer()
X_train_tf = tf.fit_transform(X_train_cv)
X_test_tf = tf.transform(X_test_cv)

In [101]:
lr_clf = LogisticRegression(max_iter = 1000)

lr_grid = {
    'C' : (50, 40, 30, 20, 10, 5, 1, 0.5, 0.1, 0.05, 0.01)
}

lr_grid_search = GridSearchCV(estimator = lr_clf, param_grid = lr_grid, cv = 2)
lr_grid_search.fit(X_train_tf, y_train)
print(lr_grid_search.best_params_) # Output: {'C': 10}

{'C': 10}


In [102]:
# We should get an accuracy of approximately 0.72. 
lr_model = LogisticRegression(C = 10,max_iter = 1000)
lr_model.fit(X_train_tf, y_train)
lr_pred = lr_model.predict(X_test_tf)
print(metrics.classification_report(y_test, lr_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.58      0.51      0.54       319
comp.sys.ibm.pc.hardware       0.79      0.81      0.80       392
            misc.forsale       0.85      0.82      0.83       390
         rec.motorcycles       0.80      0.80      0.80       398
      rec.sport.baseball       0.68      0.81      0.74       397
        rec.sport.hockey       0.89      0.84      0.86       399
         sci.electronics       0.72      0.70      0.71       393
  soc.religion.christian       0.69      0.78      0.73       398
      talk.politics.misc       0.71      0.60      0.65       310
      talk.religion.misc       0.41      0.39      0.40       251

                accuracy                           0.72      3647
               macro avg       0.71      0.71      0.71      3647
            weighted avg       0.73      0.72      0.72      3647



In [103]:
mnb_clf = MultinomialNB()

mnb_grid = {
    'alpha': (10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001)
}

mnb_grid_search = GridSearchCV(estimator = mnb_clf, param_grid = mnb_grid, cv = 2)
mnb_grid_search.fit(X_train_tf, y_train)
print(mnb_grid_search.best_params_) # Output: {'alpha': 0.01}

{'alpha': 0.01}


In [104]:
# We should get an accuracy of approximately 0.74. 
mnb_model = MultinomialNB(alpha = 0.01)
mnb_model.fit(X_train_tf, y_train)
mnb_pred = mnb_model.predict(X_test_tf)
print(metrics.classification_report(y_test, mnb_pred, target_names=dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.50      0.56       319
comp.sys.ibm.pc.hardware       0.77      0.85      0.81       392
            misc.forsale       0.87      0.79      0.83       390
         rec.motorcycles       0.83      0.81      0.82       398
      rec.sport.baseball       0.90      0.81      0.86       397
        rec.sport.hockey       0.75      0.93      0.83       399
         sci.electronics       0.75      0.69      0.72       393
  soc.religion.christian       0.61      0.86      0.71       398
      talk.politics.misc       0.72      0.61      0.66       310
      talk.religion.misc       0.42      0.30      0.35       251

                accuracy                           0.74      3647
               macro avg       0.73      0.72      0.71      3647
            weighted avg       0.74      0.74      0.73      3647



Using dataset with 5 ranodom classes

In [120]:
# Picking 5 random classes for text classification
category_list = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc']

category = random.choices(category_list, k = 5)
print(category)

['alt.atheism', 'comp.sys.ibm.pc.hardware', 'rec.sport.hockey', 'misc.forsale', 'sci.crypt']


In [121]:
# Load the train and test dataset with the 5 random classes from Sklearn in
# random order without the headers, footers, and quotes
dataset_train = fetch_20newsgroups(subset='train', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)
dataset_test = fetch_20newsgroups(subset='test', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)

In [122]:
# Perform preprocessing on the train and test set
X_train = []
for x in dataset_train.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_train.append(x)

y_train = []
for y in dataset_train.target:
    y_train.append(y)

X_test = []
for x in dataset_test.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_test.append(x)

y_test = []
for y in dataset_test.target:
    y_test.append(y)

In [123]:
cv = CountVectorizer(max_features=10000)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [124]:
tf = TfidfTransformer()
X_train_tf = tf.fit_transform(X_train_cv)
X_test_tf = tf.transform(X_test_cv)

In [125]:
lr_clf = LogisticRegression(max_iter = 1000)

lr_grid = {
    'C' : (50, 40, 30, 20, 10, 5, 1, 0.5, 0.1, 0.05, 0.01)
}

lr_grid_search = GridSearchCV(estimator = lr_clf, param_grid = lr_grid, cv = 2)
lr_grid_search.fit(X_train_tf, y_train)
print(lr_grid_search.best_params_) # Output: {'C': 20}

{'C': 20}


In [126]:
# We should get an accuracy of approximately 0.87. 
lr_model = LogisticRegression(C = 20,max_iter = 1000)
lr_model.fit(X_train_tf, y_train)
lr_pred = lr_model.predict(X_test_tf)
print(metrics.classification_report(y_test, lr_pred, target_names = dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.89      0.85      0.87       319
comp.sys.ibm.pc.hardware       0.86      0.90      0.88       392
            misc.forsale       0.88      0.84      0.86       390
        rec.sport.hockey       0.83      0.92      0.87       399
               sci.crypt       0.90      0.83      0.86       396

                accuracy                           0.87      1896
               macro avg       0.87      0.87      0.87      1896
            weighted avg       0.87      0.87      0.87      1896



In [127]:
mnb_clf = MultinomialNB()

mnb_grid = {
    'alpha': (10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001)
}

mnb_grid_search = GridSearchCV(estimator = mnb_clf, param_grid = mnb_grid, cv = 2)
mnb_grid_search.fit(X_train_tf, y_train)
print(mnb_grid_search.best_params_) # Output: {'alpha': 0.05}

{'alpha': 0.05}


In [128]:
# We should get an accuracy of approximately 0.89. 
mnb_model = MultinomialNB(alpha = 0.05)
mnb_model.fit(X_train_tf, y_train)
mnb_pred = mnb_model.predict(X_test_tf)
print(metrics.classification_report(y_test, mnb_pred, target_names=dataset_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.85      0.88       319
comp.sys.ibm.pc.hardware       0.86      0.93      0.89       392
            misc.forsale       0.93      0.84      0.88       390
        rec.sport.hockey       0.85      0.96      0.90       399
               sci.crypt       0.90      0.86      0.88       396

                accuracy                           0.89      1896
               macro avg       0.89      0.89      0.89      1896
            weighted avg       0.89      0.89      0.89      1896



Using dataset with 3 ranodom classes

In [163]:
# Picking 3 random classes for text classification
category_list = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc']

category = random.choices(category_list, k = 3)
print(category)

['rec.sport.hockey', 'talk.politics.misc', 'sci.space']


In [164]:
# Load the train and test dataset with the 3 random classes from Sklearn in
# random order without the headers, footers, and quotes
dataset_train = fetch_20newsgroups(subset='train', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)
dataset_test = fetch_20newsgroups(subset='test', categories = category, remove=(['headers', 'footers', 'quotes']), shuffle = True, random_state=100)

In [165]:
# Perform preprocessing on the train and test set
X_train = []
for x in dataset_train.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_train.append(x)

y_train = []
for y in dataset_train.target:
    y_train.append(y)

X_test = []
for x in dataset_test.data:
    x = re.sub('[^\w]|_', ' ', str(x))
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    X_test.append(x)

y_test = []
for y in dataset_test.target:
    y_test.append(y)

In [166]:
cv = CountVectorizer(max_features=10000)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [167]:
tf = TfidfTransformer()
X_train_tf = tf.fit_transform(X_train_cv)
X_test_tf = tf.transform(X_test_cv)

In [168]:
lr_clf = LogisticRegression(max_iter = 1000)

lr_grid = {
    'C' : (50, 40, 30, 20, 10, 5, 1, 0.5, 0.1, 0.05, 0.01)
}

lr_grid_search = GridSearchCV(estimator = lr_clf, param_grid = lr_grid, cv = 2)
lr_grid_search.fit(X_train_tf, y_train)
print(lr_grid_search.best_params_) # Output: {'C': 20}

{'C': 20}


In [169]:
# We should get an accuracy of approximately 0.88. 
lr_model = LogisticRegression(C = 20,max_iter = 1000)
lr_model.fit(X_train_tf, y_train)
lr_pred = lr_model.predict(X_test_tf)
print(metrics.classification_report(y_test, lr_pred, target_names = dataset_train.target_names))

                    precision    recall  f1-score   support

  rec.sport.hockey       0.94      0.90      0.92       399
         sci.space       0.85      0.91      0.88       394
talk.politics.misc       0.86      0.83      0.84       310

          accuracy                           0.88      1103
         macro avg       0.88      0.88      0.88      1103
      weighted avg       0.89      0.88      0.89      1103



In [170]:
mnb_clf = MultinomialNB()

mnb_grid = {
    'alpha': (10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001)
}

mnb_grid_search = GridSearchCV(estimator = mnb_clf, param_grid = mnb_grid, cv = 2)
mnb_grid_search.fit(X_train_tf, y_train)
print(mnb_grid_search.best_params_) # Output: {'alpha': 0.005}

{'alpha': 0.005}


In [171]:
# We should get an accuracy of approximately 0.90. 
mnb_model = MultinomialNB(alpha = 0.005)
mnb_model.fit(X_train_tf, y_train)
mnb_pred = mnb_model.predict(X_test_tf)
print(metrics.classification_report(y_test, mnb_pred, target_names=dataset_train.target_names))

                    precision    recall  f1-score   support

  rec.sport.hockey       0.92      0.95      0.94       399
         sci.space       0.91      0.88      0.89       394
talk.politics.misc       0.87      0.86      0.86       310

          accuracy                           0.90      1103
         macro avg       0.90      0.90      0.90      1103
      weighted avg       0.90      0.90      0.90      1103

