<a href="https://colab.research.google.com/github/tonyw54/ml-2/blob/main/Tony_Waterman_Task0_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create a benchmark analysis with different algorithms and feature extractors.

Dataset: 20 Newsgroups (loaded with scikit-learn's `fetch_20newsgroups`)

Feature Extractors:

1. CountVectorizer
2. Word2Vec
3. Doc2Vec
4. TfidfTransformer

Algorithms:

* Multinomial Naïve Bayes
* Logistic Regression
* Support Vector Machines
* Decision Trees

Benchmark every combination of the feature extractors and algorithms listed above, and choose the best-performing feature extractor and algorithm pair.

In [23]:
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy  # For preprocessing
import nltk  # For preprocessing
import re

from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from gensim.models import Word2Vec
from gensim.models import Doc2Vec

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Fetch the 'train' and 'test' subsets of 20 Newsgroups, shuffled with the
# same fixed seed so repeated runs see the documents in the same order.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# Preprocessing

In [None]:
# Load the small English spaCy pipeline with both the named-entity recognizer
# ('ner') and the dependency parser ('parser') disabled for speed; cleaning()
# below only reads token.lemma_ and token.is_stop.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Adapted from https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial
def cleaning(doc):
    """Return the lemmas of the non-stop-word tokens of ``doc`` joined by single spaces.

    ``doc`` is expected to be a spaCy ``Doc``; only ``lemma_`` and ``is_stop``
    are read from each token. An input with no surviving tokens yields ``''``.
    """
    kept_lemmas = (tok.lemma_ for tok in doc if not tok.is_stop)
    return ' '.join(kept_lemmas)

In [None]:
# Clean up training data
X_train_brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in twenty_train.data)
t = time()
X_train = [cleaning(doc) for doc in nlp.pipe(X_train_brief_cleaning, batch_size=1000)]
print('Time to clean up training data: {} mins'.format(round((time() - t) / 60, 2)))

# Clean up testing data
X_test_brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in twenty_test.data)
t = time()
X_test = [cleaning(doc) for doc in nlp.pipe(X_test_brief_cleaning, batch_size=1000)]
print('Time to clean up test data: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up training data: 4.91 mins
Time to clean up test data: 2.82 mins


# Count Vectorizer

In [None]:
# Build the bag-of-words vocabulary from the training text only, then encode
# the test text with that already-fitted vectorizer.
vect = CountVectorizer()
X_train_cv = vect.fit_transform(raw_documents=X_train)
X_test_cv = vect.transform(raw_documents=X_test)

#### Multinomial Naïve Bayes

In [None]:
# Fit Multinomial Naive Bayes on the training counts and predict the test set once.
nb = MultinomialNB()
nb.fit(X_train_cv, twenty_train.target)
y_pred = nb.predict(X_test_cv)
# Accuracy is derived from y_pred rather than nb.score(), which would run a
# second, redundant prediction pass over X_test_cv. The value is the same:
# the fraction of test documents whose predicted label equals the true label.
float(np.mean(y_pred == twenty_test.target))


0.8037705788635157

#### Logistic Regression

In [None]:
# Fit logistic regression (iteration cap raised to 2000) on the training
# counts and predict the test set once.
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_cv, twenty_train.target)
y_pred = lr.predict(X_test_cv)
# Accuracy is derived from y_pred rather than lr.score(), which would run a
# second, redundant prediction pass over X_test_cv. The value is the same:
# the fraction of test documents whose predicted label equals the true label.
float(np.mean(y_pred == twenty_test.target))

0.8027084439723845

#### Support Vector Machines

In [None]:
# Fit a support vector classifier with default settings on the training counts
# and predict the test set once.
# NOTE(review): the accuracy recorded for this cell (~0.10) is far below the
# other classifiers on the same features. SVC() defaults to an RBF kernel; a
# linear kernel (SVC(kernel='linear')) is presumably a better fit for sparse
# word counts — confirm before drawing conclusions from this benchmark row.
svc = SVC()
svc.fit(X_train_cv, twenty_train.target)
y_pred = svc.predict(X_test_cv)
# Accuracy is derived from y_pred rather than svc.score(), which would run a
# second, redundant (and for SVC, slow) prediction pass over X_test_cv. The
# value is the same: the fraction of test documents predicted correctly.
float(np.mean(y_pred == twenty_test.target))

0.10143388210302709

#### Decision Trees

In [None]:
# Fit a decision tree on the training counts and predict the test set once.
# random_state is fixed (same seed as the data loading) so the fitted tree and
# its accuracy are reproducible across runs; an unseeded tree can differ from
# run to run, so the exact score will not match earlier unseeded runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_cv, twenty_train.target)
y_pred = dt.predict(X_test_cv)
# Accuracy is derived from y_pred rather than dt.score(), which would run a
# second, redundant prediction pass over X_test_cv. The value is the same:
# the fraction of test documents whose predicted label equals the true label.
float(np.mean(y_pred == twenty_test.target))

0.581651619755709

# Word2Vec

In [30]:
# Split each cleaned document on whitespace into a list of tokens.
X_train_split = list(map(str.split, X_train))
X_test_split = list(map(str.split, X_test))

# Train the embeddings on the tokenized training documents only, with
# min_count=5 and 4 worker threads.
# NOTE(review): with several workers the training is presumably not exactly
# repeatable between runs — confirm against the gensim docs if exact
# reproducibility matters.
w2v = Word2Vec(min_count=5, workers=4, sentences=X_train_split)

In [35]:
# Sanity-check the embeddings: list the 10 words whose vectors are most
# similar to 'love', as (word, similarity) pairs.
w2v.wv.most_similar('love', topn=10)


[('bless', 0.8133929371833801),
 ('satan', 0.8093088269233704),
 ('sinner', 0.8065330982208252),
 ('heaven', 0.8026398420333862),
 ('hell', 0.7991043925285339),
 ('praise', 0.7911373972892761),
 ('merciful', 0.7892584800720215),
 ('forgive', 0.7878407835960388),
 ('christ', 0.7860519886016846),
 ('sad', 0.7761499285697937)]