# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [1]:
!pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/06/4d/8d2dd5d81afdea2aa790860d5c7e12f80154923ba827e3fa36759f0bf2cd/FuzzyTM-2.0.5-py3-none-any.whl.metadata
  Downloading FuzzyTM-2.0.5-py3-none-any.whl.metadata (7.8 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f0/fe/b899a3d9a18c9a44a35155c79a4c152cb85990ea38ce6ab7ed73e5caa1b9/pyFUME-0.3.1-py3-none-any.whl.metadata
  Downloading pyFUME-0.3.1-py3-none-any.whl.metadata (9.7 kB)
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for simpful from https://files.pythonhosted.org/packages/9d/0e/aebc2fb0b0f481994179b2ee2b8e6bbf0894d971594688c018375e7076ea/simpful-2.12.0-py3-none-any.whl.metadata
  Downloading simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Down

In [2]:
import numpy as np
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import classification_report

### Choose a few categories fro the entire 20 categories

In [36]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [37]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [38]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



### Define a pipeline combining a text feature extractor with a simple classifier

In [39]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(tol=1e-3)),
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [42]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [43]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                           n_jobs=-1, verbose=1)

### Start the grid search

In [44]:
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


### Best Score

In [50]:
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.950


### Best Parameter

In [51]:
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 1)


### Choose the best model

In [53]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

In [54]:
# Transformers for Word2Vec and Doc2Vec
class Word2VecVectorizer:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        vectors = []
        for doc in X:
            word_vectors = [self.model.wv[word] for word in doc.split() if word in self.model.wv]
            if word_vectors:
                doc_vector = np.mean(word_vectors, axis=0)
                # Normalize the vector to ensure non-negative values
                doc_vector = np.maximum(0, doc_vector)
            else:
                doc_vector = np.zeros(self.model.vector_size)
            vectors.append(doc_vector)
        return np.array(vectors)

class Doc2VecVectorizer:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        vectors = []
        for doc in X:
            vector = self.model.infer_vector(doc.split())
            vectors.append(np.maximum(0, vector))  # Ensure non-negative values
        return np.array(vectors)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=[doc.split() for doc in X_train], vector_size=100, window=5, min_count=1, workers=4)

# Train Doc2Vec model
doc2vec_model = Doc2Vec([TaggedDocument(doc.split(), [i]) for i, doc in enumerate(X_train)], vector_size=100, window=5, min_count=1, workers=4)

In [56]:
# Pipelines for different algorithms and feature extractors
pipelines = [
    ('MultinomialNB_CountVectorizer', Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', MultinomialNB(alpha=1e-05))])), 
    ('MultinomialNB_TfidfVectorizer', Pipeline([('vect', TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', MultinomialNB(alpha=1e-05))])),
    ('LogisticRegression_CountVectorizer', Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', LogisticRegression(max_iter=20, penalty='l2'))])), 
    ('LogisticRegression_TfidfVectorizer', Pipeline([('vect', TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', LogisticRegression(max_iter=20, penalty='l2'))])), 
    ('SVM_CountVectorizer', Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', SVC(kernel='rbf'))])), 
    ('SVM_TfidfVectorizer', Pipeline([('vect', TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', SVC(kernel='rbf'))])),
    ('DecisionTree_CountVectorizer', Pipeline([('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', DecisionTreeClassifier())])), 
    ('DecisionTree_TfidfVectorizer', Pipeline([('vect', TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))), ('clf', DecisionTreeClassifier())])),
    ('MultinomialNB_Word2Vec', Pipeline([('vect', Word2VecVectorizer(word2vec_model)), ('clf', MultinomialNB(alpha=1e-05))])),
    ('MultinomialNB_Doc2Vec', Pipeline([('vect', Doc2VecVectorizer(doc2vec_model)), ('clf', MultinomialNB(alpha=1e-05))])),
    ('LogisticRegression_Word2Vec', Pipeline([('vect', Word2VecVectorizer(word2vec_model)), ('clf', LogisticRegression(max_iter=20, penalty='l2'))])),
    ('LogisticRegression_Doc2Vec', Pipeline([('vect', Doc2VecVectorizer(doc2vec_model)), ('clf', LogisticRegression(max_iter=20, penalty='l2'))])),
    ('SVM_Word2Vec', Pipeline([('vect', Word2VecVectorizer(word2vec_model)), ('clf', SVC(kernel='rbf'))])),
    ('SVM_Doc2Vec', Pipeline([('vect', Doc2VecVectorizer(doc2vec_model)), ('clf', SVC(kernel='rbf'))])),
    ('DecisionTree_Word2Vec', Pipeline([('vect', Word2VecVectorizer(word2vec_model)), ('clf', DecisionTreeClassifier())])),
    ('DecisionTree_Doc2Vec', Pipeline([('vect', Doc2VecVectorizer(doc2vec_model)), ('clf', DecisionTreeClassifier())]))
]

In [57]:
# Grid search and benchmark
results = {}
for name, pipeline in pipelines:
    print(f"Evaluating: {name}")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = classification_report(y_test, y_pred)

# Write results to a text file
with open("classification_results.txt", "w") as file:
    for name, result in results.items():
        file.write(f"{name}:\n{result}\n\n")

Evaluating: MultinomialNB_CountVectorizer
Evaluating: MultinomialNB_TfidfVectorizer
Evaluating: LogisticRegression_CountVectorizer


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating: LogisticRegression_TfidfVectorizer
Evaluating: SVM_CountVectorizer
Evaluating: SVM_TfidfVectorizer
Evaluating: DecisionTree_CountVectorizer
Evaluating: DecisionTree_TfidfVectorizer
Evaluating: MultinomialNB_Word2Vec


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating: MultinomialNB_Doc2Vec
Evaluating: LogisticRegression_Word2Vec


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating: LogisticRegression_Doc2Vec


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating: SVM_Word2Vec
Evaluating: SVM_Doc2Vec
Evaluating: DecisionTree_Word2Vec
Evaluating: DecisionTree_Doc2Vec


### Use the model to classify a piece of text

In [60]:
# Fetching data
categories = ['alt.atheism', 'talk.religion.misc']
data_train = fetch_20newsgroups(subset='train', categories=categories)

In [61]:
# Select a sample text
sample_text = data_train.data[0]  # Choose the first document as a sample

In [62]:
# Loading the best-performing model (Logistic Regression with CountVectorizer)
best_model = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=20, penalty='l2')),
])

In [63]:
# Fit the model on the entire training data
best_model.fit(data.data, data.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
# Classify the sample text
predicted_category = best_model.predict([sample_text])[0]
predicted_category_name = data_train.target_names[predicted_category]

In [66]:
print("Predicted category for the sample text:")
print(predicted_category_name)

Predicted category for the sample text:
alt.atheism
