In [97]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [98]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


In [99]:
# Load dataset
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



In [100]:
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, size=100, min_count=1):
        self.size = size
        self.min_count = min_count
        self.model = None

    def fit(self, X, y=None):
        sentences = [word_tokenize(doc) for doc in X]
        self.model = Word2Vec(sentences, size=self.size, min_count=self.min_count)
        return self

    def transform(self, X):
        return [self.model.wv[doc] for doc in X]

class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, min_count=1, epochs=1):
        self.vector_size = vector_size
        self.min_count = min_count
        self.epochs = epochs
        self.model = None

    def fit(self, X, y=None):
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(X)]
        self.model = Doc2Vec(tagged_data, vector_size=self.vector_size, min_count=self.min_count, epochs=self.epochs)
        return self

    def transform(self, X):
        return [self.model.infer_vector(word_tokenize(doc)) for doc in X]

In [101]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

In [104]:
# Define the parameters for GridSearchCV
parameters = {
    'vect': [CountVectorizer(), TfidfVectorizer(), Word2VecTransformer(), Doc2VecTransformer()],
    'clf': [MultinomialNB(), LogisticRegression(), SVC(), DecisionTreeClassifier()],
}

In [105]:
# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yatch\poetry-demo\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yatch\poetry-demo\.venv\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yatch\poetry-demo\.venv\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yatc

In [108]:
# Get the results of the grid search
cv_results = grid_search.cv_results_

In [109]:
# Write the results to a .txt file in a tabular format
with open('grid_search_results.txt', 'w') as f:
    # Write the column names
    f.write("\t".join(cv_results.keys()) + "\n")
    # Write the rows
    for row in zip(*cv_results.values()):
        f.write("\t".join(map(str, row)) + "\n")

In [110]:
# Print the best parameters
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.943
Best parameters set:
	clf: SVC()
	vect: TfidfVectorizer()
