In [1]:
import numpy as np
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the preprocessed corpus back from disk.
# pickle.load can execute arbitrary code while deserializing, so this file
# must come from a trusted source (e.g. an earlier step of this project).
with open('../data/processed_corpus.pkl', 'rb') as corpus_file:
    processed_corpus = pickle.load(corpus_file)

## Embedding

In [3]:
# Wrap each document of the corpus in a TaggedDocument for Doc2Vec training.
# The single tag of a document is its position in the corpus, as a string.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(doc_idx)])
    for doc_idx, tokens in enumerate(processed_corpus)
]

In [4]:
# Step 1: create an untrained Doc2Vec model with the chosen hyperparameters.
model = Doc2Vec(
    vector_size=100,
    min_count=5,
    epochs=20,
)

# Step 2: build the vocabulary from the tagged documents.
model.build_vocab(tagged_data)

# Step 3: train, reusing the document count and epoch count stored on the model.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [5]:
# Infer one vector per document with the trained Doc2Vec model, visiting the
# documents in corpus order.
vectors = list(map(model.infer_vector, processed_corpus))
# Collect the per-document vectors into a single NumPy array.
vectors_np = np.array(vectors)

In [6]:
# Persist the document-vector array to disk as a pickle.
with open('../data/vectors.pkl', 'wb') as vectors_file:
    pickle.dump(vectors_np, vectors_file)

## TF-IDF

In [7]:
# Turn each document (a sequence of tokens) into one space-separated string,
# producing one string per document.
processed_corpus_joined = list(map(' '.join, processed_corpus))

In [8]:
# Configure the TF-IDF vectorizer: unigrams and bigrams, with the vocabulary
# capped at 15000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
)

# Learn the vocabulary/IDF weights from the joined corpus and transform the
# same corpus into its TF-IDF matrix in one call.
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus_joined)

In [9]:
# Convert the TF-IDF matrix to a dense NumPy array.
# This cell rebinds `tfidf_matrix` from the sparse result to the dense array,
# so an unconditional `.toarray()` raises AttributeError when the cell is run
# a second time (a NumPy array has no `toarray` method). The guard makes the
# cell safe to re-run: the conversion only happens while the object still
# exposes `toarray`.
# NOTE(review): the dense array has one column per TF-IDF feature; confirm it
# fits in memory for the full corpus.
if hasattr(tfidf_matrix, "toarray"):
    tfidf_matrix = tfidf_matrix.toarray()

In [10]:
# Persist the TF-IDF matrix to disk as a pickle.
with open("../data/tfidf_matrix.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_matrix, tfidf_file)