In [43]:
import numpy as np
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Load the preprocessed corpus from disk.
# NOTE: pickle can execute arbitrary code on load -- only unpickle files
# produced by this project's own pipeline.
with open('../data/processed_corpus.pkl', 'rb') as corpus_file:
    processed_corpus = pickle.load(corpus_file)

## Embedding

In [45]:
# Wrap every corpus entry in a TaggedDocument whose single tag is the
# entry's position in `processed_corpus`, rendered as a string.
tagged_data = [
    TaggedDocument(words=doc_tokens, tags=[str(doc_position)])
    for doc_position, doc_tokens in enumerate(processed_corpus)
]

In [46]:
# Passing the corpus to the constructor makes gensim build the vocabulary
# and then train for `epochs` passes over `corpus_count` documents -- the
# same two steps as calling build_vocab() and train() by hand.
model = Doc2Vec(
    documents=tagged_data,
    vector_size=100,
    min_count=5,
    epochs=20,
)

In [47]:
# Infer one embedding per corpus entry with the trained model, then stack
# the results into a single NumPy array.
vectors = list(map(model.infer_vector, processed_corpus))
vectors_np = np.array(vectors)

In [48]:
# Persist the inferred document vectors for downstream use.
with open('../data/vectors.pkl', 'wb') as vectors_file:
    pickle.dump(vectors_np, vectors_file)

## TF-IDF

In [49]:
# TfidfVectorizer is fed strings here, so glue each document's tokens
# back together with single spaces.
processed_corpus_joined = list(map(' '.join, processed_corpus))

In [50]:
# TF-IDF over unigrams and bigrams, with the vocabulary limited to
# 15000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus_joined)

In [51]:
# Convert the TF-IDF matrix to a dense NumPy array.
# This cell rebinds `tfidf_matrix`, so after its first run the name already
# refers to an ndarray, which has no `.toarray()`. The guard turns a re-run
# into a no-op instead of an AttributeError.
# NOTE(review): the dense array is n_documents x up to 15000 floats -- confirm
# it fits in memory for the full corpus.
if hasattr(tfidf_matrix, "toarray"):
    tfidf_matrix = tfidf_matrix.toarray()

In [52]:
# Persist the TF-IDF matrix for downstream use.
with open("../data/tfidf_matrix.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_matrix, tfidf_file)