In [1]:
import pandas as pd
import torch
import pickle
import numpy as np
from gensim.models import Word2Vec  

In [2]:
# Load the raw dataset from disk; the 'label' column of this frame is used later on.
df = pd.read_csv(filepath_or_buffer='../data/text.csv')

In [3]:
# Restore the pre-tokenized documents from their pickle file.
# NOTE: unpickling can execute arbitrary code -- only load pickles you produced yourself.
with open('../data/tokenized_text.pkl', mode='rb') as pickle_file:
    tokenized_text = pickle.load(pickle_file)

In [4]:
# Train a Word2Vec model (100-dimensional vectors, 4 worker threads) on the tokenized corpus.
w2v_model = Word2Vec(tokenized_text, vector_size=100, workers=4)

# Represent every document by the element-wise mean of the vectors of its words.
# Words missing from the trained vocabulary are skipped; a document with no known
# words gets no vector, and its position is recorded in empty_doc_indexes instead.
keyed_vectors = w2v_model.wv
empty_doc_indexes = []
document_vectors = []
for doc_position, tokens in enumerate(tokenized_text):
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    # Guard clause: nothing to average for this document.
    if not known_vectors:
        empty_doc_indexes.append(doc_position)
        continue

    document_vectors.append(np.mean(known_vectors, axis=0))

In [5]:
# Convert labels to a list and drop the labels of documents that produced no vector,
# so that the remaining labels stay positionally aligned with document_vectors.
labels_all = df['label'].values.tolist()

# empty_doc_indexes holds positions within tokenized_text; they only point at the
# right labels if the CSV rows and the tokenized documents line up one-to-one.
# Fail loudly here rather than silently saving misaligned labels.
if len(labels_all) != len(tokenized_text):
    raise ValueError(
        f"Label/document count mismatch: {len(labels_all)} labels vs "
        f"{len(tokenized_text)} tokenized documents"
    )

# Single-pass filter with O(1) set lookups instead of repeated list.pop(),
# which shifts the tail of the list on every removal.
empty_doc_index_set = set(empty_doc_indexes)
labels_list = [
    label
    for position, label in enumerate(labels_all)
    if position not in empty_doc_index_set
]

In [6]:
# Labels: Python list -> tensor
labels_tensor = torch.tensor(data=labels_list)

# Document vectors: list of per-document arrays -> one numpy array -> tensor
document_vectors_np = np.array(document_vectors)
document_vectors_tensor = torch.tensor(data=document_vectors_np)

In [7]:
# Persist both tensors (document embeddings first, then labels) next to the input data.
for tensor_to_save, output_path in (
    (document_vectors_tensor, '../data/document_vectors_emb_tensor.pt'),
    (labels_tensor, '../data/labels_emb_tensor.pt'),
):
    torch.save(tensor_to_save, output_path)