In [1]:
import gzip, json
def parse(path):
  """Lazily yield one decoded JSON object per line of a gzipped JSON-lines file.

  Args:
    path: Location of the gzip-compressed file; each non-blank line must hold
      one complete JSON document.

  Yields:
    The Python object produced by ``json.loads`` for each non-blank line, in
    file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is released when the generator
  # is exhausted, closed early, or garbage-collected mid-iteration.
  with gzip.open(path, 'rb') as handle:
    for line in handle:
      # Tolerate blank separator/trailing lines instead of failing in json.loads
      if not line.strip():
        continue
      yield json.loads(line)

# Materialize every review record from the gzipped dump into an in-memory list
data = list(parse("Software.json.gz"))

In [2]:
# One string per review: the summary followed by the review body, joined by a
# single space; a field that is absent from a record is simply left out.
documents = [
    " ".join(
        record[field]
        for field in ('summary', 'reviewText')
        if field in record
    )
    for record in data
]

In [3]:
import numpy as np
# Read the training-split row indices from the feature archive. np.load on an
# .npz returns a lazily-read archive object that holds an open file handle, so
# use it as a context manager to release the handle once the array is in memory.
with np.load('non-text-feature.npz') as split_archive:
    indices_train = split_archive['indices_train']

# Select the training documents by position (used later to fit the vectorizers)
train_documents = [documents[i] for i in indices_train]

In [5]:
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents only, then encode every
# document (train and non-train alike) with that fixed vocabulary.
bow_vectorizer = CountVectorizer().fit(train_documents)
bag_of_words = bow_vectorizer.transform(documents)

# Store the sparse count matrix on disk
sparse.save_npz("text-bag_of_words.npz", bag_of_words)

In [8]:
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit vocabulary and IDF weights on the training documents only, then encode
# every document with the fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer().fit(train_documents)
tfidf_matrix = tfidf_vectorizer.transform(documents)

# Store the sparse TF-IDF matrix on disk
sparse.save_npz("text-tfidf.npz", tfidf_matrix)

In [4]:
import fasttext
import fasttext.util
import numpy as np
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
from functools import lru_cache

# Load the pretrained FastText model from a local binary file.
# NOTE(review): this was previously described as a "reduced" model, but the
# file name and the 300-column shape printed further down suggest the vectors
# are 300-dimensional — confirm whether any reduction was actually applied.
ft = fasttext.load_model('cc.en.300.bin')

# Preprocessing and tokenization: lowercase each document, delete every
# character in string.punctuation, then split into tokens with NLTK.
# The deletion table is identical for all documents, so build it once here
# rather than once per document inside the comprehension.
_punctuation_table = str.maketrans('', '', string.punctuation)
processed_docs = [
    word_tokenize(doc.lower().translate(_punctuation_table))
    for doc in documents
]


@lru_cache(maxsize=10000)
def get_word_vector(word):
    """Return the FastText vector for ``word``, memoized across calls.

    The cache keeps at most 10000 distinct words and evicts the least recently
    used entry beyond that. A cache hit hands back the very same object that
    the first lookup produced, so callers should treat the result as read-only.
    """
    return ft.get_word_vector(word)

# Function to get the vector for a document
def get_doc_vector(doc):
    """Return the mean word vector of a tokenized document.

    Args:
        doc: Iterable of token strings.

    Returns:
        A 1-D array of length ``ft.get_dimension()``: the element-wise mean of
        the tokens' word vectors, or all zeros when the document has no tokens.
    """
    vectors = np.array([get_word_vector(word) for word in doc])
    if vectors.size == 0:
        # fastText word vectors are float32; a default float64 zero vector here
        # would silently upcast the whole stacked document matrix to float64
        # as soon as a single document is empty.
        return np.zeros(ft.get_dimension(), dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every tokenized document (with a progress bar) and collect the
# per-document vectors into a single array
word2vec = np.array(list(map(get_doc_vector, tqdm(processed_docs))))

# word2vec now holds one vector per document, in the order of processed_docs

100%|██████████| 459436/459436 [00:15<00:00, 28900.34it/s]


In [5]:
# Inspect the embedding matrix dimensions (one row per document)
word2vec.shape

(459436, 300)

In [7]:
# Persist the document embeddings in a compressed archive under the key 'word2vec'
np.savez_compressed('text-feature_word2vec.npz', word2vec=word2vec)