# Text tokenization and vectorization

In [None]:
import sys

import numpy as np
import pandas as pd

# Make the repository root importable before loading the local `src` package.
sys.path.append("..")

from src.dataset import Dataset
from src.tokenizers import BatchTokenizer
from src.tokenizers import DocumentTokenizer
from src.vectorizers import DocumentVectorizer

## Load processed dataset

In [None]:
# Configure the loader with the raw JSONL input and the path for the processed copy.
dataset = Dataset(
    dataset_path="../data/raw/data.jsonl",
    save_path="../data/processed/processed.jsonl",
)

# `data` is iterated document by document further down in the notebook.
data = dataset.load_json()

## Create list of texts to tokenize

In [None]:
# Flatten every opinion text of every document into one list, keeping
# document order and, within a document, opinion order.
# Iterating the opinions directly replaces the index-based range(len(...)) loop.
texts_list = [
    opinion["text"]
    for document in data
    for opinion in document["opinions"]
]

## Tokenize in batch
The tokenizer processes the documents in parallel using spaCy's `pipe` method.

In [None]:
# Batch tokenizer built with its default settings.
bt = BatchTokenizer()

# Only the first 1000 texts are tokenized here.
tokens = bt.tokenize(
    texts_list[:1000]
)

## Vectorize the text list with the spaCy tokenizer

We then save the vectors and the vectorizer to disk.

In [None]:
# Number of texts (taken from the start of texts_list) handed to the vectorizer.
max_n = 100

# NOTE(review): min_max_df presumably holds the (min_df, max_df) document-frequency
# bounds forwarded to the underlying vectorizer; confirm in src.vectorizers.
dv = DocumentVectorizer(
    texts_list[:max_n],
    DocumentTokenizer(),
    min_max_df=(0.05, 0.95),
)
vectors = dv.vectors()

# Cell output: number of entries in the fitted vocabulary.
len(dv.vectorizer.vocabulary_)

In [None]:
# Save the computed vectors together with the vectorizer to disk.
# The destination is not passed here, so it is whatever the method uses by default.
dv.save_vectors_vectorizer(vectors)

## Load data from npy and pickle files

In [None]:
# Reload the saved vectors and vectorizer (the npy and pickle files named in the
# section heading). Called on the class itself with no arguments, so the file
# locations are whatever the method uses by default.
loaded_vectors, loaded_vec = DocumentVectorizer.load_vectors_vectorizer()

In [None]:
# Build a dense DataFrame with one column per vocabulary term.
# NOTE(review): assumes loaded_vec is a scikit-learn text vectorizer; confirm.
# scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to the
# old one so the cell runs on both older and newer releases.
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()

# toarray() densifies the loaded matrix; fine for the small sample used here.
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [None]:
# Preview the first rows of X as the cell output.
X.head()