# Text tokenization and vectorization

In [1]:
import sys
sys.path.append("..")

import json
import pandas as pd

from src.dataset import Dataset
from src.parallel_vectorizer_tokenizer import BatchTokenizer, FastTfIdfVectorizer

## Load processed dataset

In [2]:
# Point the project Dataset wrapper at the raw input file and at the
# location used for the processed output.
dataset = Dataset(
    dataset_path="../data/raw/data.jsonl",
    save_path="../data/processed/processed.jsonl",
)
# `data` is iterated document-by-document in the following cell.
data = dataset.load_json()

## Create list of texts to tokenize

In [3]:
# Flatten every opinion text of every document into a single list.
# Iterating the opinions directly avoids the index-based range(len(...)) loop.
texts_list = [
    opinion["text"]
    for document in data
    for opinion in document["opinions"]
]

## Vectorize text list with spaCy tokenizer

We then save the vectors and the vectorizer to disk.

In [4]:
# Only the first 300 texts are passed to the vectorizer, together with a
# fresh BatchTokenizer instance.
vec = FastTfIdfVectorizer(
    texts_list[:300],
    BatchTokenizer(),
)
# Matrix produced by the vectorizer for those texts.
vectors = vec.vectors()

In [5]:
# Persist the computed vectors along with the vectorizer. No path is passed,
# so the destination is decided inside save_vectors_vectorizer (not visible here).
vec.save_vectors_vectorizer(vectors)

In [6]:
# Sanity check: in the recorded run this prints (300, 7302), i.e. one row per
# input text; the column count matches the vocabulary size shown in the last cell.
print(vectors.shape)

(300, 7302)


## Load data from npy and pickle files

In [7]:
# Reload both artifacts through the class-level loader. It is called without
# arguments, so the file locations come from the loader itself.
# NOTE(review): the heading mentions pickle files — only load files produced
# by a trusted run of this notebook.
loaded_vectors, loaded_vec = FastTfIdfVectorizer.load_vectors_vectorizer()

In [8]:
# Build a dense DataFrame with one column per vocabulary term.
# NOTE(review): loaded_vec is presumably a scikit-learn style vectorizer —
# confirm. get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so prefer get_feature_names_out() and fall back to the legacy
# method only when the newer one is unavailable.
feature_names = (
    loaded_vec.get_feature_names_out()
    if hasattr(loaded_vec, "get_feature_names_out")
    else loaded_vec.get_feature_names()
)
# .toarray() materialises the full matrix in memory; fine for the 300-row
# sample used here.
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [9]:
# Preview the first rows of the term-weight table.
X.head()

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1005,1006,...,yield,york,young,youth,zion,zollar,zoning,zurich,zwick,á
0,0.0,0.098035,0.193625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.051756,0.153331,0.018145,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.034857,0.011915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.014038,0.00986,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.054394,0.0,0.0,0.0,0.0,0.006219,0.0,0.004488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Vocabulary size of the reloaded vectorizer; 7302 in the recorded run,
# matching the column count printed for `vectors` earlier.
len(loaded_vec.vocabulary_)

7302