# Text tokenization and vectorization

In [9]:
import sys
sys.path.append("..")

import json
import pandas as pd

from src.dataset import Dataset
from src.parallel_vectorizer_tokenizer import BatchTokenizer, FastTfIdfVectorizer

## Load processed dataset

In [2]:
# Point the Dataset helper at the raw JSONL input and at the location used for
# the processed copy, then read the documents into memory.
# NOTE(review): which of the two paths load_json() actually reads is not
# visible from this notebook — confirm against src/dataset.py.
dataset = Dataset(
    dataset_path="../data/raw/data.jsonl",
    save_path="../data/processed/processed.jsonl",
)
data = dataset.load_json()

## Create list of texts to tokenize

In [3]:
# Flatten the corpus: collect the "text" of every opinion of every document,
# preserving document order and, within a document, opinion order.
# Iterating the opinions directly replaces the range(len(...)) index loop.
texts_list = [
    opinion["text"]
    for document in data
    for opinion in document["opinions"]
]

## Vectorize text list with spaCy tokenizer

Only the first 10 texts are vectorized here. We then save the vectors and the vectorizer to disk.

In [4]:
# Vectorize only the first 10 texts, using a BatchTokenizer for tokenization.
sample_texts = texts_list[:10]
batch_tokenizer = BatchTokenizer()
vec = FastTfIdfVectorizer(sample_texts, batch_tokenizer)

# Compute the document vectors for the sample.
vectors = vec.vectors()

In [5]:
# Persist the computed vectors through the vectorizer's own save helper.
# NOTE(review): the output locations are chosen inside
# save_vectors_vectorizer() and are not visible here — confirm before relying
# on a specific path.
vec.save_vectors_vectorizer(vectors)

In [6]:
# Sanity check: show the dimensions of the vector matrix.
print(vectors.shape)

(10, 4283)


## Load data from npy and pickle files

In [7]:
# Reload the previously saved vectors and vectorizer; the call takes no
# arguments, so the file locations are decided inside the helper.
# NOTE(review): the section heading mentions pickle files — unpickling can
# execute arbitrary code, so only load files produced by this project.
loaded_vectors, loaded_vec = FastTfIdfVectorizer.load_vectors_vectorizer()

In [10]:
# Build a dense DataFrame with one column per feature name.
# scikit-learn deprecated `get_feature_names()` in 1.0 and removed it in 1.2 in
# favour of `get_feature_names_out()`, so prefer the new accessor when it is
# available and fall back to the old one otherwise.
# NOTE(review): assumes loaded_vec follows the scikit-learn vectorizer API —
# confirm against FastTfIdfVectorizer.load_vectors_vectorizer().
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()

# .toarray() materializes the full dense matrix; fine for this small sample,
# but memory use grows with documents x vocabulary size.
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [12]:
# Preview the first rows of the feature matrix (head() defaults to 5 rows).
X.head()

Unnamed: 0,1,10,100,1002,1005,101,102,103,104,1048,...,yes,yet,young,younger,youth,zant,zehnder,zero,áre,éxcluded
0,0.116253,0.284674,0.0,0.0,0.0,0.0,0.0,0.0,0.014888,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.063801,0.234347,0.026258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.019529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.035741,0.015148,0.0,0.0,0.0,0.013578,0.0,0.005049,0.008978,0.0,...,0.005771,0.0,0.013578,0.0,0.006789,0.006789,0.0,0.0,0.0,0.006789
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209243,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.054851,0.0,0.0,0.004105,0.003489,0.0,0.006978,0.003053,0.037996,0.0,...,0.013957,0.003053,0.0,0.008209,0.0,0.0,0.0,0.004105,0.0,0.0
