# Text tokenization and vectorization

In [1]:
import sys
# Add the parent directory to the module search path so the `src.*` imports
# below can be resolved.
# NOTE(review): ".." is relative to the kernel's working directory — this
# presumably expects the notebook to be run from a direct subfolder of the
# repository root; confirm.
sys.path.append("..")

import pandas as pd
import numpy as np

from src.tokenizers import DocumentTokenizer
from src.vectorizers import DocumentVectorizer
from src.dataset import Dataset

## Load processed dataset

In [2]:
# Input and output locations, both relative to the kernel's working directory.
raw_path = "../data/raw/data.jsonl"
processed_path = "../data/processed/processed.jsonl"

dataset = Dataset(dataset_path=raw_path, save_path=processed_path)

# `data` is iterated in the next section as a collection of document records.
data = dataset.load_json()

## Create list of texts to tokenize

In [3]:
# Flatten the opinions of every document into a single list of opinion texts,
# keeping document order and, within a document, opinion order.
texts_list = [
    opinion["text"]
    for document in data
    for opinion in document["opinions"]
]

## Vectorize the text list with the spaCy tokenizer

We then save the vectors and the vectorizer to disk.

In [4]:
# Only the first `max_n` texts are vectorized.
max_n = 100

sample_texts = texts_list[:max_n]
tokenizer = DocumentTokenizer()

# NOTE(review): `min_max_df` presumably sets the lower/upper document-frequency
# bounds for keeping a term — confirm against DocumentVectorizer.
dv = DocumentVectorizer(sample_texts, tokenizer, min_max_df=(0.05, 0.95))
vectors = dv.vectors()

# Number of entries in the vectorizer's vocabulary (displayed as the cell output).
len(dv.vectorizer.vocabulary_)

2168

In [5]:
# Write the computed vectors and the vectorizer held by `dv` to disk. No path
# is passed here, so the destination is decided inside DocumentVectorizer.
dv.save_vectors_vectorizer(vectors)

## Load data from npy and pickle files

In [6]:
# Read the saved vectors and vectorizer back in. The call takes no arguments,
# so the source location is whatever DocumentVectorizer uses by default.
# NOTE(review): the section heading says a pickle file is involved — unpickling
# can execute arbitrary code, so only load files you produced or trust.
loaded_vectors, loaded_vec = DocumentVectorizer.load_vectors_vectorizer()

In [7]:
# Build a dense DataFrame with one row per vectorized text and one column per
# vocabulary term.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back to the old method only when
# the newer one is unavailable.
# NOTE(review): assumes `loaded_vec` is a scikit-learn vectorizer — confirm.
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()

# `.toarray()` densifies the loaded matrix; fine for this small sample, but
# memory grows with (number of texts x vocabulary size).
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [8]:
# Preview the first rows of the document-by-term table built above.
X.head()

Unnamed: 0,%,-,1005(c,104,143,2d,3d,4th,5th,6th,...,xiv,year,yet,york,young,zwick,§,’,’s,„
0,0.02997,0.059772,0.0,0.029363,0.0,0.030998,0.005983,0.0,0.0,0.006569,...,0.0,0.040034,0.0,0.0,0.0,0.0,0.112905,0.010685,0.012584,0.0
1,0.026065,0.0,0.0,0.0,0.0,0.066606,0.0,0.0,0.0,0.0,...,0.0,0.0,0.024943,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.004384,0.0,0.016369,0.016369,0.053366,0.0,0.0,0.0,0.0,...,0.0,0.017435,0.0,0.0,0.014253,0.0,0.0,0.011168,0.026307,0.0
3,0.0,0.0,0.0,0.0,0.0,0.027663,0.0,0.0,0.0,0.0,...,0.099661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.023269,0.0,0.048263,0.0,0.101901,0.024586,0.0,0.0,0.0,...,0.0,0.018507,0.0,0.0,0.008405,0.0,0.0,0.015367,0.025855,0.0
