# Text tokenization and vectorization

In [2]:
import sys
sys.path.append("..")

import json
import pandas as pd

from src.dataset import Dataset
from src.parallel_vectorizer_tokenizer import BatchTokenizer, FastTfIdfVectorizer

## Load processed dataset

In [3]:
# Configure the project's Dataset helper with the source JSONL file and a
# save_path (presumably where the processed copy is written - confirm in
# src/dataset.py), then load the records that the next cell iterates over.
dataset = Dataset(
    dataset_path="../data/raw/data.jsonl",
    save_path="../data/processed/processed.jsonl",
)
data = dataset.load_json()

## Create list of texts to tokenize

In [4]:
# Flatten the "text" field of every opinion of every document into one list,
# keeping document order and, within a document, opinion order.
# Iterates the opinions directly instead of indexing through range(len(...));
# assumes document["opinions"] is a list-like sequence (it was indexed by
# position before) - confirm against the dataset schema.
texts_list = [opinion["text"] for document in data for opinion in document["opinions"]]

## Vectorize text list with spaCy tokenizer

We then save the vectors and the vectorizer to disk.

In [5]:
# Build the vectorizer from only the first 20 texts (texts_list[:20]), handing
# it a BatchTokenizer instance, and compute the vectors.
vec = FastTfIdfVectorizer(texts_list[:20], BatchTokenizer())
vectors = vec.vectors()
# NOTE(review): the recorded run of this cell ended with KeyError: 'weed' after
# printing the list of important words. Presumably a boosted word is missing
# from the vocabulary built from such a small sample - confirm in
# FastTfIdfVectorizer.increaseWeightImportantWords before relying on this step.
vec.increaseWeightImportantWords(vectors)

[['cannabis'], ['weed'], ['cocaine'], ['methamphetamine'], ['drugs'], ['marijuana'], ['mdma'], ['lsd'], ['ketamina'], ['heroin'], ['fentanyl'], ['narcotics'], ['weapons'], ['gun'], ['knife'], ['weapon'], ['firearm'], ['rifle'], ['carabine'], ['shotgun'], ['assaults'], ['sword'], ['blunt'], ['investigation'], ['gang'], ['mafia'], ['serial'], ['killer'], ['rape'], ['thefts'], ['recidivism'], ['arrest'], ['ethnicity'], ['caucasian'], ['afroamerican'], ['hispanic'], ['robbery'], ['cybercrime']]


KeyError: 'weed'

In [None]:
# Save the computed vectors and the vectorizer to disk via the vectorizer's own helper.
vec.save_vectors_vectorizer(vectors)

In [None]:
# Show the dimensions of the computed vectors
# (presumably documents x features - confirm).
print(vectors.shape)

## Load data from npy and pickle files

In [None]:
# Reload the saved pair; the call is unpacked as (vectors, vectorizer).
loaded_vectors, loaded_vec = FastTfIdfVectorizer.load_vectors_vectorizer()

In [None]:
# Build a DataFrame from the loaded vectors, labelling the columns with the
# vectorizer's feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the newer accessor when the loaded
# object provides it and fall back to the original call otherwise.
# (Assumes loaded_vec follows the scikit-learn vectorizer API - confirm.)
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()
# NOTE(review): toarray() presumably densifies a sparse matrix; fine for a small
# sample, but it may be memory-hungry on the full corpus.
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Number of entries in the loaded vectorizer's vocabulary_ attribute.
len(loaded_vec.vocabulary_)