# Text tokenization and vectorization

In [1]:
import sys
sys.path.append("..")

import json
import pandas as pd

from src.dataset import Dataset
from src.parallel_vectorizer_tokenizer import BatchTokenizer, FastTfIdfVectorizer

## Load processed dataset

In [2]:
# Configure the project's Dataset wrapper with the raw JSONL path and the
# path handed to it as `save_path`, then load the documents. `data` is the
# sliceable collection of documents that the following cells iterate over.
raw_path = "../data/raw/data.jsonl"
processed_path = "../data/processed/processed.jsonl"
dataset = Dataset(dataset_path=raw_path, save_path=processed_path)
data = dataset.load_json()

## Create list of texts to tokenize

In [3]:
# Upper bound used by this cell and by the vectorization cell.
# NOTE: here it caps the number of *documents* scanned, not the number of
# opinions. A document may hold several opinions, so texts_list can be longer
# than numMaxOpinions; it is truncated again when it is passed to the vectorizer.
numMaxOpinions = 30000

# Flatten every opinion text of the selected documents into one list,
# iterating over the opinions directly instead of indexing by position.
texts_list = [
    opinion["text"]
    for document in data[:numMaxOpinions]
    for opinion in document["opinions"]
]

## Vectorize text list with spaCy tokenizer

We then boost the weight of important words and save the vectors and the vectorizer to disk.

In [4]:
# Build the vectorizer over at most numMaxOpinions texts, using the project's
# batch tokenizer, and compute the vectors for those texts.
texts_to_vectorize = texts_list[:numMaxOpinions]
tokenizer = BatchTokenizer()
vec = FastTfIdfVectorizer(texts_to_vectorize, tokenizer)
vectors = vec.vectors()

In [5]:
# Boost the weight of the vectorizer's "important words" with multiplier=3.
# The "'<word>' not present" lines in the output are printed by this call.
# NOTE(review): nothing is assigned from the call, so `vectors` is presumably
# modified in place - confirm; if so, re-running this cell compounds the boost.
vec.increaseWeightImportantWords(vectors, multiplier=3)

'ketamina' not present
'carabine' not present
'afroamerican' not present
'cybercrime' not present


In [6]:
# Save the vectors and the vectorizer to disk. No path is passed here, so the
# destination is decided inside save_vectors_vectorizer.
vec.save_vectors_vectorizer(vectors)

In [7]:
# Sanity check on the matrix dimensions: the recorded run shows 30000 rows
# (equal to numMaxOpinions) and 149830 columns.
print(vectors.shape)

(30000, 149830)


## Load data from npy and pickle files

In [8]:
# Reload the vectors and the vectorizer (returned in that order). No path is
# passed, so the location is decided inside load_vectors_vectorizer.
# NOTE: the section header says a pickle file is involved; unpickling can run
# arbitrary code, so only load files produced by this project.
loaded_vectors, loaded_vec = FastTfIdfVectorizer.load_vectors_vectorizer()

In [9]:
# Resolve the column labels. Prefer get_feature_names_out() when the
# vectorizer offers it: if it follows the scikit-learn API, the older
# get_feature_names() is deprecated since 1.0 and removed in 1.2.
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()

# Keep the matrix sparse. At the dimensions of the recorded run
# (30000 x 149830), .toarray() would allocate a dense array of roughly 36 GB
# if the values are float64, which exhausts memory on most machines.
# NOTE(review): assumes loaded_vectors is a SciPy sparse matrix (it exposes
# .toarray()) - confirm.
X = pd.DataFrame.sparse.from_spmatrix(loaded_vectors, columns=feature_names)



In [10]:
# Preview the first five rows; each column is one vocabulary term.
X.head()

Unnamed: 0,0,00,000,0000,00004,0000613,0000620,0000636,0001,0002,...,üs,üsn,üst,üstatural,üstestlerode,üstor,üw,ƒ,а,⅓
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Vocabulary size of the reloaded vectorizer; in the recorded run it equals
# the column count of the vectors matrix (149830).
len(loaded_vec.vocabulary_)

149830