# Text tokenization and vectorization

In [1]:
import sys
sys.path.append("..")

import json
from pprint import pprint
import numpy as np
import pandas as pd
import pickle

from src.dataset import Dataset
from src.tokenizers import SpacyTokenizer
from src.vectorizers import TfIdfVectors

## Load processed dataset

In [2]:
# Configure the project's Dataset with the raw JSONL path and the save path
# for the processed file, then load the records via load_json().
dataset = Dataset(
    dataset_path="../data/raw/data.jsonl",
    save_path="../data/processed/processed.jsonl",
)
data = dataset.load_json()

## Create list of texts to tokenize

In [3]:
# Flatten the corpus: one entry per opinion, taking the "text" field of every
# item in each document's "opinions" collection, in document order.
texts_list = [
    opinion["text"]
    for document in data
    for opinion in document["opinions"]
]

## Vectorize the text list with the spaCy tokenizer

Only the first 500 texts are vectorized. We then save the vectors and the vectorizer to disk.

In [4]:
# Only the first 500 texts are passed to the vectorizer wrapper.
sample_texts = texts_list[:500]
vec = TfIdfVectors(sample_texts, SpacyTokenizer())
vectors = vec.vectors()

# Hand the computed vectors back to the wrapper for saving; no path is passed
# here, so the destination is decided inside save_vectors_vectorizer.
vec.save_vectors_vectorizer(vectors)

## Load the saved vectors and vectorizer from the npy and pickle files

In [5]:
# Load the vectors and the vectorizer back in one call. No paths are passed, so
# the method's own defaults are used -- presumably the files written by
# save_vectors_vectorizer in the previous cell (TODO confirm).
# NOTE(review): the section heading mentions pickle files; unpickling can run
# arbitrary code, so only load files produced by this project.
loaded_vectors, loaded_vec = TfIdfVectors.load_vectors_vectorizer()

In [6]:
# Build a dense document-by-term DataFrame with one column per vocabulary entry.
# Assumes loaded_vec is a scikit-learn vectorizer (TODO confirm): there,
# get_feature_names() was deprecated in 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so prefer the new method and fall back to the old
# one only when the new one is missing.
if hasattr(loaded_vec, "get_feature_names_out"):
    feature_names = loaded_vec.get_feature_names_out()
else:
    feature_names = loaded_vec.get_feature_names()

# toarray() densifies the loaded matrix; acceptable for the small sample used
# here, but memory grows with rows x vocabulary size.
X = pd.DataFrame(loaded_vectors.toarray(), columns=feature_names)

In [7]:
# Preview the first rows of the term matrix as a quick sanity check of the
# vocabulary columns and weights.
X.head()

Unnamed: 0,%,','bodily,+,",.92",-,-'that,-1(e,-ed,-ing,...,•10,•2,•3,•4,•5,•appellant,•hquitable,•recovery,■,❖
0,0.023458,0.0,0.0,0.0,0.0,0.044684,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.017973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.003243,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.018463,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
