# Import

In [None]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

# Vectorize Corpus

In [None]:
def vectorize_text(text: str, word_vector_dict: dict) -> dict:
    """Map each whitespace-separated token of ``text`` to its vector.

    Tokens are produced by ``str.split()`` with no arguments. A token that
    is missing from ``word_vector_dict`` is mapped to ``None``. Because the
    result is a dict, a token that occurs several times appears only once,
    at the position of its first occurrence.

    Args:
        text: The text to tokenize.
        word_vector_dict: Lookup table from token to vector.

    Returns:
        A dict from each distinct token to its looked-up vector (or ``None``).
    """
    return {token: word_vector_dict.get(token) for token in text.split()}

def vectorize_corpus_with_dots(corpus: list,
                               separator: str,
                               word_vector_dict: dict) -> np.ndarray:
    """Split every document into sentences and vectorize each sentence.

    Args:
        corpus: The documents, each one a string.
        separator: Substring on which every document is split into sentences.
        word_vector_dict: Lookup table handed to ``vectorize_text`` for each
            sentence.

    Returns:
        A 1-D object array with one element per document. Each element is a
        list of dicts with the keys ``'document_index'`` (position of the
        document in ``corpus``), ``'sentence_index'`` (position of the
        sentence in the split document) and ``'sentence_text'`` (the result
        of ``vectorize_text`` for that sentence). Empty sentences are
        skipped, so ``'sentence_index'`` values may have gaps.
    """
    vectorized_corpus = list()
    for text_index, text in enumerate(tqdm(corpus)):
        vectorized_text = [
            {'document_index': text_index,
             'sentence_index': sentence_index,
             'sentence_text': vectorize_text(sentence, word_vector_dict)}
            for sentence_index, sentence in enumerate(text.split(sep=separator))
            if sentence  # splitting can yield empty strings; drop them
        ]
        vectorized_corpus.append(vectorized_text)

    # np.array(list_of_lists, dtype=object) yields a 2-D array whenever all
    # inner lists happen to have the same length, and a 1-D array of lists
    # otherwise. Fill a pre-allocated 1-D object array element by element so
    # the result always has exactly one list per document.
    result = np.empty(len(vectorized_corpus), dtype=object)
    for document_position, document_sentences in enumerate(vectorized_corpus):
        result[document_position] = document_sentences
    return result

In [None]:
# Load the preprocessed corpus table and the word -> vector lookup table.
corpus = pd.read_csv("/Users/yuvlo/Spot-the-bot/Russian/RuPreprocessedWithSep.csv")
# Open the JSON file in a context manager so the handle is closed after
# parsing, and decode it explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open("/Users/yuvlo/Spot-the-bot/Russian/RuWordVectorDict8.json",
          encoding="utf-8") as word_vector_file:
    word_vector_dict = json.load(word_vector_file)

In [None]:
# Vectorize the 'preprocessed_text' column, splitting each document into
# sentences on the ' . ' separator.
vectorized_corpus = vectorize_corpus_with_dots(
    corpus=corpus['preprocessed_text'].tolist(),
    separator=' . ',
    word_vector_dict=word_vector_dict,
)

In [None]:
# Persist the result. np.save appends ".npy" to a file name that lacks it,
# and stores object arrays via pickle, so reading the file back requires
# np.load(..., allow_pickle=True).
np.save(file="RuVectorizedCorpus8", arr=vectorized_corpus)