```
Let's build the bag of words to be used for topic modelling.
According to the BigARTM documentation, a dataset in the UCI bag-of-words format is suitable as input,
so we convert our own dataset into bag-of-words vectors in the same format as the UCI one.

For each text collection, D is the number of documents, W is the
number of words in the vocabulary, and N is the total number of words
in the collection (below, NNZ is the number of nonzero counts in the
bag-of-words). After tokenization and removal of stopwords, the
vocabulary of unique words was truncated by only keeping words that
occurred more than ten times.

These data sets are ideal
for clustering and topic modeling experiments.

For each text collection we provide docword.*.txt (the bag of words
file in sparse format) and vocab.*.txt (the vocab file).

```

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import numpy as np
import json
import random
import string
import re
import nltk, razdel
from nltk.corpus import stopwords
from pymystem3 import Mystem

# Download the NLTK resources used in this notebook.
# 'stopwords' backs the `stopwords.words('russian')` call made later on.
nltk.download('stopwords')
# NOTE(review): no POS tagging is visible in this part of the notebook --
# confirm the tagger model is still needed before keeping this download.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /home/aliak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aliak/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
import bigjson
def query_big_json(article_id, data_path='data_file.json'):
    """Return the category of one article stored in the JSON corpus.

    The corpus file is opened in binary mode and handed to ``bigjson.load``;
    the entry is looked up under the string form of ``article_id``.

    Parameters
    ----------
    article_id : int or str
        Identifier of the article; converted with ``str()`` to form the key.
    data_path : str, optional
        Path of the JSON corpus. Defaults to ``'data_file.json'``.

    Returns
    -------
    tuple
        ``(article_id, category)`` with ``article_id`` exactly as passed in.
    """
    with open(data_path, 'rb') as data_set:
        data = bigjson.load(data_set)
        # Only the category is returned, so the article body is not read.
        # The lookup must happen while the file is still open.
        category = data[str(article_id)]['category']

    return (article_id, category)

In [18]:
# Read the corpus, flip it so that the top-level JSON keys become rows,
# then discard the first two rows.
# NOTE(review): the reason for skipping the first two rows is not visible
# here -- presumably they are not regular articles; confirm against the data.
df = pd.read_json('data_file.json').transpose()[2:]

In [19]:
# Work on a small sample: the bodies of the first 10 rows of `df`.
articles = list(df['body'][0:10])

# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# always yields the same document order (and hence the same docIDs in the
# docword file) without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(articles)

In [25]:
# Each ASCII punctuation character as its own one-character entry.
punctuations = [*string.punctuation]
# NLTK's Russian stop words followed by the punctuation entries.
stop_words_ru = [*stopwords.words('russian'), *punctuations]

def preprocess_text(document):
    """Clean, tokenize and lemmatize a collection of raw texts.

    Parameters
    ----------
    document : iterable
        The raw texts to process; each item is converted with ``str()``.

    Returns
    -------
    list of str
        One string per input text, in input order, made of the kept lemmas
        joined by single spaces. A lemma is kept when it is not purely
        digits, is not in ``stop_words_ru`` and is longer than 3 characters.
    """
    # The lemmatizer does not depend on the text being processed, so build it
    # once instead of once per text.
    stemmer = Mystem()
    # Set membership is O(1); the list lookup was repeated for every token.
    stop_words = set(stop_words_ru)

    articles = []
    for raw_text in document:
        # Replace every non-word character with a space
        text = re.sub(r'\W', ' ', str(raw_text))

        # Remove stand-alone single Latin letters
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

        # Remove a single Latin letter at the very start of the text.
        # (The previous pattern escaped the caret and so looked for a literal
        # '^', which the \W substitution above has already removed.)
        text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

        # Collapse runs of whitespace into one space
        text = re.sub(r'\s+', ' ', text)

        # Lowercase before lemmatization and stop-word matching
        text = text.lower()

        # Tokenize each whitespace-separated chunk with razdel
        tokens = [token.text
                  for chunk in text.split()
                  for token in razdel.tokenize(chunk)]

        # Keep the first lemma Mystem returns for each token
        lemmas = [stemmer.lemmatize(word)[0] for word in tokens]

        kept = [word for word in lemmas
                if not word.isdigit()
                and word not in stop_words
                and len(word) > 3]
        articles.append(" ".join(kept))

    return articles

In [26]:
# Upper bound on the vocabulary size (used as CountVectorizer's max_features)
n_features = 1000
# Number of top words to be extracted for a single topic
n_top_words = 20
# One component per distinct value in the `category` column
n_components = len(set(df['category']))

In [83]:
def count_vectorize(articles, vocab_path="vocab.rbc.txt", docword_path="docword.rbc.txt"):
    """Vectorize the articles and write them out in UCI bag-of-words format.

    The texts are run through ``preprocess_text`` and counted with a
    ``CountVectorizer``. Two files are then written:

    * ``vocab_path``   -- one vocabulary term per line; line ``i`` (1-based)
      is the term with wordID ``i``.
    * ``docword_path`` -- three header lines (number of documents D,
      vocabulary size W, number of non-zero counts NNZ) followed by one
      ``docID wordID count`` triple per non-zero entry, with 1-based ids.

    Parameters
    ----------
    articles : iterable
        Raw texts, passed unchanged to ``preprocess_text``.
    vocab_path : str, optional
        Output path of the vocabulary file. Defaults to ``"vocab.rbc.txt"``.
    docword_path : str, optional
        Output path of the counts file. Defaults to ``"docword.rbc.txt"``.

    Returns
    -------
    None
    """
    corpus = preprocess_text(articles)

    vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        max_features=n_features,
        stop_words=stop_words_ru,
        ngram_range=(1, 1))

    token_count_matrix = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()

    # The vocabulary is Cyrillic, so pin the encoding instead of relying on
    # the platform default; `with` closes the file even if a write fails.
    with open(vocab_path, "w", encoding="utf-8") as vocab:
        vocab.writelines(f"{feature}\n" for feature in features)

    cx = token_count_matrix.tocoo()
    n_docs, n_words = cx.shape
    with open(docword_path, "w", encoding="utf-8") as docword:
        # UCI header: D, W and NNZ on their own lines, before the triples.
        docword.write(f"{n_docs}\n{n_words}\n{cx.nnz}\n")
        for docID, wordID, wordCount in zip(cx.row, cx.col, cx.data):
            # ids are shifted to be 1-based, as the UCI format expects
            docword.write(f"{docID + 1} {wordID + 1} {wordCount}\n")

In [84]:
# Write vocab.rbc.txt and docword.rbc.txt for the sampled articles.
count_vectorize(articles)