In [1]:
import pandas as pd
import numpy as np
import nltk

import re #регулярные выражения
import math
from collections import Counter
import requests
import time

alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'

nltk.download('stopwords') # to use stopwords
nltk.download('punkt') # to use word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wequalwo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wequalwo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# служебные функция для расчета памяти, заниаемой dataframe и csr
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df):
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")
    
def get_csr_memory_usage(matrix):
    mem = (matrix.data.nbytes + matrix.indptr.nbytes + matrix.indices.nbytes) * BYTES_TO_MB_DIV
    print("Memory usage is " + str(mem) + " MB")

In [3]:
# десмонтсранция идеи токенизации и стоп-слов
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as stopwords
from nltk.tokenize import word_tokenize

example_sent = """Это простое предложение показывает фильтрацию на стоп-слова"""
 
stop_words = set(stopwords.words('russian'))
word_tokens = word_tokenize(example_sent)
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

from nltk.corpus import stopwords
print("Стоп-слова русского языка:")
print(stopwords.words('russian'))

['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'на', 'стоп-слова']
['Это', 'простое', 'предложение', 'показывает', 'фильтрацию', 'стоп-слова']
Стоп-слова русского языка:
['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'мож

## Создадим функции, позволяющие генерировать следующие ошибки в словах:

1.   **Пропуск буквы:** ${резонанс - рзонанс}$
2.   **Дублирование буквы:** ${резонанс - реезонанс}$
3.   **Перестановка букв:** ${резонанс - рзеонанс}$
4.   **Опечатка:** ${резонанс - ркзонанс}$
5.   **Некоторые орфографические ошибки:**  ${резонанс - ризонанс}$


In [4]:
# все ошибки на расстоянии 2 (расстояние Левинштейна)
def distance2(word):
    return {e2 for e1 in distance1(word) if e1 for e2 in distance1(e1)}

# все ошибки на расстоянии 1 (расстояние Левинштейна)
def distance1(word):
    pairs      = splits(word)
    #transposes = [a+b[1]+b[0]+b[2:]  for (a,b) in pairs if len(b)>1]                           # перестановки
    replaces   = [a+c+b[1:]          for (a,b) in pairs if b for c in replaces_set[b[0]] if b] # замены: 
                                                                                                        # опечатки, 
                                                                                                        # пропуски и дублирования букв, 
                                                                                                        # орфографические ошибки

    last_replaces = [word[0:-1] + c for c in replaces_set[word[-1]]]                           # замены в конце слова
    #return set(transposes + replaces + last_replaces)
    return set(replaces + last_replaces)

def splits(word):
    return [(word[:i], word[i:])
            for i in range (len(word)+1)]

# список возможных замен в слове
replaces_set = pd.Series([          
               ['с', 'в', 'у', 'к', 'е', 'п', 'м', 'аа', ''],
               ['ь', 'о', 'л', 'д', 'ю', 'бб', ''],
               ['ч', 'ы', 'ц', 'у', 'к', 'а', 'с', 'вв', ''],
               ['р', 'н', 'ш', 'л', 'о', 'гг', ''],
               ['б', 'л', 'з', 'ж', 'ю', 'дд', ''],
               ['а', 'к', 'н', 'р', 'п', 'ее', 'ё', 'и', ''],
               ['ёё', 'й', 'е', ''],
               ['ю', 'д', 'щ', 'з', 'х', 'э', 'жж', ''],
               ['д', 'щ', 'х', 'э', 'ж', 'зз', 'с', ''],
               ['м', 'а', 'п', 'р', 'т', 'ии', ''],
               ['ц', 'ы', 'ф', 'йй',''],
               ['а', 'в', 'у', 'е', 'п', 'а', 'кк', ''],
               ['ь', 'о', 'г', 'ш', 'щ', 'д', 'б', 'лл', ''],
               ['с', 'а', 'п', 'и', 'мм', ''],
               ['р', 'п', 'е', 'г', 'о', 'нн', ''],
               ['т', 'р', 'н', 'г', 'ш', 'л', 'ь', 'оо', ''],
               ['м', 'а', 'к', 'е', 'н', 'р', 'и', 'пп', ''],
               ['и', 'п', 'е', 'н', 'г', 'о', 'т', 'рр', 'р', ''],
               ['ч', 'в', 'а', 'м', 'сс', ''],
               ['и', 'п', 'р', 'о', 'ь', 'тт', ''],
               ['в', 'ы', 'ц', 'к', 'а', 'уу', ''],
               ['я', 'ч', 'ы', 'ц', 'й', 'фф', ''],
               ['ж', 'з', 'ъ', 'э', 'хх', ''],
               ['ы', 'ф', 'й', 'у', 'в', 'ы', 'цц', ''],
               ['я', 'ф', 'ы', 'в', 'с', 'чч', ''],
               ['л', 'о', 'г', 'щ', 'д', 'шш', 'шь', ''],
               ['л', 'ш', 'з', 'ж', 'д', 'щщ', 'шь', 'ж', ''],
               ['э', 'х', 'ъъ', 'ь', ''],
               ['ч', 'я', 'ф', 'й', 'ц', 'у', 'в', 'ыы', ''],
               ['т', 'о', 'л', 'б', 'ьь', ''],
               ['ж', 'з', 'х', 'ъ', ''],
               ['б', 'д', 'ж', 'юю', ''],
               ['ф', 'ы', 'ч', 'яя', ''],
               ], 
               index = ['а','б','в','г', 'д', 'е', 'ё', 
                            'ж', 'з', 'и', 'й', 'к', 
                            'л', 'м', 'н', 'о', 'п', 
                            'р', 'с', 'т', 'у', 'ф', 
                            'х', 'ц', 'ч', 'ш', 'щ', 
                            'ъ', 'ы', 'ь', 'э', 'ю', 'я'])


**Импортируем текст**

In [5]:
import codecs
with codecs.open('top10000.txt', 'r', encoding = 'utf-8') as file:
# with codecs.open('russian.txt', 'r', encoding = 'windows 1251') as file:
    TEXT = file.read().replace('\n', ' ') # для текста, в котором слова разделены '\n'
    # TEXT = file.read().replace(' ', ' ') # для текста, в котором слова разделены ' '
def tokens(text):
    return re.findall(r'[а-ё]+', text.lower())
list_of_words = tokens(TEXT)

# удалим все слова, короче 3 символов (потому что слова из 2х символов неинформативны):
tmp = [w for w in list_of_words if len(w) > 2]
list_of_words = tmp

# пока ограничимся 200 словами
# del list_of_words[60000:]

# выведем первые 20 слов
print(list_of_words[:20])
print(len(list_of_words))

['что', 'тот', 'быть', 'весь', 'это', 'как', 'она', 'они', 'так', 'сказать', 'этот', 'который', 'может', 'человек', 'один', 'еще', 'такой', 'только', 'себя', 'свое']
8714


**Для каждого слова рассчитаем возможыне ошибки на расстоянии 1 или 2. Затем произведем токенизацию**

In [19]:
# words = []
corpus = [] # все сгенерированные слова
labels = [] # метки для классификации в порядке ошибочных слов



start = time.time()

for item in list_of_words:
    errors = distance1(item)#.union(distance2(item))# ??? как оптимизировать - неясно
    errors = errors.union(item)
    # words.append([item, errors])
    #tmp = [w for w in errors if len(w) > 2]
    #errors = tmp
    corpus += list(errors)
    labels += [item]*len(errors)

end = time.time()
duration = round(end - start, 2)
print("Set generation: " + str(duration) + " secs")
print("    Total size:", len(labels))

Set generation: 24.02 secs
    Total size: 11121024


In [7]:
# токенизация слова на буквы
'''
def _tokenize(word):
    return [a + a for a in word]
'''
'''
def _tokenize(word):
    return re.findall(r'[а-ё]', word.lower())

'''
def _tokenize(word):
    out = []
    for i in range(len(word)):
        out.append(word[i] + str(i))
    return out



_tokenize('резонанс')

['р0', 'е1', 'з2', 'о3', 'н4', 'а5', 'н6', 'с7']

**Векторизация текста посредством *TfidfVectorizer (TF-IDF)***

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer = lambda x: _tokenize(x))

start = time.time()
V = vectorizer.fit_transform(corpus)
end = time.time()
duration = round(end - start, 2)
print("Vectorization: " + str(duration) + " secs")

print("Найденные токены:")
print(vectorizer.get_feature_names_out())

#3.3

Vectorization: 4.06 secs
Найденные токены:
['а0' 'а1' 'а10' 'а11' 'а2' 'а3' 'а4' 'а5' 'а6' 'а7' 'а8' 'а9' 'б0' 'б1'
 'б10' 'б11' 'б12' 'б2' 'б3' 'б4' 'б5' 'б6' 'б7' 'б8' 'б9' 'в0' 'в1' 'в10'
 'в11' 'в12' 'в13' 'в2' 'в3' 'в4' 'в5' 'в6' 'в7' 'в8' 'в9' 'г0' 'г1' 'г10'
 'г11' 'г12' 'г2' 'г3' 'г4' 'г5' 'г6' 'г7' 'г8' 'г9' 'д0' 'д1' 'д2' 'д3'
 'д4' 'д5' 'д6' 'д7' 'д8' 'д9' 'е0' 'е1' 'е10' 'е11' 'е12' 'е2' 'е3' 'е4'
 'е5' 'е6' 'е7' 'е8' 'е9' 'ж0' 'ж1' 'ж2' 'ж3' 'ж4' 'ж5' 'ж6' 'ж7' 'ж8'
 'з0' 'з1' 'з2' 'з3' 'з4' 'з5' 'з6' 'з7' 'и0' 'и1' 'и10' 'и11' 'и2' 'и3'
 'и4' 'и5' 'и6' 'и7' 'и8' 'и9' 'й0' 'й1' 'й10' 'й11' 'й12' 'й13' 'й14'
 'й15' 'й2' 'й3' 'й4' 'й5' 'й6' 'й7' 'й8' 'й9' 'к0' 'к1' 'к10' 'к11' 'к2'
 'к3' 'к4' 'к5' 'к6' 'к7' 'к8' 'к9' 'л0' 'л1' 'л10' 'л11' 'л12' 'л2' 'л3'
 'л4' 'л5' 'л6' 'л7' 'л8' 'л9' 'м0' 'м1' 'м10' 'м2' 'м3' 'м4' 'м5' 'м6'
 'м7' 'м8' 'м9' 'н0' 'н1' 'н10' 'н11' 'н12' 'н13' 'н2' 'н3' 'н4' 'н5' 'н6'
 'н7' 'н8' 'н9' 'о0' 'о1' 'о10' 'о11' 'о12' 'о13' 'о2' 'о3' 'о4' 'о5' 'о6'
 '

***bool* - векторизация:**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_bin = CountVectorizer(binary = True, tokenizer = lambda x: _tokenize(x))
V_bin = vectorizer_bin.fit_transform(corpus)
print("Найденные токены:")
print(vectorizer_bin.get_feature_names_out())
#print(V_bin.shape())
#print(V_bin.toarray())

Найденные токены:
['а0' 'а1' 'а10' 'а11' 'а2' 'а3' 'а4' 'а5' 'а6' 'а7' 'а8' 'а9' 'б0' 'б1'
 'б10' 'б11' 'б12' 'б2' 'б3' 'б4' 'б5' 'б6' 'б7' 'б8' 'б9' 'в0' 'в1' 'в10'
 'в11' 'в12' 'в13' 'в2' 'в3' 'в4' 'в5' 'в6' 'в7' 'в8' 'в9' 'г0' 'г1' 'г10'
 'г11' 'г12' 'г2' 'г3' 'г4' 'г5' 'г6' 'г7' 'г8' 'г9' 'д0' 'д1' 'д2' 'д3'
 'д4' 'д5' 'д6' 'д7' 'д8' 'д9' 'е0' 'е1' 'е10' 'е11' 'е12' 'е2' 'е3' 'е4'
 'е5' 'е6' 'е7' 'е8' 'е9' 'ж0' 'ж1' 'ж2' 'ж3' 'ж4' 'ж5' 'ж6' 'ж7' 'ж8'
 'з0' 'з1' 'з2' 'з3' 'з4' 'з5' 'з6' 'з7' 'и0' 'и1' 'и10' 'и11' 'и2' 'и3'
 'и4' 'и5' 'и6' 'и7' 'и8' 'и9' 'й0' 'й1' 'й10' 'й11' 'й12' 'й13' 'й14'
 'й15' 'й2' 'й3' 'й4' 'й5' 'й6' 'й7' 'й8' 'й9' 'к0' 'к1' 'к10' 'к11' 'к2'
 'к3' 'к4' 'к5' 'к6' 'к7' 'к8' 'к9' 'л0' 'л1' 'л10' 'л11' 'л12' 'л2' 'л3'
 'л4' 'л5' 'л6' 'л7' 'л8' 'л9' 'м0' 'м1' 'м10' 'м2' 'м3' 'м4' 'м5' 'м6'
 'м7' 'м8' 'м9' 'н0' 'н1' 'н10' 'н11' 'н12' 'н13' 'н2' 'н3' 'н4' 'н5' 'н6'
 'н7' 'н8' 'н9' 'о0' 'о1' 'о10' 'о11' 'о12' 'о13' 'о2' 'о3' 'о4' 'о5' 'о6'
 'о7' 'о8' 'о9' 'п0' 'п1' '

***hash* - векторизация:**

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer_hash = HashingVectorizer(n_features = 33, tokenizer = lambda x: _tokenize(x))
V_hash = vectorizer_hash.fit_transform(corpus)
print("Shape:")
print(V_hash.shape)



Shape:
(471144, 33)


In [11]:
# для наглядности создадим dataframe с данными на основе векторизации TF-IDF
df = pd.DataFrame(V.toarray(), columns = vectorizer.get_feature_names_out())

In [12]:
print("Data size:", len(corpus), len(labels))
#print(13573258/len(corpus))
print(corpus[:10])

Data size: 471144 471144
['то', 'ято', 'чтн', 'вто', 'чт', 'чтш', 'ч', 'чоо', 'чтт', 'о']


In [13]:
df['labels'] = labels
df

Unnamed: 0,а0,а1,а10,а11,а2,а3,а4,а5,а6,а7,...,ё11,ё2,ё3,ё4,ё5,ё6,ё7,ё8,ё9,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,что
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471139,0.0,0.0,0.0,0.0,0.0,0.0,0.333669,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
471140,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
471141,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко
471142,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,яблоко


In [36]:
print_memory_usage_of_data_frame(df)
get_csr_memory_usage(V)

Memory usage is 1500.123 MB
Memory usage is 37.705923999999996 MB


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics  import f1_score, accuracy_score, precision_score, recall_score, jaccard_score
from sklearn.metrics import mean_absolute_error

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## Перейдем к обучению модели

**Создадим pipeline для более простого тестирования в дальнейшем**

In [15]:
# варианты моделей:
# DecisionTreeClassifier()

# MultinomialNB()
# KNeighborsClassifier()
# LogisticRegression()

# требует доп настройки
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

## pipline, основанный на TfidfVectorizer и DecisionTreeClassifier()
from sklearn.pipeline import Pipeline
model_pipeline = Pipeline([
    ("vecorizer", TfidfVectorizer(tokenizer = lambda x: _tokenize(x))),
    ("model",     MultinomialNB())
]
)
X = corpus
y = labels

#LogisticRegression(solver = 'sag', random_state = 42)

**обучим модель**

In [16]:
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
end = time.time()
duration = round(end - start, 2)
print("Train-test split: " + str(duration) + " secs")
start = time.time()
model_pipeline.fit(X_train, y_train)
end = time.time()
duration = round(end - start, 2)
print("Training: " + str(duration) + " secs")

Train-test split: 0.36 secs
Training: 19.26 secs


**Финт ушами: тестируем модель на данных, в которых допустимо расстояние 2**

In [25]:
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size = 0.1, random_state = 42)

**Теперь протестируем обученную модель**

In [26]:
start = time.time()
pred = model_pipeline.predict(X_test)
end = time.time()
duration = round(end - start, 2)
print("Testing: " + str(duration) + " secs")
# print('    mean:', mean_absolute_error(y_test, pred))

Testing: 162.83 secs


In [27]:
print(' accuracy:', accuracy_score(y_test, pred))
print('precision:', precision_score(y_test, pred, average = 'macro'))
print('   recall:', recall_score(y_test, pred, average = 'macro'))
print('  jaccard:', jaccard_score(y_test, pred, average = 'macro'))
print('       f1:', f1_score(y_test, pred, average = 'macro'))

 accuracy: 0.8210813207050066
precision: 0.7819743002229926
   recall: 0.7245866024181618
  jaccard: 0.6122483351702683
       f1: 0.7329004465535275


## Проведем "живой" тест на модели

In [45]:
pins = input()
pins = re.findall(r'[а-ё]+', pins.lower())


for pin in pins:
    if len(pin) > 2:
        print(model_pipeline.predict([pin])[0], end = ' ')
    else:
        print(pin, end = ' ')

In [3]:
#ккуратно акомпoнировала озартно ортикулировала
print('хочет' in labels)

NameError: name 'labels' is not defined

## #попытка обучить большие объемы данных

## bin

In [303]:
def convert_to_sparse_pandas(df, exclude_columns = []):
    df = df.copy()
    exclude_columns = set(exclude_columns)

    for (columnName, columnData) in df.iteritems():
        if columnName in exclude_columns:
            continue
        df[columnName] = pd.arrays.SparseArray(columnData.values, dtype = np.float16)

    return df

df_sparse = convert_to_sparse_pandas(df, exclude_columns = ["label"])
# display(df_sparse.dtypes)
print_memory_usage_of_data_frame(df_sparse)

Memory usage is 182.116 MB


In [105]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred, average = 'macro'))

print(mean_absolute_error(y_test, pred))

0.6571622078283824
0.5426987035755111
53.354150671547565


In [304]:
y = df['label']
X = df[df.columns.difference(['label'])]

y_sparse = df_sparse['label']
X_sparse = df_sparse[df_sparse.columns.difference(['label'])]

y_csr = df['label']
X_csr = V

y_binary = df['label']
X_binary = V_bin

print('X:')
print_memory_usage_of_data_frame(X)
print('X_sparse:')
print_memory_usage_of_data_frame(X_sparse)
print('X_csr:')
get_csr_memory_usage(X_csr)
print('X_binary:')
get_csr_memory_usage(X_binary)

X:
Memory usage is 233.391 MB
X_sparse:
Memory usage is 175.044 MB
X_csr:
Memory usage is 68.581552 MB
X_binary:
Memory usage is 68.581552 MB


In [307]:
for i in range(len(models)):
    print(list_of_names[i])
    test(models[i], X_tests_arr[i], Y_tests_arr[i])

Scipy sparse matrix
Testing: 0.32 secs
accuracy: 0.011158744881569125
      f1: 0.014289106223557658
    mean: 188.53618532678777


Scipy sparse matrix binary
Testing: 0.49 secs
accuracy: 0.708187227111282
      f1: 0.5981014691555474
    mean: 45.30782978530869




In [306]:
def test(mod, Xtest, Ytest):
    start = time.time()
    pred = mod.predict(Xtest)
    end = time.time()
    duration = round(end - start, 2)
    print("Testing: " + str(duration) + " secs")
    print('accuracy:', accuracy_score(Ytest, pred))
    print('      f1:', f1_score(Ytest, pred, average = 'macro'))
    print('    mean:', mean_absolute_error(Ytest, pred))
    print("\n")

In [313]:
models = []
list_of_names = []
X_tests_arr = []
Y_tests_arr = []

model = DecisionTreeClassifier()
# model = BernoulliNB()

# model = GaussianNB()
# model = MultinomialNB()
# msodel = GaussianNB()
# model = MultinomialNB()
# model = KNeighborsClassifier()
# model = LogisticRegression()


#     'Pandas dataframe': [X, y],
vector_dict = {
     'Scipy sparse matrix': [X_csr, y_csr],
     'Scipy sparse matrix binary': [X_binary, y_binary]
    }

for key, item in vector_dict.items():
    print(key)
    list_of_names.append(key)
    start = time.time()
    if (key != 'Pandas dataframe'):
        XX = item[0].toarray()
    else:
        XX = X
        
    X_train, X_test, y_train, y_test = train_test_split(XX, y, test_size = 0.2, random_state = 42)
    end = time.time()
    duration = round(end - start, 2)
    print("Train-test split: " + str(duration) + " secs")

    start = time.time()
    model.fit(X_train, y_train)

    end = time.time()
    duration = round(end - start, 2)
    
    models.append(model)
    X_tests_arr.append(X_test)
    Y_tests_arr.append(y_test)
    print("Training: " + str(duration) + " secs")
    print("\n")

Scipy sparse matrix
Train-test split: 0.26 secs


KeyboardInterrupt: 

In [None]:
tokets_list = vectorizer_bin.get_feature_names_out()

def _tokenize2(pin):
    return [a + a for a in pin]

pins = input()
pins = re.findall(r'[а-я]+', pins.lower())

for pin in pins:
    if(len(pin) < 3):
        print(pin, end = ' ')
        continue
    pin_vec = [0]*33
    pin = _tokenize2(pin)
    for i in range(33):
        if tokets_list[i] in pin:
            pin_vec[i] = 1

    predict = models[0].predict([pin_vec])
    print(list_of_words[predict[0]], end = ' ')

In [84]:
from sklearn.pipeline import Pipeline


class OnlinePipeline(Pipeline):
    def partial_fit(self, X, y = None):
        for i, step in enumerate(self.steps):
            name, est = step
            est.partial_fit(X, y)
            if i < len(self.steps) - 1:
                X = est.transform(X)
        return self

model_pipeline = OnlinePipeline([
    ("vecorizer", TfidfVectorizer(tokenizer = lambda x: _tokenize(x))),
    ("model", DecisionTreeClassifier())
]
)

X = corpus
y = labels

In [94]:
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
end = time.time()
duration = round(end - start, 2)
print("Train-test split: " + str(duration) + " secs")

a = int(len(X_train)/3)
sum_t = 0


div = 100
part = int(len(X_train)/div)

'''for i in range(1, div + 1):
    start = time.time()
    print((i*(part) - part), ":", i*(part))
    model_pipeline.fit(X_train[(i*(part) - part): i*(part)], y_train[(i*(part) - part): i*(part)])
    end = time.time()
    duration = round(end - start, 2)
    sum_t += duration
    if(i % 10 == 1):
        print("Training round " + str(i) + ": " + str(sum_t) + " secs")'''
    

#print("        Training: " + str(sum_t) + " secs")
t = int(len(X_train)/2)
model_pipeline.fit(X_train[:t], y_train[:t])
model_pipeline.fit(X_train[t:], y_train[t:])

Train-test split: 0.1 secs


OnlinePipeline(steps=[('vecorizer',
                       TfidfVectorizer(tokenizer=<function <lambda> at 0x000001C00208BF40>)),
                      ('model', DecisionTreeClassifier())])