In [8]:
! pip install pymystem3



In [176]:
import os  # для работы с папками и файлами
import collections
import math
import numpy as np
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('russian')
from pymystem3 import Mystem
m = Mystem()

# Компьютерная семантика. Начало

В итоге мы хотим уметь понимать значение слов и значение отрывков текста.

Постепенно до этого дойдём.

Сегодня обсуждаем самые базовые способы представить тексты в числах.

## One-hot encoding

Нам нужно представить слова и тексты в виде чисел, чтобы компьютер мог с ними работать. Нам хочется, чтобы эти представления текстов содержали в себе семантику, хотим понимать значения.

Простой способ: одно слово - одно число. Но не совсем. Хочется так, чтоб при сложении слов из текста, получалось значение текста, а не рандомное число.

*мама - 1, мыла - 2, раму - 3*

Если записать эти числа так, чтоб везде были нули, кроме слота с порядковым номером слова, получится

*мама - 001, мыла - 010, раму - 100*

Тогда предложение "мама мыла" будет 011.

А "мама мыла раму" будет 111.

In [171]:
from sklearn.feature_extraction.text import CountVectorizer

In [166]:
corpus = [
	'мама мыла раму',
	'Мила мыла куклу и раму',
	'Римма мыла раму и мама мыла раму'
]

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # this is where the texts get vectorized

In [170]:
X, X.toarray()  # смотрим матрицу

(<3x6 sparse matrix of type '<class 'numpy.int64'>'
 	with 11 stored elements in Compressed Sparse Row format>,
 array([[0, 1, 0, 1, 1, 0],
        [1, 0, 1, 1, 1, 0],
        [0, 1, 0, 2, 2, 1]]))

Sparse matrix - разреженная матрица (матрица, в которой много нулей).

In [173]:
vectorizer.vocabulary_

{'мама': 1, 'мыла': 3, 'раму': 4, 'мила': 2, 'куклу': 0, 'римма': 5}

In [174]:
# чтобы узнать индекс токена в словаре
vectorizer.vocabulary_.get('куклу')

0

In [178]:
# Total number of occurrences of every vocabulary word across the whole corpus:
# row 0 holds the words, row 1 the matching column sums of the count matrix.
feature_names = np.array(vectorizer.get_feature_names_out())
column_totals = X.sum(axis=0)
matrix_freq = np.asarray(column_totals).ravel()
final_matrix = np.array([feature_names, matrix_freq])
final_matrix

array([['куклу', 'мама', 'мила', 'мыла', 'раму', 'римма'],
       [1, 2, 1, 4, 4, 1]], dtype=object)

In [168]:
vectorizer.get_feature_names_out()  # так называются "фичи" наших текстов

array(['куклу', 'мама', 'мила', 'мыла', 'раму', 'римма'], dtype=object)

In [169]:
# pandas is first imported further down the notebook; import it here as well so
# that this cell also works on a fresh kernel executed top to bottom.
import pandas as pd

# Count matrix as a table: one row per text, one column per vocabulary word.
df = pd.DataFrame(X.toarray(), columns=list(vectorizer.get_feature_names_out()))
df

Unnamed: 0,куклу,мама,мила,мыла,раму,римма
0,0,1,0,1,1,0
1,1,0,1,1,1,0
2,0,1,0,2,2,1


Заметьте!
- Мы здесь верим в композициональность!
- Не учитываем порядок слов.
- Векторы слов рандомные, семантики не содержат.

# Как сравнить несколько текстов

Как понять, какой текст о чём?  Как выделить ключевые слова?

Почему просто взять частотные - не самый лучший вариант?

А если убрать стопслова?

In [2]:
file_list = os.listdir()  # список файлов и папок в директории, где запущена программа
file_list

['.config', '.ipynb_checkpoints', 'oster', 'sample_data']

In [3]:
file_list = os.listdir('./oster')
file_list

['znakomstvo.txt',
 '.ipynb_checkpoints',
 'babushka.txt',
 '38popugajev.txt',
 'lechit.txt',
 'son.txt',
 'letet.txt',
 'idti.txt',
 'perehod.txt',
 'zakon.txt',
 'zaryadka.txt']

In [103]:
# Read every story from ./oster and lemmatize it.
#   name_text: {story name: raw text}
#   text_lex:  {story name: list of lemmas}
name_text = {}
text_lex = {}
for filename in file_list:
    path = os.path.join('./oster', filename)
    title, extension = os.path.splitext(filename)
    # Skip hidden entries, directories and anything that is not a .txt file
    # instead of assuming each entry is a text file with a 4-character suffix.
    if filename.startswith('.') or extension.lower() != '.txt' or not os.path.isfile(path):
        continue
    with open(path, encoding='UTF-8') as f:
        text = f.read()
    name_text[title] = text
    lemmas = m.lemmatize(text)
    # Keep purely alphabetic tokens only (drops whitespace/punctuation tokens, if any).
    words = [l for l in lemmas if l.isalpha()]
    text_lex[title] = words

In [22]:
list(text_lex.keys())

['znakomstvo',
 'babushka',
 '38popugajev',
 'lechit',
 'son',
 'letet',
 'idti',
 'perehod',
 'zakon',
 'zaryadka']

In [23]:
text_lex['38popugajev'][:20]

['удав',
 'склоняться',
 'над',
 'трава',
 'и',
 'рассматривать',
 'мартышка',
 'очень',
 'осторожно',
 'на',
 'цыпочки',
 'подходить',
 'к',
 'удав',
 'и',
 'тоже',
 'посмотреть',
 'в',
 'трава',
 'ползти']

- сравнить самые частотные слова

In [24]:
# 15 most frequent lemmas of every story (stop words are still included here)
for story, story_lemmas in text_lex.items():
    print(story)
    print(collections.Counter(story_lemmas).most_common(15))

znakomstvo
[('и', 22), ('попугай', 14), ('с', 14), ('слоненок', 13), ('удав', 12), ('мартышка', 12), ('сказать', 11), ('мы', 10), ('ты', 10), ('не', 10), ('а', 9), ('друг', 9), ('я', 7), ('познакомиться', 7), ('как', 6)]
babushka
[('и', 61), ('бабушка', 52), ('мартышка', 47), ('а', 40), ('слоненок', 37), ('удав', 34), ('сказать', 34), ('попугай', 30), ('на', 28), ('я', 27), ('она', 26), ('ты', 26), ('не', 26), ('что', 25), ('в', 24)]
38popugajev
[('удав', 84), ('и', 58), ('мартышка', 58), ('он', 49), ('попугай', 48), ('сказать', 40), ('а', 35), ('не', 35), ('я', 30), ('хвост', 29), ('слоненок', 27), ('в', 26), ('что', 26), ('ты', 24), ('голова', 23)]
lechit
[('не', 71), ('и', 69), ('удав', 63), ('мартышка', 60), ('сказать', 52), ('ты', 50), ('я', 44), ('а', 39), ('слоненок', 38), ('попугай', 38), ('что', 36), ('он', 31), ('это', 31), ('в', 23), ('она', 21)]
son
[('удав', 59), ('и', 54), ('я', 51), ('мартышка', 50), ('попугай', 37), ('не', 36), ('ты', 36), ('сниться', 35), ('слоненок', 

Ну вот, стопслова не убрали. Убираем.

In [27]:
# Same {story name: list of lemmas} mapping, with stop words filtered out.
# The stop words are put into a set once, so every membership test is a hash
# lookup rather than a scan of the whole stop-word list.
stopword_set = set(stopwords)
text_lex_nostopw = {
    filename: [word for word in lemmas if word not in stopword_set]
    for filename, lemmas in text_lex.items()
}

In [28]:
# 15 most frequent lemmas of every story once the stop words are removed
for story, story_lemmas in text_lex_nostopw.items():
    print(story)
    print(collections.Counter(story_lemmas).most_common(15))

znakomstvo
[('попугай', 14), ('слоненок', 13), ('удав', 12), ('мартышка', 12), ('сказать', 11), ('друг', 9), ('познакомиться', 7), ('знакомый', 6), ('встречаться', 6), ('пожимать', 4), ('жаль', 3), ('удовольствие', 3), ('знакомиться', 3), ('это', 3), ('случайно', 3)]
babushka
[('бабушка', 52), ('мартышка', 47), ('слоненок', 37), ('удав', 34), ('сказать', 34), ('попугай', 30), ('банан', 19), ('это', 14), ('очень', 13), ('ждать', 12), ('свой', 12), ('приезжать', 9), ('здравствовать', 8), ('сторона', 8), ('гулять', 8)]
38popugajev
[('удав', 84), ('мартышка', 58), ('попугай', 48), ('сказать', 40), ('хвост', 29), ('слоненок', 27), ('голова', 23), ('спрашивать', 22), ('рост', 22), ('это', 19), ('очень', 17), ('измерять', 15), ('ползти', 14), ('свой', 13), ('знать', 12)]
lechit
[('удав', 63), ('мартышка', 60), ('сказать', 52), ('слоненок', 38), ('попугай', 38), ('это', 31), ('орех', 21), ('думать', 20), ('куча', 20), ('лежать', 19), ('спрашивать', 17), ('очень', 11), ('знать', 11), ('ходить',

- а ещё можно биграммы и триграммы посмотреть (больше совпадений - тексты ближе)

In [29]:
# Pairs of adjacent lemmas (bigrams) of the '38popugajev' story, stop words removed
bigr_l = list(nltk.bigrams(text_lex_nostopw['38popugajev']))
bigr_counter = collections.Counter(bigr_l)
bigr_counter.most_common(10)  # the ten most frequent bigrams with their counts

[(('сказать', 'попугай'), 11),
 (('сказать', 'мартышка'), 10),
 (('сказать', 'удав'), 8),
 (('свой', 'рост'), 8),
 (('спрашивать', 'мартышка'), 7),
 (('слоненок', 'мартышка'), 6),
 (('измерять', 'свой'), 5),
 (('твой', 'рост'), 5),
 (('удав', 'сказать'), 5),
 (('спрашивать', 'слоненок'), 5)]

In [30]:
# Triples of adjacent lemmas (trigrams) of the '38popugajev' story, stop words removed
thgr_l = list(nltk.ngrams(text_lex_nostopw['38popugajev'], 3))
thgr_counter = collections.Counter(thgr_l)
thgr_counter.most_common(10)  # the ten most frequent trigrams with their counts

[(('измерять', 'свой', 'рост'), 5),
 (('измерять', 'твой', 'рост'), 3),
 (('удав', 'сказать', 'попугай'), 3),
 (('ползти', 'ползти', 'спрашивать'), 2),
 (('ползти', 'спрашивать', 'мартышка'), 2),
 (('удав', 'ползти', 'ползти'), 2),
 (('сказать', 'мартышка', 'это'), 2),
 (('удав', 'принимать', 'решение'), 2),
 (('свой', 'рост', 'просто'), 2),
 (('сказать', 'мартышка', 'складываться'), 2)]

Во всех текстах одни и те же главные герои. Везде частотны слова 'попугай', 'слоненок', 'удав', 'мартышка'. Получается, тексты похожи.

Но нам важно понимать, чем они отличаются.

Точнее будет смотреть на слова, которые в отдельных текстах частотные, а при этом в коллекции документов редкие.

**Есть метод!**

# TF-IDF

- TF — term frequency - частота слова внутри документа
- IDF — inverse document frequency - log(кол-во документов в корпусе/кол-во документов с этим словом)

**TF-IDF = TF * IDF**

Чем больше TF-IDF, тем важнее слово.

\#(логарифм натуральный, по основанию e≈2,72, но можно и другой)

IDF придумала Karen Spärck Jones (в 1972 году).

IDF можно считать немного по-другому. Например, добавляют единицу, чтоб избежать возможного деления на ноль: log((кол-во документов в корпусе **+1**) / (кол-во документов с этим словом **+1**))

In [31]:
import pandas as pd

Давайте сначала на 3х коротких текстах

In [32]:
documentA = 'мама мыла раму'
documentB = 'Мила мыла куклу и раму'
documentC = 'Римма мыла раму и мама мыла раму'

In [33]:
# Tokenize on any run of whitespace. Unlike split(' '), split() never produces
# empty-string tokens for repeated, leading or trailing spaces.
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()
bagOfWordsC = documentC.split()

In [34]:
uniqueWords = set(bagOfWordsA + bagOfWordsB + bagOfWordsC)

In [35]:
# Вот все слова, которые встретились в текстах
uniqueWords

{'Мила', 'Римма', 'и', 'куклу', 'мама', 'мыла', 'раму'}

Делаем словарь {слово: кол-во употреблений} для каждого текста.


In [36]:
def countWords(bagOfWords, vocabulary):
    """Count how often every vocabulary word occurs in one document.

    Args:
        bagOfWords: list of the document's tokens.
        vocabulary: iterable of all words of the collection.

    Returns:
        dict {word: number of occurrences}, with an explicit 0 for vocabulary
        words that are absent from the document.
    """
    counts = collections.Counter(bagOfWords)
    return {word: counts[word] for word in vocabulary}


# One {word: count} dict per text, all sharing the same set of keys
numOfWordsA = countWords(bagOfWordsA, uniqueWords)
numOfWordsB = countWords(bagOfWordsB, uniqueWords)
numOfWordsC = countWords(bagOfWordsC, uniqueWords)

In [37]:
# В первом тексте столько раз встретилось каждое из слов
numOfWordsA

{'Римма': 0, 'куклу': 0, 'мыла': 1, 'раму': 1, 'мама': 1, 'и': 0, 'Мила': 0}

In [64]:
pd.DataFrame([numOfWordsA, numOfWordsB, numOfWordsC])

Unnamed: 0,Римма,куклу,мыла,раму,мама,и,Мила
0,0,0,1,1,1,0,0
1,0,1,1,1,0,1,1
2,1,0,2,2,1,1,0


Считаем **TF** (частота слова внутри документа)

In [38]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency (TF) of every word of one document.

    Args:
        wordDict: mapping {word: number of occurrences in the document}.
        bagOfWords: the document's tokens; only its length is used, as the
            total the counts are divided by.

    Returns:
        dict {word: count / len(bagOfWords)}. For an empty document every
        frequency is 0.0 (instead of raising ZeroDivisionError).
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # No tokens at all: nothing can be frequent in this document.
        return {word: 0.0 for word in wordDict}
    return {word: count / totalWords for word, count in wordDict.items()}

In [39]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
tfC = computeTF(numOfWordsC, bagOfWordsC)

In [41]:
pd.DataFrame([tfA, tfB, tfC])

Unnamed: 0,Римма,куклу,мыла,раму,мама,и,Мила
0,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.0
1,0.0,0.2,0.2,0.2,0.0,0.2,0.2
2,0.142857,0.0,0.285714,0.285714,0.142857,0.142857,0.0


Считаем **IDF** (log(кол-во документов/кол-во документов с этим словом))

один раз для всех слов

In [151]:
def computeIDF(documents):
    """Compute the inverse document frequency (IDF) of every word.

    IDF(word) = ln(N / df(word)), where N is the number of documents and
    df(word) is the number of documents in which the word's count is > 0.

    Args:
        documents: list of {word: count} dicts, one per document.

    Returns:
        dict {word: idf} covering the words of all documents (keys of the
        first document come first, in their original order). A word that
        occurs in no document gets 0.0 instead of raising ZeroDivisionError;
        an empty `documents` list yields an empty dict.
    """
    N = len(documents)

    # Document frequency: in how many documents each word occurs at least once.
    docFreq = {}
    for document in documents:
        for word, count in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    return {
        word: math.log(N / freq) if freq > 0 else 0.0
        for word, freq in docFreq.items()
    }

In [91]:
idfs = computeIDF([numOfWordsA, numOfWordsB, numOfWordsC])

In [92]:
# Есть слова, которые встретились во многих текстах (значение меньше),
# есть "специфичные", которые встретились в малом кол-ве текстов (значение больше)
idfs

{'Римма': 1.0986122886681098,
 'куклу': 1.0986122886681098,
 'мыла': 0.0,
 'раму': 0.0,
 'мама': 0.4054651081081644,
 'и': 0.4054651081081644,
 'Мила': 1.0986122886681098}

In [152]:
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply every term frequency of a document by the word's IDF.

    Args:
        tfBagOfWords: mapping {word: term frequency} for one document.
        idfs: mapping {word: inverse document frequency}; must contain
            every word of `tfBagOfWords`.

    Returns:
        dict {word: tf * idf} with the same keys as `tfBagOfWords`.
    """
    return {word: tf * idfs[word] for word, tf in tfBagOfWords.items()}

In [94]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
tfidfC = computeTFIDF(tfC, idfs)

In [95]:
print('\n', documentA, '\n', documentB, '\n', documentC)
pd.DataFrame([tfidfA, tfidfB, tfidfC])


 мама мыла раму 
 Мила мыла куклу и раму 
 Римма мыла раму и мама мыла раму


Unnamed: 0,Римма,куклу,мыла,раму,мама,и,Мила
0,0.0,0.0,0.0,0.0,0.135155,0.0,0.0
1,0.0,0.219722,0.0,0.0,0.0,0.081093,0.219722
2,0.156945,0.0,0.0,0.0,0.057924,0.057924,0.0


Чем больше значение, тем важнее слово для текста.

"Рама" встречается во всех трёх - у неё значение везде ноль.

"Кукла" и "Мила" встречаются только во втором - они очень важны для второго текста (у них самые большие значения)

# TfidfVectorizer

Готовый инструмент из библиотеки [sklearn](https://scikit-learn.org/stable/)

In [54]:
corpus = [documentA, documentB, documentC]

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
vect_tfidf = TfidfVectorizer()
mama_tfidf = vect_tfidf.fit_transform(corpus)

In [63]:
mama_tfidf.toarray()

array([[0.        , 0.67325467, 0.        , 0.52284231, 0.52284231,
        0.        ],
       [0.6088451 , 0.        , 0.6088451 , 0.35959372, 0.35959372,
        0.        ],
       [0.        , 0.36384968, 0.        , 0.56512346, 0.56512346,
        0.4784186 ]])

In [57]:
vect_tfidf.get_feature_names_out()

array(['куклу', 'мама', 'мила', 'мыла', 'раму', 'римма'], dtype=object)

In [61]:
print('\n', documentA, '\n', documentB, '\n', documentC)

df = pd.DataFrame(mama_tfidf.toarray())
df.columns =list(vect_tfidf.get_feature_names_out())
df


 мама мыла раму 
 Мила мыла куклу и раму 
 Римма мыла раму и мама мыла раму


Unnamed: 0,куклу,мама,мила,мыла,раму,римма
0,0.0,0.673255,0.0,0.522842,0.522842,0.0
1,0.608845,0.0,0.608845,0.359594,0.359594,0.0
2,0.0,0.36385,0.0,0.565123,0.565123,0.478419


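Таблица получилась немного другой, потому что есть несколько способов посчитать idf. Кроме того, TfidfVectorizer по умолчанию приводит слова к нижнему регистру, отбрасывает однобуквенные токены (поэтому пропало «и»), берёт в качестве tf просто число вхождений и нормирует вектор каждого документа (L2). Но суть осталась прежней.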

In [118]:
# An attempt to bring our function closer to TfidfVectorizer
def computeIDF_2(documents):
    """Compute a smoothed IDF: ln((N + 1) / (df + 1)) + 1.

    N is the number of documents and df is the number of documents in which
    the word's count is > 0.

    Args:
        documents: list of {word: count} dicts, one per document.

    Returns:
        dict {word: smoothed idf} covering the words of all documents (keys
        of the first document come first, in their original order). An empty
        `documents` list yields an empty dict.
    """
    N = len(documents)

    # Document frequency: in how many documents each word occurs at least once.
    docFreq = {}
    for document in documents:
        for word, count in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    # The ones added to N and to df keep the denominator positive even when
    # df == 0; the trailing + 1 keeps words present in every document from
    # getting a zero weight.
    return {
        word: math.log((N + 1) / (freq + 1)) + 1
        for word, freq in docFreq.items()
    }

# Recompute the IDF values with the smoothed formula. Note that this rebinds
# `idfs` and tfidfA/tfidfB/tfidfC, replacing the values computed in earlier cells.
idfs = computeIDF_2([numOfWordsA, numOfWordsB, numOfWordsC])

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
tfidfC = computeTFIDF(tfC, idfs)

# Print the three source texts above the table for reference
print('\n', documentA, '\n', documentB, '\n', documentC)
pd.DataFrame([tfidfA, tfidfB, tfidfC])


 мама мыла раму 
 Мила мыла куклу и раму 
 Римма мыла раму и мама мыла раму


Unnamed: 0,Римма,куклу,мыла,раму,мама,и,Мила
0,0.0,0.0,0.333333,0.333333,0.429227,0.0,0.0
1,0.0,0.338629,0.2,0.2,0.0,0.257536,0.338629
2,0.241878,0.0,0.285714,0.285714,0.183955,0.183955,0.0


#### Возможности sklearn векторизаторов

Теперь любой текст с этими словами можно векторизировать.

In [58]:
vect_tfidf.transform(["Мила мыла раму и мама"]).toarray()

array([[0.        , 0.50410689, 0.66283998, 0.39148397, 0.39148397,
        0.        ]])

Векторайзеры в sklearn имеют три основных метода fit, transform и fit_transform.

`fit` - собирает словарь и статистики по текстам

`transform` - преобразует тексты в векторы, на основе уже собранного словаря

`fit_transform` - делает сразу и первое и второе (быстрее, чем 1 и 2 по очереди)


## А теперь для наших текстов то же самое


In [179]:
# Lemma lists of the ten stories in a fixed order: bagsOfWords[i] belongs to
# oster_titles[i] (later cells look stories up by these positions).
oster_titles = ['38popugajev', 'babushka', 'letet', 'zaryadka', 'zakon',
                'znakomstvo', 'idti', 'lechit', 'perehod', 'son']
bagsOfWords = [text_lex[title] for title in oster_titles]

# Individual names kept for any code that refers to a single bag directly
(bagOfWords_0, bagOfWords_1, bagOfWords_2, bagOfWords_3, bagOfWords_4,
 bagOfWords_5, bagOfWords_6, bagOfWords_7, bagOfWords_8, bagOfWords_9) = bagsOfWords

In [180]:
# Vocabulary of the whole collection. set().union(*...) adds every bag directly
# and avoids the repeated list copying done by sum(bagsOfWords, []).
oster_uniqueW = set().union(*bagsOfWords)

In [181]:
# One {word: count} dict per story; every dict has the full collection
# vocabulary as keys, with 0 for words the story does not contain.
numsOfWords = []
for bag in bagsOfWords:
    bagCounts = collections.Counter(bag)
    numsOfWords.append({word: bagCounts[word] for word in oster_uniqueW})

In [182]:
idfs = computeIDF(numsOfWords)

In [183]:
# tf-idf dict of every story: TF comes from the story's own counts,
# IDF is shared by the whole collection.
tfidfs = [
    computeTFIDF(computeTF(wordCounts, bag), idfs)
    for wordCounts, bag in zip(numsOfWords, bagsOfWords)
]

# One row per story, one column per vocabulary word
df = pd.DataFrame(tfidfs)

In [184]:
df

Unnamed: 0,может,держаться,навешивать,протирать,присниться,всплескивать,побежать,небо,необитаемый,укрепляться,...,терпеть,удовольствие,увидеть,мчаться,аааа,класть,хозяин,гулять,вертеться,бушевать
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001369,0.000716,0.000304,0.0,0.000545,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000808,0.0,0.0,0.0,...,0.0,0.0,0.000343,0.0,0.0,0.0,0.0,0.008641,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002377,0.0,0.0,...,0.0,0.0,0.0,0.002273,0.000905,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000807,0.0,0.001543,...,0.0,0.0,0.000685,0.0,0.000614,0.0,0.0,0.0,0.0,0.0
4,0.003406,0.0,0.0,0.0,0.0,0.0,0.001703,0.003406,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.009406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.002174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000965,0.0,0.0,0.0,0.0,0.015198,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.000866,0.000648,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000493,0.001239,0.0,0.0,0.002477,0.0
8,0.000807,0.001079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001712,0.0,0.0,0.0,0.003087,0.0,0.0,0.0
9,0.001647,0.001101,0.0,0.001575,0.004725,0.001101,0.0,0.0,0.004725,0.0,...,0.0,0.000824,0.000349,0.0,0.0,0.0,0.0,0.0,0.0,0.004725


In [185]:
# Посмотрим на слова с наибольшим tf-idf в "38 попугаев"
collections.Counter(tfidfs[0]).most_common(15)

[('измерять', 0.02053434981861515),
 ('рост', 0.015747563433514026),
 ('ползти', 0.013396034942971109),
 ('принимать', 0.008611736749052855),
 ('половина', 0.008213739927446062),
 ('складываться', 0.006844783272871718),
 ('мерять', 0.006844783272871718),
 ('дергать', 0.005741157832701903),
 ('шагнуть', 0.005475826618297374),
 ('измеряться', 0.005475826618297374),
 ('порываться', 0.005475826618297374),
 ('э', 0.005475826618297374),
 ('глотать', 0.005475826618297374),
 ('целый', 0.004784298193918253),
 ('длинный', 0.004358100983943663)]

In [186]:
# Бабушка
collections.Counter(tfidfs[1]).most_common(15)

[('бабушка', 0.05616830298427732),
 ('банан', 0.015352673343753547),
 ('ждать', 0.012961916073294768),
 ('здравствовать', 0.012362872982518366),
 ('присмотр', 0.009272154736888777),
 ('гулять', 0.008641277382196512),
 ('взрослый', 0.007726795614073979),
 ('внук', 0.007726795614073979),
 ('спелый', 0.007726795614073979),
 ('дорогой', 0.007726795614073979),
 ('воспитание', 0.007726795614073979),
 ('приезжать', 0.007272318952304312),
 ('товарищ', 0.006181436491259183),
 ('обращаться', 0.0046360773684443884),
 ('хвостливый', 0.0046360773684443884)]

In [187]:
# Зарядка
collections.Counter(tfidfs[3]).most_common(15)

[('зарядка', 0.047841915471055915),
 ('упражнение', 0.016976163554245645),
 ('нога', 0.015353396981805548),
 ('падать', 0.012282717585444438),
 ('для', 0.010490379662357352),
 ('вместе', 0.00975609302396706),
 ('мускул', 0.00925972557504308),
 ('спина', 0.00862969390045094),
 ('забывать', 0.007716437979202567),
 ('начинаться', 0.006472270425338205),
 ('хватать', 0.0061731503833620536),
 ('нравиться', 0.0061731503833620536),
 ('лежа', 0.0061731503833620536),
 ('ставить', 0.0056486659720385735),
 ('утро', 0.005393558687781837)]

In [188]:
# Закон тяготения
collections.Counter(tfidfs[4]).most_common(15)

[('закон', 0.03414649036281684),
 ('орех', 0.0194403974230726),
 ('бросать', 0.01628419443418703),
 ('закрываться', 0.01628419443418703),
 ('стукнуть', 0.013027355547349623),
 ('жалко', 0.009105730763417823),
 ('умный', 0.007776158969229039),
 ('природа', 0.006811727322919016),
 ('наоборот', 0.006811727322919016),
 ('трахнуть', 0.0065136777736748116),
 ('действовать', 0.0065136777736748116),
 ('делаться', 0.0065136777736748116),
 ('макушка', 0.0065136777736748116),
 ('затылок', 0.0065136777736748116),
 ('банановый', 0.0065136777736748116)]

То же самое с помощью TfidfVectorizer

In [189]:
oster_corpus = list(name_text.values())  # list of raw story texts (one string per story), not lemma lists

In [105]:
vect_tfidf = TfidfVectorizer()
oster_tfidf = vect_tfidf.fit_transform(oster_corpus)

In [106]:
df = pd.DataFrame(oster_tfidf.toarray())
df.columns =list(vect_tfidf.get_feature_names_out())
df

Unnamed: 0,ааа,аааа,ага,ай,аккуратно,ананасы,африке,африку,ах,ахнул,...,этом,этому,этот,эту,эх,яблоко,ясно,ящиков,ящичек,ёй
0,0.0,0.0,0.0,0.0,0.0,0.0,0.031585,0.0,0.045216,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.010338,0.0,0.0,0.01741,...,0.0,0.0,0.015457,0.018692,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.010851,0.009745,0.0,0.0,0.0,0.0,0.01641,0.0,0.0,...,0.010851,0.0,0.021854,0.0,0.0,0.0,0.036615,0.0,0.0,0.01641
3,0.013933,0.009213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009213,0.0,0.01237,0.007479,0.0,0.0,0.0,0.013933,0.013933,0.0
4,0.0,0.0,0.010782,0.015436,0.0,0.018158,0.010782,0.0,0.0,0.0,...,0.024013,0.0,0.00806,0.0,0.018158,0.0,0.0,0.0,0.0,0.0
5,0.0,0.016161,0.0,0.0,0.0,0.0,0.0,0.0,0.041554,0.0,...,0.0,0.0,0.0,0.013121,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.011969,0.0,0.0,0.0,0.011969,0.0,0.0,0.0,...,0.0,0.020155,0.008947,0.0,0.0,0.0,0.01499,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.007933,0.02878,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.037433,0.053588,0.031519,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055966,0.01692,0.0,0.031519,0.0,0.0,0.0,0.0
9,0.0,0.01143,0.020529,0.0,0.0,0.0,0.010265,0.0,0.0,0.0,...,0.01143,0.0,0.02302,0.018559,0.0,0.0,0.025712,0.0,0.0,0.0


In [128]:
uniq_words = list(vect_tfidf.get_feature_names_out())

In [190]:
# Words with the highest tf-idf in the '38popugajev' story
story_vector = vect_tfidf.transform([name_text['38popugajev']]).toarray()[0]
word_tfidf = dict(zip(uniq_words, story_vector))
collections.Counter(word_tfidf).most_common(15)

[('удав', 0.3701033412448211),
 ('мартышка', 0.30943066235222744),
 ('рост', 0.244098432348634),
 ('не', 0.21235437612407768),
 ('попугай', 0.18201803667778085),
 ('он', 0.17942958763305333),
 ('сказал', 0.16381623301000278),
 ('что', 0.1577489651207434),
 ('голова', 0.14106364888002004),
 ('хвост', 0.13955634593681926),
 ('слонёнок', 0.13347989356370596),
 ('его', 0.1329108056541136),
 ('измерить', 0.11487301874001601),
 ('порвался', 0.11487301874001601),
 ('хи', 0.11487301874001601)]

In [134]:
# Words with the highest tf-idf in the 'babushka' story
story_vector = vect_tfidf.transform([name_text['babushka']]).toarray()[0]
word_tfidf = dict(zip(uniq_words, story_vector))
collections.Counter(word_tfidf).most_common(15)

[('бабушка', 0.5179896983102189),
 ('банан', 0.2959623026029398),
 ('мартышка', 0.21884648219347042),
 ('на', 0.18022651474756388),
 ('что', 0.17378985350657944),
 ('не', 0.16735319226559503),
 ('она', 0.1442694804950735),
 ('попугай', 0.1416065473016573),
 ('слонёнок', 0.1416065473016573),
 ('бабушке', 0.12186683048356345),
 ('сказала', 0.11585990233771963),
 ('здравствуйте', 0.1044572832716258),
 ('играть', 0.10359793966204377),
 ('удав', 0.10298657985575078),
 ('ты', 0.0987019681368305)]

In [164]:
# Words with the highest tf-idf in the 'zaryadka' story
story_vector = vect_tfidf.transform([name_text['zaryadka']]).toarray()[0]
word_tfidf = dict(zip(uniq_words, story_vector))
collections.Counter(word_tfidf).most_common(15)

[('зарядку', 0.31114529957944087),
 ('мартышка', 0.28759175435995943),
 ('ноги', 0.27919648258543256),
 ('вместе', 0.2155586596282608),
 ('на', 0.19172783623997297),
 ('попугай', 0.1853369083653072),
 ('удав', 0.1853369083653072),
 ('для', 0.16712795240893782),
 ('не', 0.1661641247413099),
 ('слонёнок', 0.1661641247413099),
 ('зарядка', 0.15557264978972044),
 ('делать', 0.1437057730855072),
 ('он', 0.11200077375226612),
 ('каждое', 0.10371509985981361),
 ('мускулы', 0.10371509985981361)]