In [81]:
!uv pip install navec pymorphy2 pandas

[2mAudited [1m3 packages[0m [2min 12ms[0m[0m


In [82]:
from navec import Navec
import pandas as pd
import pymorphy2

# существительные, нам понадобиться только колонка с начальной формой – bare
# https://github.com/Badestrand/russian-dictionary
nouns = pd.read_csv('nouns.csv', sep='\t')[['bare']]
# переименуем колонку для наглядности
nouns = nouns.rename(columns={"bare": "word"})

# эмбеддинги
# https://github.com/natasha/navec
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')

# морфологический анализ
morph = pymorphy2.MorphAnalyzer()

In [83]:
def is_proper_noun(word):
    parsed = morph.parse(word)[0]
    # есть в датасете эмбеддингов
    if (word not in navec):
        return False
    # фильтруем географические названия
    if ('Geox' in parsed.tag):
        return False
    # фильтруем имена
    if ('Name' in parsed.tag):
        return False
    # фильтруем фамилии
    if ('Surn' in parsed.tag):
        return False
    # убеждаемся, что точно существительное
    return 'NOUN' == parsed.tag.POS

nouns = nouns[nouns['word'].apply(is_proper_noun)]
nouns

Unnamed: 0,word
0,человек
1,год
2,время
3,рука
5,дело
...,...
26975,обеспокоенность
26976,автозак
26978,туроператор
26979,валенок


In [84]:
def to_vec(word):
    return navec[word]

nouns['embeddings'] = nouns['word'].apply(to_vec)
nouns

Unnamed: 0,word,embeddings
0,человек,"[-0.020386357, -0.4304574, -0.31297466, 0.0747..."
1,год,"[0.070464715, -0.86796796, -0.3054875, 0.00319..."
2,время,"[-0.020386357, -0.4304574, -0.31297466, -0.035..."
3,рука,"[-0.2938254, -0.2968786, -0.025342526, -0.0594..."
5,дело,"[-0.28339294, 0.14809653, -0.08746451, 0.42004..."
...,...,...
26975,обеспокоенность,"[0.6472811, 0.3915942, 0.08539089, 0.87062055,..."
26976,автозак,"[0.04365146, 0.4382294, -0.04026265, -0.307448..."
26978,туроператор,"[0.33099094, 0.19368611, 0.82854134, -0.494124..."
26979,валенок,"[-0.44697946, 0.33564645, 0.11986674, 0.037741..."


In [85]:
# если хочется выбрать конкретное
# secret_word = nouns[nouns['word'] == 'завтрак']
secret_word = nouns.sample(n=1)
secret_word

Unnamed: 0,word,embeddings
12713,туника,"[0.3229023, 0.41563722, 0.6724296, 0.25873458,..."


In [86]:
import math

def cosine_similarity(vec1, vec2):
    dot_product = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
    magnitude_vec1 = math.sqrt(sum(v1**2 for v1 in vec1))
    magnitude_vec2 = math.sqrt(sum(v2**2 for v2 in vec2))

    if magnitude_vec1 == 0 or magnitude_vec2 == 0:
        return 0

    return dot_product / (magnitude_vec1 * magnitude_vec2)

secret_word_vec = secret_word['embeddings'].values[0]
nouns['cosine_similarity'] = nouns['embeddings'].apply(lambda emb: cosine_similarity(secret_word_vec, emb))
nouns = nouns.sort_values(by='cosine_similarity', ascending=False).reset_index(drop=True)
nouns

Unnamed: 0,word,embeddings,cosine_similarity
0,туника,"[0.3229023, 0.41563722, 0.6724296, 0.25873458,...",1.000000
1,туника,"[0.3229023, 0.41563722, 0.6724296, 0.25873458,...",1.000000
2,накидка,"[0.3229023, 0.41563722, 0.6724296, -0.00452661...",0.739892
3,рубаха,"[0.13677494, 0.1374333, 0.39104083, 0.3348376,...",0.722861
4,блуза,"[0.5135839, 0.48293516, 0.51696765, 0.6904851,...",0.706147
...,...,...,...
19967,тупик,"[-0.56923324, -0.3954833, -0.06762549, 0.15723...",-0.198147
19968,чудак,"[-0.10894502, -0.10443008, 0.18266293, 0.12245...",-0.205819
19969,генерал,"[-0.34992936, -0.3075621, -0.27499628, -0.1199...",-0.206981
19970,бизнес,"[-0.039911177, 0.31008914, -0.05241037, 0.2310...",-0.211031


In [87]:
from datetime import datetime

nouns['rank'] = nouns.index + 1
nouns['date'] = datetime.today().strftime('%Y-%m-%d')
nouns

Unnamed: 0,word,embeddings,cosine_similarity,rank,date
0,туника,"[0.3229023, 0.41563722, 0.6724296, 0.25873458,...",1.000000,1,2025-08-04
1,туника,"[0.3229023, 0.41563722, 0.6724296, 0.25873458,...",1.000000,2,2025-08-04
2,накидка,"[0.3229023, 0.41563722, 0.6724296, -0.00452661...",0.739892,3,2025-08-04
3,рубаха,"[0.13677494, 0.1374333, 0.39104083, 0.3348376,...",0.722861,4,2025-08-04
4,блуза,"[0.5135839, 0.48293516, 0.51696765, 0.6904851,...",0.706147,5,2025-08-04
...,...,...,...,...,...
19967,тупик,"[-0.56923324, -0.3954833, -0.06762549, 0.15723...",-0.198147,19968,2025-08-04
19968,чудак,"[-0.10894502, -0.10443008, 0.18266293, 0.12245...",-0.205819,19969,2025-08-04
19969,генерал,"[-0.34992936, -0.3075621, -0.27499628, -0.1199...",-0.206981,19970,2025-08-04
19970,бизнес,"[-0.039911177, 0.31008914, -0.05241037, 0.2310...",-0.211031,19971,2025-08-04


In [88]:
result = nouns[['word', 'rank', 'date']]
result.to_csv('words.csv', index=False)