**Проверка качества RAKE**

In [19]:
from rake_nltk import Rake, Metric
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
import pymorphy2
import csv
morph = pymorphy2.MorphAnalyzer()
stemmer=SnowballStemmer('russian')
from natasha import NamesExtractor
from natasha.markup import show_markup, show_json
for i in ['результат','др','место','небходимость','рисунок','вывод','метр','фанат','осторожность','контент','таблица','схема']:
    stops.add(i)

In [29]:
def def_names(text):
    extractor = NamesExtractor()

    matches = extractor(text)
    facts = [_.fact.as_json for _ in matches]

    names = []
    for i in range(len(facts)):
        if 'first' in facts[i]:
            x = facts[i]['first'].lower()
            names.append(x)
        if 'middle' in facts[i]:
            x = facts[i]['middle'].lower()
            names.append(x)
        if 'last' in facts[i]:
            x = facts[i]['last'].lower()
            names.append(x)
    return names


class RAKE():

    def keywords_extract(self,text):
        names = def_names(text)
        for i in names:
            text.replace(i, '')
        # разделяем текст на токены и приводим к нижнему регистру
        tokenized_text = text.lower()
        # убираем все лишник символы
        tokenized_text = re.sub("[^а-яА-Яa-zA-Z.?!]", " ", tokenized_text)

        tokenized_text=tokenized_text.split()

        # остаяляем как потенциальные КС только сущ и прил т е в список стоп слов добавляем все остальное
        for i in tokenized_text:
            if 'NOUN' not in morph.parse(i)[0].tag and 'ADJF' not in morph.parse(i)[0].tag:
                stops.add(i)
        # print(tokenized_text)
        tokenized_text = ' '.join(tokenized_text)
        r = Rake(ranking_metric=Metric.WORD_FREQUENCY, stopwords=stops, max_length=2)
        # extract keywords from text
        r.extract_keywords_from_text(tokenized_text)

        for i in r.get_ranked_phrases():
            if 'NOUN' not in morph.parse(i)[0].tag and 'ADJF' not in morph.parse(i)[0].tag:
                del i

        keywords = r.get_ranked_phrases()[:20]

        words = [morph.parse(w)[0].normal_form for w in keywords]
        result = set()
        for i in words:
            if 'NOUN' in morph.parse(i)[0].tag and len(i) > 1:
                result.add(i)
        return result



In [30]:
def treatment_text(review):
    review_text = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", review)
    words = review_text.lower().split()
    words = [w for w in words if not w in stops]
    words = [morph.parse(w)[0].normal_form for w in words]
    words = [stemmer.stem(w) for w in words]
    words = [w for w in words if not w in stops]
    return(words)

In [31]:
r = RAKE()

In [40]:
# создание dataset/x-статья,y-tag(0,1)
# dataset = [sentence_origin,sentence_tr,tag,kws,[kw_in_sentence],text]
result = []
file='test_sample_kws.csv'
k=0

with open(file,'r', encoding='cp1251',newline='') as f:
    reader = csv.reader(f,delimiter=';')
    for row in reader:
        if row!=[]:
#             print(k)
            k+=1
            kws_true =''.join(row[0])
            ann = ''.join(row[1])
            text = ''.join(row[2])
            if text!='':
                kws_pred = r.keywords_extract(text=text)
                result.append([kws_true,kws_pred])
            
                
                

In [39]:
len_true = []
len_pred = []
len_intersection = []

for i in range(len(result)-1):
    kws_true = set(treatment_text(result[i][0])) 
    kws_pred = set(treatment_text(' '.join(result[i][1]))) 
    len_true.append(len(kws_true))
    len_pred.append(len(kws_pred))
    len_intersection.append(len(kws_true.intersection(kws_pred)))
#     print(result[i][0])
#     print(result[i][1])
#     print(len(kws_true.intersection(kws_pred)))


recall =sum(len_intersection)/sum(len_true)
precision = sum(len_intersection)/sum(len_pred)
print(f'precision    {precision}')
print(f'recall       {recall}')
print(f'f_measure    {2*precision*recall/(precision+recall)}')

precision    0.15594855305466238
recall       0.24744897959183673
f_measure    0.19132149901380668
