In [1]:
from nltk.stem.snowball import SnowballStemmer
import csv
import re
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem.porter import *

from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.sklearn import SklearnComponent
from deeppavlov.metrics.accuracy import sets_accuracy
import numpy as np

# Combined English + Russian NLTK stop words, extended with corpus-specific noise tokens.
stops = set(stopwords.words("english")).union(stopwords.words("russian"))
stops.update(('рис', 'университет', 'брянск'))

# Russian morphological analyzer (used for lemmatization) and Snowball stemmer.
morph = pymorphy2.MorphAnalyzer()
stemmer = SnowballStemmer('russian')

# Set of generic words and phrases.
# NOTE(review): `dict_stop` is not referenced by any cell visible here — confirm it is still needed.
dict_stop = {
    'метод', 'определение', 'условие', 'момент', 'значение', 'результат', 'критерий',
    'работа', 'вариант', 'брянский государственный университет', 'научнотехнический вестник',
    'соответствие', 'такой образ', 'весь критерий', 'пример', 'выбор', 'ключевое слово', 'период',
    'уравнение', 'формула', 'множитель', 'повышение', 'оценка', 'проведение',
    'машина', 'нагрузка', 'брянская область', 'точка', 'случай', 'расчет', 'таблица', 'расчёт',
    'с показатель', 'град', 'обработка', 'статья', 'элемент', 'раз', 'применение', 'центр', 'форма',
}


[nltk_data] Downloading package punkt to /home/kirill/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kirill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/kirill/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/kirill/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [2]:
def treatment_text(review):
    """Normalize raw text into a list of stemmed tokens with stop words removed.

    Processing steps:
      1. Replace every character that is not a Cyrillic/Latin letter or a digit
         with a space.
      2. Lowercase and split on whitespace.
      3. Drop tokens found in ``stops``.
      4. Take the normal form of the first ``morph.parse`` result for each token.
      5. Stem each normal form with ``stemmer``.
      6. Drop tokens found in ``stops`` again, since the normalized/stemmed
         form may itself be a stop word.

    Args:
        review: Raw text to process.

    Returns:
        List of processed tokens, in their original order.
    """
    # 'ё'/'Ё' (U+0451/U+0401) lie outside the 'а-я'/'А-Я' ranges, so they must
    # be listed explicitly; otherwise a word such as 'расчёт' is cut in two.
    review_text = re.sub("[^а-яА-ЯёЁa-zA-Z0-9]", " ", review)
    tokens = [w for w in review_text.lower().split() if w not in stops]
    tokens = [stemmer.stem(morph.parse(w)[0].normal_form) for w in tokens]
    return [w for w in tokens if w not in stops]

In [4]:
# Build the dataset: x is an article sentence, y is a tag ('0' or '1').
# Each item is (sentence_origin, sentence_tr, tag, kws, keywords_found_in_sentence).
dataset = []
files = ['./articles_2019/articles.csv', './articles_2018/articles.csv', './articles_2017/articles.csv']
for file in files:
    with open(file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            # Skip empty CSV rows.
            if not row:
                continue

            # Column 0 holds the keywords string, column 2 the article text.
            kws = row[0]
            text = row[2]
            kws_tr = treatment_text(kws)
            kws_tr_set = set(kws_tr)

            for sentence_or in text.split('.'):
                sentence_tr = treatment_text(sentence_or)
                # Only sentences with more than 5 processed tokens are kept.
                if len(sentence_tr) <= 5:
                    continue

                # Tag '1' when the sentence shares at least one processed token
                # with the keywords, otherwise tag '0'.
                sentence_tr_set = set(sentence_tr)
                matched = kws_tr_set.intersection(sentence_tr_set)
                if matched:
                    dataset.append((sentence_or, ' '.join(sentence_tr), '1', kws, ' '.join(matched)))
                else:
                    dataset.append((sentence_or, ' '.join(sentence_tr), '0', kws, 'None'))


In [5]:
# Read the dataset from a CSV file; `x` and `y` name the 'text' and 'tag' columns.
# NOTE(review): no cell visible here writes 'dataset.csv' — confirm it is produced
# from `dataset` before this cell runs on a fresh kernel.
dataset_reader = BasicClassificationDatasetReader()
dr = dataset_reader.read(data_path='./', train='dataset.csv', x='text', y='tag')



In [6]:
# Create the data iterator, splitting the `train` field into `train` and `valid`
# in a 0.8/0.2 proportion.
iterator_params = dict(
    field_to_split='train',            # field that will be split
    split_fields=['train', 'valid'],   # fields the field above is split into
    split_proportions=[0.8, 0.2],      # proportions for splitting
    split_seed=23,                     # seed for splitting the dataset
    seed=42,                           # seed for iteration over the dataset
)
train_iterator = BasicClassificationDatasetIterator(data=dr, **iterator_params)

2020-03-01 23:59:57.881 INFO in 'deeppavlov.dataset_iterators.basic_classification_iterator'['basic_classification_iterator'] at line 74: Splitting field <<train>> to new fields <<['train', 'valid']>>


In [7]:
# Tokenize the training texts, then lowercase the result.
tokenizer = NLTKMosesTokenizer()
train_texts = train_iterator.get_instances(data_type='train')[0]
train_x_lower_tokenized = str_lower(tokenizer(train_texts))

In [8]:
# Initialize a simple vocabulary that collects every class label appearing in
# the training split.
# save_path and load_path must name the same file: the previous save_path
# '.classes.dict' (missing '/') wrote a hidden file that load_path
# './classes.dict' could never find.
classes_vocab = SimpleVocabulary(
    save_path='./classes.dict',
    load_path='./classes.dict')
classes_vocab.fit(train_iterator.get_instances(data_type='train')[1])
classes_vocab.save()

2020-03-02 00:00:00.71 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 101: [saving vocabulary to /home/kirill/2019-2-Atom-Backend-K-Kondratenya/key_words/.classes.dict]


In [9]:
# Vocabulary of textual tokens that appear two or more times in the training split.
token_vocab = SimpleVocabulary(
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>',
    min_freq=2,
    save_path='./tokens.dict',
    load_path='./tokens.dict',
)
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
# Display the ten most frequent tokens.
token_vocab.freqs.most_common()[:10]

2020-03-02 00:00:00.495 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 101: [saving vocabulary to /home/kirill/2019-2-Atom-Backend-K-Kondratenya/key_words/tokens.dict]


[('1', 2477),
 ('2', 1728),
 ('0', 1702),
 ('3', 1380),
 ('4', 1163),
 ('5', 951),
 ('элемент', 937),
 ('систем', 928),
 ('государствен', 863),
 ('6', 822)]

In [10]:
# TF-IDF vectorizer wrapped as a sklearn component; `transform` is the infer method.
tfidf = SklearnComponent(
    mode='train',
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    load_path='./tfidf_v0.pkl',
    save_path='./tfidf_v0.pkl',
)
# Fit on the lowercased training texts, then persist the fitted vectorizer.
train_texts_lower = str_lower(train_iterator.get_instances(data_type='train')[0])
tfidf.fit(train_texts_lower)
tfidf.save()

2020-03-02 00:00:01.96 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 166: Initializing model sklearn.feature_extraction.text:TfidfVectorizer from scratch
2020-03-02 00:00:01.304 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 109: Fitting model sklearn.feature_extraction.text:TfidfVectorizer
2020-03-02 00:00:01.868 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 241: Saving model to /home/kirill/2019-2-Atom-Backend-K-Kondratenya/key_words/tfidf_v0.pkl


In [11]:
# Pull the complete train and valid splits out of the iterator.
train_split = train_iterator.get_instances(data_type="train")
valid_split = train_iterator.get_instances(data_type="valid")
x_train, y_train = train_split
x_valid, y_valid = valid_split


In [12]:
# Logistic-regression classifier wrapped as a sklearn component; extra keyword
# arguments (here C=1) are classifier parameters, and `predict` is the infer method.
cls = SklearnComponent(
    mode='train',
    model_class="sklearn.linear_model:LogisticRegression",
    infer_method="predict",
    C=1,
    load_path='./logreg_v0.pkl',
    save_path='./logreg_v0.pkl',
)

2020-03-02 00:00:03.540 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 166: Initializing model sklearn.linear_model:LogisticRegression from scratch


In [13]:
# Vectorize the training texts with the TF-IDF component, fit the classifier on
# the result, and save it.
train_features = tfidf(x_train)
cls.fit(train_features, y_train)
cls.save()

2020-03-02 00:00:05.524 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 109: Fitting model sklearn.linear_model:LogisticRegression
2020-03-02 00:00:05.643 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 241: Saving model to /home/kirill/2019-2-Atom-Backend-K-Kondratenya/key_words/logreg_v0.pkl


In [16]:
import pickle

In [17]:
# Pickle the classifier component to 'cls_model.pickle'.
with open('cls_model.pickle', mode='wb') as model_file:
    pickle.dump(cls, model_file)

In [18]:
# Pickle the TF-IDF component to 'tf_idf_model.pickle'.
with open('tf_idf_model.pickle', mode='wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)