In [2]:
from nltk.stem.snowball import SnowballStemmer
import csv
import re
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem.porter import *

from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader
from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator
# from deeppavlov.models.preprocessors.str_lower import str_lower
from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.sklearn import SklearnComponent
from deeppavlov.metrics.accuracy import sets_accuracy
import numpy as np

stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
stops.add('рис')
stops.add('университет')
stops.add('брянск')

morph=pymorphy2.MorphAnalyzer()
stemmer=SnowballStemmer('russian')

dict_stop=set(['метод','определение','условие','момент','значение','результат','критерий',
               'работа','вариант','брянский государственный университет','научнотехнический вестник',
              'соответствие','такой образ','весь критерий','пример','выбор','ключевое слово','период',
              'уравнение','формула','множитель','повышение','оценка','проведение',
              'машина','нагрузка','брянская область','точка','случай','расчет','таблица','расчёт',
              'с показатель','град','обработка','статья','элемент','раз','применение','центр','форма'])


In [3]:
def treatment_text(review):
    review_text = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", review)
    words = review_text.lower().split()
    words = [w for w in words if not w in stops]
    words = [morph.parse(w)[0].normal_form for w in words]
    words = [stemmer.stem(w) for w in words]
    words = [w for w in words if not w in stops]
    return(words)

In [6]:
# создание dataset/x-статья,y-tag(0,1)
# dataset = [sentence_origin,sentence_tr,tag,kws,[kw_in_sentence]]
dataset = []
k = 'C:/Users/Ирина/PycharmProjects/UIR7sem/venv'
files=[k+'/articles_2019/articles.csv',k+'/articles_2018/articles.csv',k+'/articles_2017/articles.csv']
for file in files:
    with open(file,'r', encoding='utf-8',newline='') as f:
        reader = csv.reader(f,delimiter=',')
        for row in reader:
            
            if row!=[]:
#                 print('eeest')
                kws=''.join(row[0])
                text = ''.join(row[2])
                sentences_origin = text.split('.')
                sentences_tr = list(map(treatment_text,sentences_origin))
                kws_tr = treatment_text(kws)  
                kws_tr_set=set(kws_tr)
                
                for sentence_tr,sentence_or in zip(sentences_tr,sentences_origin):
                    sentence_tr_set=set(sentence_tr)
                    if len(sentence_tr)>5 and kws_tr_set.intersection(sentence_tr_set):
                        dataset.append((sentence_or,' '.join(sentence_tr),'1',kws,' '.join(kws_tr_set.intersection(sentence_tr_set))))
                    elif len(sentence_tr)>5:
                        dataset.append((sentence_or,' '.join(sentence_tr),'0',kws,'None'))
#             else:
#                 print('pusto')
#                 break


In [7]:
# прочитать dataset из csv файла
dr = BasicClassificationDatasetReader().read(
    data_path=k,
    train='dataset.csv',
    x = 'text',
    y = 'tag',
)



In [8]:
# initialize data iterator splitting `train` field to `train` and `valid` in proportion 0.8/0.2
train_iterator = BasicClassificationDatasetIterator(
    data=dr,
    field_to_split='train',  # field that will be splitted
    split_fields=['train', 'valid'],   # fields to which the fiald above will be splitted
    split_proportions=[0.8, 0.2],  #proportions for splitting
    split_seed=23,  # seed for splitting dataset 23
    seed=42)  # seed for iteration over dataset 42

2020-05-06 22:41:44.877 INFO in 'deeppavlov.dataset_iterators.basic_classification_iterator'['basic_classification_iterator'] at line 73: Splitting field <<train>> to new fields <<['train', 'valid']>>


In [10]:
tokenizer = NLTKMosesTokenizer()
train_x_lower_tokenized = tokenizer(train_iterator.get_instances(data_type='train')[0])

In [11]:
# initialize simple vocabulary to collect all appeared in the dataset classes
classes_vocab = SimpleVocabulary(
    save_path='.classes.dict',
    load_path='./classes.dict')
classes_vocab.fit((train_iterator.get_instances(data_type='train')[1]))
classes_vocab.save()

2020-05-06 22:42:06.603 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 98: [saving vocabulary to C:\Users\Ирина\Desktop\keywords\.classes.dict]


In [12]:
# also one can collect vocabulary of textual tokens appeared 2 and more times in the dataset
token_vocab = SimpleVocabulary(
    save_path='./tokens.dict',
    load_path='./tokens.dict',
    min_freq=2,
    special_tokens=('<PAD>', '<UNK>',),
    unk_token='<UNK>')
token_vocab.fit(train_x_lower_tokenized)
token_vocab.save()
token_vocab.freqs.most_common()[:10]

2020-05-06 22:42:06.695 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 112: [loading vocabulary from C:\Users\Ирина\Desktop\keywords\tokens.dict]
2020-05-06 22:42:07.687 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 98: [saving vocabulary to C:\Users\Ирина\Desktop\keywords\tokens.dict]


[('1', 2477),
 ('2', 1728),
 ('0', 1702),
 ('3', 1380),
 ('4', 1163),
 ('5', 951),
 ('элемент', 937),
 ('систем', 928),
 ('государствен', 863),
 ('6', 822)]

In [14]:
# initialize TF-IDF vectorizer sklearn component with `transform` as infer method
tfidf = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./tfidf_v0.pkl',
    load_path='./tfidf_v0.pkl',
    mode='train')
tfidf.fit(train_iterator.get_instances(data_type='train')[0])
tfidf.save()

2020-05-06 22:42:24.134 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 202: Loading model sklearn.feature_extraction.text:TfidfVectorizer from C:\Users\Ирина\Desktop\keywords\tfidf_v0.pkl
2020-05-06 22:42:24.190 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 209: Model sklearn.feature_extraction.textTfidfVectorizer loaded  with parameters
2020-05-06 22:42:24.574 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 108: Fitting model sklearn.feature_extraction.textTfidfVectorizer
2020-05-06 22:42:26.294 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 240: Saving model to C:\Users\Ирина\Desktop\keywords\tfidf_v0.pkl


In [33]:
# get all train and valid data from iterator
x_train, y_train = train_iterator.get_instances(data_type="train")
x_valid, y_valid = train_iterator.get_instances(data_type="valid")


In [34]:
# initialize sklearn classifier, all parameters for classifier could be passed
cls = SklearnComponent(
    model_class="sklearn.linear_model:LogisticRegression",
    infer_method="predict",
    save_path='./logreg_v0.pkl',
    load_path='./logreg_v0.pkl',
    C=1,
    mode='train')

2020-05-06 22:46:19.883 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 202: Loading model sklearn.linear_model:LogisticRegression from C:\Users\Ирина\Desktop\keywords\logreg_v0.pkl
2020-05-06 22:46:19.895 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 209: Model sklearn.linear_model.logisticLogisticRegression loaded  with parameters


In [35]:
# fit sklearn classifier and save it
cls.fit(tfidf(x_train), y_train)
cls.save()

2020-05-06 22:46:24.375 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 108: Fitting model sklearn.linear_model.logisticLogisticRegression
2020-05-06 22:46:24.603 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 240: Saving model to C:\Users\Ирина\Desktop\keywords\logreg_v0.pkl


In [36]:
# with open('model.pickle', 'wb') as f:
#     pickle.dump(cls, f)

In [37]:
# with open('tfidf.pickle', 'wb') as f:
#     pickle.dump(tfidf, f)

In [38]:
pred=cls(tfidf(x_valid))

In [39]:
y_valid = list(map(lambda x: int(x[0]),y_valid))
pred = list(map(lambda x: int(x[0]),pred))
print(classification_report(y_valid, pred, labels=[0, 1]))


              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1362
           1       0.84      0.82      0.83      1172

    accuracy                           0.85      2534
   macro avg       0.85      0.84      0.84      2534
weighted avg       0.85      0.85      0.85      2534



In [30]:
from sklearn.metrics import classification_report


In [None]:
import requests
import pandas as pd
from google.oauth2 import service_account

project_id = 'arctic-task-238719'
private_key='arctic-task-238719-e6a1c5fe056b.json'
from google.cloud import bigquery
credentials = service_account.Credentials.from_service_account_file('./arctic-task-238719-e6a1c5fe056b.json')
from pandas.io import gbq
import pickle
import re
from deeppavlov.models.sklearn import SklearnComponent
from nltk.stem.snowball import SnowballStemmer
import csv
import re
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem.porter import *
import pickle

stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
stops.add('рис')
stops.add('университет')
stops.add('брянск')

morph=pymorphy2.MorphAnalyzer()
stemmer=SnowballStemmer('russian')

def treatment_text(review):
    review_text = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", review)
    words = review_text.lower().split()
    words = [w for w in words if not w in stops]
    words = [morph.parse(w)[0].normal_form for w in words]
    words = [stemmer.stem(w) for w in words]
    words = [w for w in words if not w in stops]
    return(' '.join(words))


with open('dataset.pickle', 'rb') as f:
    df = pickle.load(f)

# print(df['keywords'] .head(100))


# initialize TF-IDF vectorizer sklearn component with `transform` as infer method
# tfidf_knn = SklearnComponent(
#     model_class="sklearn.feature_extraction.text:TfidfVectorizer",
#     infer_method="transform",
#     save_path='./tfidf_knn.pkl',
#     load_path='./tfidf_knn.pkl',
#     mode='train')
#
x = df['keywords'][:-1].values.tolist()
x = tuple(x)
# tfidf_knn.fit(x)
# tfidf_knn.save()
# with open('tfidf_knn.pickle', 'wb') as f:
#     pickle.dump(tfidf_knn, f)

with open('tfidf_knn.pickle', 'rb') as f:
    tfidf_knn = pickle.load(f)
y = tfidf_knn(('физика частицы квант взрыв',))
print(y)
#
# # from sklearn.neighbors import NearestNeighbors
# # x_pre =tfidf_knn(x)
#
# # neigh = NearestNeighbors(n_neighbors=5).fit(x_pre)
# x_pre = x_pre.toarray()
# if x_pre[0].all()==x_pre[1000].all():
#     print(True)
# answer=neigh.kneighbors(y)
# print(neigh.kneighbors(y))
# print(len(x_pre))
# print('x_pre[i]',x_pre[10000])
#
# for i in range(x_pre.shape[0]):
#     if x_pre[i]==answer[0]:
#         print(x[i])
# # print(tfidf_knn.inverse_transform(y))

#
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.neighbors import NearestNeighbors
# import pandas as pd
# import numpy as np
#
# count_vectorizer = CountVectorizer()
# #Apply this vectorizer to text to get a sparse matrix of counts
# count_matrix = count_vectorizer.fit_transform(df['keywords'])
# #Get the names of the features
# features = count_vectorizer.get_feature_names()
# #Create a series from the sparse matrix
# d = pd.Series(count_matrix.toarray().flatten(),index = features).sort_values(ascending=False)
#


In [10]:
import pickle
import pandas as pd
import pickle
import re
from deeppavlov.models.sklearn import SklearnComponent
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem.porter import *


stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
stops.add('рис')
stops.add('университет')
stops.add('брянск')

morph=pymorphy2.MorphAnalyzer()
stemmer=SnowballStemmer('russian')

def treatment_text(review):
    try:
        review_text = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", review)
        words = review_text.lower().split()
        words = [w for w in words if not w in stops]
        words = [morph.parse(w)[0].normal_form for w in words]
        words = [stemmer.stem(w) for w in words]
        words = [w for w in words if not w in stops]
        return(' '.join(words))
    except:
        return review

In [11]:

with open('dataset.pickle', 'rb') as f:
    df = pickle.load(f)
df['title_tr'] = df['title'].apply(treatment_text)
# df['title_kws']=df['title'].apply(treatment_text)+df['keywords']

In [12]:
df['title_kws'] = df['title_tr'] + df['keywords']
df.head(15)

Unnamed: 0,author,title,keywords,title_tr,title_kws
0,Калачев Глеб Вячеславович,О мощностной сложности плоских схем,дискретн математик математическ кибернетик выч...,мощностн сложност плоск схем,мощностн сложност плоск схемдискретн математик...
1,Лутцева Елена Андреевна,Педагогические основы взаимосвязи урочной и вн...,методик преподаван отрасл наук 13 00 02 теор м...,педагогическ основ взаимосвяз урочн внеурочн т...,педагогическ основ взаимосвяз урочн внеурочн т...
2,Малинин Алексей Николаевич,Релятивистские идеи в курсе теоретической физи...,методик преподаван отрасл наук 13 00 02 теор м...,релятивистск иде курс теоретическ физик педвуз,релятивистск иде курс теоретическ физик педвуз...
3,Царева Светлана Евгеньевна,Формирование учебной деятельности младших школ...,методик преподаван отрасл наук 13 00 02 теор м...,формирован учебн деятельн младш школьник обуче...,формирован учебн деятельн младш школьник обуче...
4,Лебедко Валерий Константинович,Формирование пространственных представлений на...,методик преподаван отрасл наук 13 00 02 теор м...,формирован пространствен представлен занят рис...,формирован пространствен представлен занят рис...
5,Подольский Александр Иванович,Организация учебной деятельности школьников пр...,методик преподаван отрасл наук 13 00 02 теор м...,организац учебн деятельн школьник формирован п...,организац учебн деятельн школьник формирован п...
6,Крахоткина Валентина Кузьминична,Учебно-исследовательская работа студентов по м...,методик преподаван отрасл наук 13 00 02 теор м...,учебн исследовательск работ студент методик пр...,учебн исследовательск работ студент методик пр...
7,Никифорова Валентина Михайловна,Совершенствование преподавания электрорадиотех...,методик преподаван отрасл наук 13 00 02 теор м...,совершенствован преподаван электрорадиотехник ...,совершенствован преподаван электрорадиотехник ...
8,Абдукаримов Мамадали,Формирование логических приемов мышления у уча...,методик преподаван отрасл наук 13 00 02 теор м...,формирован логическ мышлен уча 6 8 класс обуче...,формирован логическ мышлен уча 6 8 класс обуче...
9,Ильигорский Юрий Константинович,Особенности организации учебного эксперимента ...,методик преподаван отрасл наук 13 00 02 теор м...,особен организац учебн эксперимент школ углубл...,особен организац учебн эксперимент школ углубл...


In [13]:
with open('dataset.pickle', 'wb') as f:
    pickle.dump(df, f)

In [14]:
with open('dataset.pickle', 'rb') as f:
    df = pickle.load(f)

In [15]:
tfidf_knn = SklearnComponent(
    model_class="sklearn.feature_extraction.text:TfidfVectorizer",
    infer_method="transform",
    save_path='./tfidf_knn.pkl',
    load_path='./tfidf_knn.pkl',
    mode='train')

2020-03-02 13:05:09.852 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 203: Loading model sklearn.feature_extraction.text:TfidfVectorizer from C:\Users\Ирина\PycharmProjects\UIR7sem\venv\tfidf_knn.pkl
2020-03-02 13:05:10.4 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 210: Model sklearn.feature_extraction.textTfidfVectorizer loaded  with parameters


In [17]:
x = df['title_kws'].values.tolist()
x = tuple(x)
tfidf_knn.fit(x)
tfidf_knn.save()

2020-03-02 13:05:59.313 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 109: Fitting model sklearn.feature_extraction.textTfidfVectorizer
2020-03-02 13:06:01.209 INFO in 'deeppavlov.models.sklearn.sklearn_component'['sklearn_component'] at line 241: Saving model to C:\Users\Ирина\PycharmProjects\UIR7sem\venv\tfidf_knn.pkl


In [42]:
x = df['title_kws'].values.tolist()
for i in range(len(x)):
    if isinstance(x[i],str) is False:
        x[i]='NONE'

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(x)

In [21]:
train = tfidf_knn(x).toarray()

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [49]:
 neigh = NearestNeighbors(n_neighbors=5,algorithm='ball_tree').fit(train)

KeyboardInterrupt: 

In [None]:
y = ('математика интегральное исчисление интеграл',)
y = tfidf_knn(y).toarray()


In [58]:
len(y[0])

26058

In [None]:
neigh.kneighbors(y)

In [86]:
neigh = NearestNeighbors(n_neighbors=5).fit(X)
text = treatment_text('колебание коэффициент демпфирования модуль упругость установка')
y = (text,)
y = vectorizer.transform(y)
y =y.toarray()
neigh.kneighbors(y)

(array([[1.24409592, 1.27462216, 1.28910011, 1.28975918, 1.2927336 ]]),
 array([[ 172, 6991, 3049, 1212, 2549]], dtype=int64))

In [87]:
# X.toarray()[9353]
#  array([[9353, 8997, 9269, 9799, 8963]], dtype=int64))

In [91]:
df['title_kws'][3049]

'вычислительн схем линейн программирован перемен коэффициент примененматематическ кибернетик'

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import pandas as pd
import pickle
import re
from deeppavlov.models.sklearn import SklearnComponent
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import pymorphy2
from nltk.stem.porter import *


stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
stops.add('рис')
stops.add('университет')
stops.add('брянск')

morph=pymorphy2.MorphAnalyzer()
stemmer=SnowballStemmer('russian')

def treatment_text(review):
    try:
        review_text = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", review)
        words = review_text.lower().split()
        words = [w for w in words if not w in stops]
        words = [morph.parse(w)[0].normal_form for w in words]
        words = [stemmer.stem(w) for w in words]
        words = [w for w in words if not w in stops]
        return(' '.join(words))
    except:
        return review
from google.oauth2 import service_account
project_id = 'arctic-task-238719'
private_key='arctic-task-238719-e6a1c5fe056b.json'
import json
from google.cloud import bigquery
credentials = service_account.Credentials.from_service_account_file('./arctic-task-238719-e6a1c5fe056b.json')
from pandas.io import gbq

In [8]:
Query = 'SELECT * FROM dataset.search_rsl_ru '
        
df = gbq.read_gbq(Query, project_id, credentials=credentials)

  This is separate from the ipykernel package so we can avoid doing imports until
Downloading: 100%|███████████████████| 10955/10955 [00:04<00:00, 2702.16rows/s]


In [9]:
df['keyword_tr'] = df['keywords'].apply(treatment_text)
df['title_tr'] = df['title'].apply(treatment_text)
df['title_kws'] = df['keyword_tr'] + df['title_tr']


In [10]:
with open('dataset.pickle', 'wb') as f:
    pickle.dump(df, f)

In [11]:
with open('dataset.pickle', 'rb') as f:
    df = pickle.load(f)

In [14]:
x = df['title_kws'].values.tolist()
for i in range(len(x)):
    if isinstance(x[i],str) is False:
        x[i]='NONE'

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(x)

In [16]:
neigh = NearestNeighbors(n_neighbors=5).fit(X)
text = treatment_text('колебание коэффициент демпфирования модуль упругость установка')
y = (text,)
y = vectorizer.transform(y)
y =y.toarray()
neigh.kneighbors(y)

(array([[1.24753628, 1.26874884, 1.2740085 , 1.27424685, 1.29116827]]),
 array([[ 172, 6991, 9400, 2677, 3049]], dtype=int64))

In [17]:
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('vectorizer.pickle', 'rb') as f:
    vectorizer = pickle.load(f)
with open('knn.pickle', 'wb') as f:
    pickle.dump(neigh, f)
with open('knn.pickle', 'rb') as f:
    neigh = pickle.load(f)

In [72]:
with open('train.pickle', 'wb') as f:
    pickle.dump(X, f)
with open('train.pickle', 'rb') as f:
    X = pickle.load(f)


In [19]:

# with open('vectorizer.pickle', 'rb') as f:
#     vectorizer = pickle.load(f)

# with open('knn.pickle', 'rb') as f:
#     neigh = pickle.load(f)
def find_similar(text,kws,ann):

    s = kws+ann
    s = treatment_text(s)
    s = vectorizer.transform([s])
    s = s.toarray()
    result = neigh.kneighbors(y)[1][0]
    r =[]
    for i in result:
        r.append(df.loc[i][['author','title','keywords']].values.tolist())
    return r
r =find_similar('колебание коэффициент демпфирования модуль упругость установка','','')

Unnamed: 0,1,2,3
0,Перминова Мария Юрьевна,Алгоритмы и программный модуль получения явных...,"теоретические основы информатики, физико-матем..."
1,Фаворская Алена Владимировна,Метод исследования пространственных волновых я...,"математическое моделирование, численные методы..."
2,Финошин Александр Викторович,Адаптивное управление нелинейными колебаниями,"системный анализ, управление и обработка инфор..."
3,Глотова Людмила Сергеевна,Рассеяние энергии механических колебаний в мяг...,физика магнитных явлений
4,Мартынов Анатолий Поликарпович,Вычислительные схемы линейного программировани...,математическая кибернетика


In [70]:
from google.oauth2 import service_account
from google.cloud import bigquery
import pandas as pd
import pandas_gbq as gbq

project_id = 'arctic-task-238719'
private_key='arctic-task-238719-e6a1c5fe056b.json'
credentials = service_account.Credentials.from_service_account_file('./arctic-task-238719-e6a1c5fe056b.json')

pandas_gbq.context.credentials = credentials
pandas_gbq.context.project = project_id


def upload_user_bd(list_of_lists,username):
    try:
        df = pd.DataFrame(list_of_lists, columns=['authors','title','keywords'])
        gbq.to_gbq(df,'dataset.'+username, project_id , if_exists = 'append'   )
        return True
    except TransportError:
        return False
    except:
        return -1

In [71]:
upload_user_bd(r,'kirill')

1it [00:06,  6.54s/it]


True

In [85]:
from google.oauth2 import service_account
project_id = 'arctic-task-238719'
private_key='arctic-task-238719-e6a1c5fe056b.json'
import json
from google.cloud import bigquery
credentials = service_account.Credentials.from_service_account_file('./arctic-task-238719-e6a1c5fe056b.json')
from pandas.io import gbq
from sklearn.feature_extraction.text import TfidfVectorizer

def similar_articles_from_user_library(username,text,kws,ann):
    try:
        Query = 'SELECT * FROM dataset.'+username
        df = gbq.read_gbq(Query, project_id, credentials=credentials)
        df['title_kws'] = df['title']+df['keywords']
        df['title_kws'] = df['title_kws'].apply(treatment_text)
        x = df['title_kws'].values.tolist()
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(x)
        neigh = NearestNeighbors(n_neighbors=5).fit(X)
        s = treatment_text(kws+ann)
        s = vectorizer.transform([s]).toarray()
        result = neigh.kneighbors(s)[1][0]
        r =[]
        for i in result:
            r.append(df.loc[i][['authors','title','keywords']].values.tolist())
        return r
    except TransportError:
        return False
    except:
        return -1

    

In [105]:
# # search_articles
# import re
# from nltk.corpus import stopwords
# import pymorphy2
# from google.oauth2 import service_account
# project_id = 'arctic-task-238719'
# private_key='arctic-task-238719-e6a1c5fe056b.json'
# import json
# from google.cloud import bigquery
# credentials = service_account.Credentials.from_service_account_file('./arctic-task-238719-e6a1c5fe056b.json')
# from pandas.io import gbq
# stops = set(stopwords.words("english")) | set(stopwords.words("russian"))
# import pandas as pd
# morph=pymorphy2.MorphAnalyzer()

# def search(word='',mode='title'):
#     word = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", word)
#     words = word.lower().split()
#     words = [w for w in words if not w in stops]
#     if words=='':
#         return "некорректный ввод"

#     if mode=='author':
#         Query = 'SELECT * FROM dataset.search_rsl_ru WHERE AUTHOR LIKE \''
#         for word in words:
#             Query+='%{}'.format(word)
#         Query += '%\''
#         df = gbq.read_gbq(Query, project_id, credentials=credentials)

#     if mode=='title':
#         words = [morph.parse(w)[0].normal_form for w in words]
#         Query = 'SELECT * FROM dataset.search_rsl_ru WHERE TITLE LIKE \''
#         for word in words:
#             Query+='%{}'.format(word)
#         Query += '%\''
#         df = gbq.read_gbq(Query, project_id, credentials=credentials)

#     if mode=='kws':
#         words = [morph.parse(w)[0].normal_form for w in words]
#         Query = 'SELECT * FROM dataset.search_rsl_ru WHERE KEYWORDS LIKE \''
#         for word in words:
#             Query+='%{}'.format(word)
#         Query +='%\''
#         df = gbq.read_gbq(Query, project_id, credentials=credentials)
#     result = df.values.tolist()
#     if result==[]:
#         return False
#     else:
#         return result

# search(mode='kws',word='колебания')



Downloading: 0rows [00:01, ?rows/s]


False

In [143]:

stemmer=SnowballStemmer('russian')

def search_user_library(username,q='',mode='title'):
    try:
        q = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", q)
        q = q.lower()
        words=q.split()
        words = [w for w in words if not w in stops]
        words = [stemmer.stem(w) for w in words]
        if words=='':
            return "некорректный ввод"

        if mode=='author':
            Query = 'SELECT * FROM dataset.'+username+' WHERE AUTHORS LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.'+username+' WHERE AUTHORS LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)

        if mode=='title':
    #         words = [morph.parse(w)[0].normal_form for w in words]
            Query = 'SELECT * FROM dataset.'+username+' WHERE TITLE LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.'+username+' WHERE TITLE LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)

        if mode=='kws':
    #         words = [morph.parse(w)[0].normal_form for w in words]
            Query = 'SELECT * FROM dataset.'+username+' WHERE KEYWORDS LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.'+username+' WHERE KEYWORDS LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)
        result = df.values.tolist()
        if result==[]:
            return 'не найдено'
        else:
            return result
    except TransportError:
        return False
    except:
        return -1

search_user_library(username='kirill',mode='kws',q='колебания механических')

# search_user_library(username='kirill',mode='title',word='физика ')

SELECT * FROM dataset.kirill WHERE KEYWORDS LIKE '%колебан%механическ%'


Downloading: 100%|█████████████████████████████| 3/3 [00:01<00:00,  2.65rows/s]


[['Финошин Александр Викторович', 'Адаптивное управление нелинейными колебаниями ', 'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,применение эвм'], ['Финошин Александр Викторович', 'Адаптивное управление нелинейными колебаниями ', 'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,применение эвм'], ['Финошин Александр Викторович', 'Адаптивное управление нелинейными колебаниями ', 'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,применение эвм'

[['Финошин Александр Викторович',
  'Адаптивное управление нелинейными колебаниями ',
  'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,применение эвм'],
 ['Финошин Александр Викторович',
  'Адаптивное управление нелинейными колебаниями ',
  'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,применение эвм'],
 ['Финошин Александр Викторович',
  'Адаптивное управление нелинейными колебаниями ',
  'системный анализ, управление и обработка информации, физико-математические науки,механика,теоретическая механика,динамика,динамика системы точек и твердого тела,колебания механических систем,нелинейные колебания механических систем,п

In [141]:
stemmer.stem('колебания механических')

'колебания механическ'

In [146]:

stemmer=SnowballStemmer('russian')

def search(q='',mode='title'):
    try:
        q = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", q)
        q = q.lower()
        words=q.split()
        words = [w for w in words if not w in stops]
        words = [stemmer.stem(w) for w in words]
        if words=='':
            return "некорректный ввод"

        if mode=='author':
            Query = 'SELECT * FROM dataset.search_rsl_ru WHERE AUTHORS LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.search_rsl_ru WHERE AUTHORS LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)

        if mode=='title':
    #         words = [morph.parse(w)[0].normal_form for w in words]
            Query = 'SELECT * FROM dataset.search_rsl_ru WHERE TITLE LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.search_rsl_ru WHERE TITLE LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)

        if mode=='kws':
    #         words = [morph.parse(w)[0].normal_form for w in words]
            Query = 'SELECT * FROM dataset.search_rsl_ru WHERE KEYWORDS LIKE \''
            for word in words:
                Query+='%{}'.format(word)
            Query += '%\''
            print(Query)
            df = gbq.read_gbq(Query, project_id, credentials=credentials)
            print(df.values.tolist())
            if df.values.tolist()==[]:
                Query = 'SELECT * FROM dataset.search_rsl_ru WHERE KEYWORDS LIKE \'%{}%\''.format(q)
                print(Query)
                df = gbq.read_gbq(Query, project_id, credentials=credentials)
        result = df.values.tolist()
        if result==[]:
            return 'не найдено'
        else:
            return result
    except TransportError:
        return False
    except:
        return -1

# search(mode='kws',q='колебания механических')

# search_user_library(username='kirill',mode='title',word='физика ')