In [258]:
import pandas as pd
import os

In [259]:
import pymorphy2

In [260]:
# Sample questionnaire answers used to exercise the pipeline end to end.
dummy_input = dict(
    goal_name='обновление устаревшей информации',
    goal_result='Да, четко',
    goal_type='Развить имеющиеся знания и навыки',
    goal_first_step='поиск необходимой информации',
    goal_domain='Прикладные знания и навыки, ручной труд',
    goal_obstacle='Не вижу преград',
    goal_time='Нет жестких сроков',
)

In [261]:
# Build a one-row frame: keys become rows first, then the transpose turns them into columns.
answers_by_field = pd.DataFrame.from_dict(dummy_input, orient='index')
input_df = answers_by_field.T

In [262]:
# input_df

In [263]:
# create features

In [264]:
def you_know_first_steps(x):
    """Return 1 when the first-step answer looks known, otherwise 0.

    Args:
        x: The raw answer. Normally a string; None, an empty string and NaN
            are treated as a missing answer.

    Returns:
        0 if the answer is missing or contains 'не знаю' (case-insensitive),
        1 otherwise.
    """
    # `x != x` is only true for NaN, which is truthy and would otherwise
    # slip past the emptiness check.
    if not x or x != x:
        return 0
    # Coerce with str() like the sibling feature functions do, so a
    # non-string cell does not raise AttributeError on .lower().
    return 0 if 'не знаю' in str(x).lower() else 1

In [265]:
# Binary feature: does the respondent know the first step?
input_df['are_first_steps_known'] = input_df['goal_first_step'].apply(you_know_first_steps)

In [266]:
# Substrings whose presence in the deadline answer marks it as a concrete time span.
# Note that 'год ' carries a trailing space, so it only matches when followed by a space.
time_related = [
    'лет',
    'год ',
    'меся',
    'недел',
    'дне',
    'года',
]

In [267]:
def is_time_certain(x):
    """Return 1 if the answer mentions any marker from `time_related`, else 0.

    Falsy answers (None, empty string) yield 0. The answer is converted with
    str() and lower-cased before the substring search.
    """
    if not x:
        return 0
    lowered = str(x).lower()
    return int(any(marker in lowered for marker in time_related))

In [268]:
# Binary feature: does the deadline answer name a concrete time span?
input_df['is_time_certain'] = input_df['goal_time'].apply(is_time_certain)

In [269]:
def certainly_imagined(x):
    """Return 1 when the answer contains ' четко' (case-insensitive), else 0.

    Args:
        x: The raw answer. Falsy values (None, empty string) yield 0.

    Returns:
        1 if the lower-cased text contains ' четко', otherwise 0.
    """
    if not x:
        return 0
    # The marker keeps its leading space, so 'нечетко' does not match.
    # str() matches the sibling feature functions and keeps a non-string cell
    # (e.g. NaN) from raising AttributeError on .lower().
    return 1 if ' четко' in str(x).lower() else 0

In [270]:
# Binary feature: is the result clearly imagined?
input_df['is_certainly_imagined'] = input_df['goal_result'].apply(certainly_imagined)

In [271]:
def are_obstackles_expected(x):
    """Return 1 if the answer describes an obstacle, 0 otherwise.

    Falsy answers yield 0. An answer whose lower-cased text contains
    'не вижу преград' or the substring 'нет' also yields 0.
    """
    if not x:
        return 0
    answer = str(x).lower()
    no_obstacle_markers = ('не вижу преград', 'нет')
    if any(marker in answer for marker in no_obstacle_markers):
        return 0
    return 1

In [272]:
# Binary feature: does the respondent expect obstacles?
input_df['are_obstackles_expected'] = input_df['goal_obstacle'].apply(are_obstackles_expected)

In [273]:
# The raw answers are no longer needed once their binary features exist.
raw_answer_columns = ['goal_result', 'goal_first_step', 'goal_obstacle', 'goal_time']
input_df.drop(columns=raw_answer_columns, inplace=True)

In [274]:
# Helper column holding a single space, used as the separator when
# concatenating goal_name and goal_type.
input_df['space'] = ' '

In [275]:
# Join the goal name and goal type into one text field, separated by a space.
input_df['name_type'] = (
    input_df['goal_name']
    .add(input_df['space'])
    .add(input_df['goal_type'])
)

In [276]:
# The parts are merged into name_type, so they and the helper separator go away.
merged_source_columns = ['goal_name', 'goal_type', 'space']
input_df.drop(columns=merged_source_columns, inplace=True)

In [277]:
# NLP

In [278]:
# Lower-case both text fields (values are passed through str() first).
for text_column in ('goal_domain', 'name_type'):
    input_df[text_column] = input_df[text_column].apply(lambda raw: str(raw).lower())

In [279]:
# Characters deleted outright from the text fields.
specials_to_remove = ['.', '"', "'", '?', '(', ')', '`']

In [280]:
def remove_special(x):
    """Delete every character listed in `specials_to_remove` from `x`.

    After each character that was actually present is removed, the string is
    stripped of leading and trailing whitespace. A string with none of the
    characters is returned unchanged (not stripped).
    """
    cleaned = x
    for char in specials_to_remove:
        if char not in cleaned:
            continue
        cleaned = cleaned.replace(char, '').strip()
    return cleaned

In [281]:
# Strip the removable special characters from both text fields.
for text_column in ('goal_domain', 'name_type'):
    input_df[text_column] = input_df[text_column].apply(remove_special)

In [282]:
# Characters that act as word separators and are turned into spaces.
specials_to_replace = ['-', '\\', '/', ',']

In [283]:
def replace_special(x):
    """Replace every character listed in `specials_to_replace` with a space.

    After each character that was actually present is replaced, the string is
    stripped of leading and trailing whitespace. Interior runs of spaces
    produced by the replacement are left as they are.
    """
    result = x
    for char in specials_to_replace:
        if char not in result:
            continue
        result = result.replace(char, ' ').strip()
    return result

In [284]:
# Turn separator-like characters into spaces in both text fields.
for text_column in ('goal_domain', 'name_type'):
    input_df[text_column] = input_df[text_column].apply(replace_special)

In [285]:
def create_list_of_words(x):
    """Split `x` on single space characters and return the pieces as a list.

    Consecutive spaces produce empty-string items, because the split uses an
    explicit ' ' separator.
    """
    separator = ' '
    return x.split(separator)

In [286]:
# Tokenise both text fields into lists of words.
input_df['goal_domain_LoW'] = input_df['goal_domain'].apply(create_list_of_words)
input_df['name_type_LoW'] = input_df['name_type'].apply(create_list_of_words)

In [287]:
# Shared pymorphy2 analyzer; clean_LoW_nv and pos_counter use it to parse words.
morph = pymorphy2.MorphAnalyzer()

In [288]:
# Part-of-speech tags whose words are kept by clean_LoW_nv.
key_pos = [
    'NOUN',
    'VERB',
    'NUMR',
    'ADJF',
    'ADJS',
    'INFN',
]

In [289]:
def clean_LoW_nv(x):
    """Reduce a list of words to lemmas of the key parts of speech.

    Args:
        x: Iterable of word strings.

    Returns:
        A single space-joined string. Tokens made only of digits are kept
        as they are. Every other token is parsed with `morph`; the normal
        form of its first parse is kept when the tag contains any entry of
        `key_pos`, and the token is dropped otherwise.
    """
    kept = []
    for word in x:
        if word.isdigit():
            kept.append(word)
            continue
        parsed = morph.parse(word)[0]
        tag = parsed.tag
        # any() appends the lemma at most once per word, even if the tag
        # were to match several entries of key_pos.
        if any(wanted in tag for wanted in key_pos):
            kept.append(parsed.normal_form)
    return ' '.join(kept)

In [290]:
# Lemmatised, part-of-speech-filtered text for both fields.
input_df['goal_domain_clean_NV_LoW'] = input_df['goal_domain_LoW'].apply(clean_LoW_nv)
input_df['name_type_clean_NV_LoW'] = input_df['name_type_LoW'].apply(clean_LoW_nv)

In [291]:
# New features

In [292]:
def word_counter(x):
    """Return the number of items in the word list `x`."""
    n_items = len(x)
    return n_items

In [293]:
# Number of tokens in each tokenised field.
input_df['topic_words'] = input_df['goal_domain_LoW'].apply(word_counter)
input_df['goal_words'] = input_df['name_type_LoW'].apply(word_counter)

In [294]:
def letters_counter(x):
    """Return the total number of characters across all words in `x`."""
    return sum(len(word) for word in x)

In [295]:
# Total character count of each tokenised field.
input_df['topic_letters'] = input_df['goal_domain_LoW'].apply(letters_counter)
input_df['goal_letters'] = input_df['name_type_LoW'].apply(letters_counter)

In [296]:
# Average word length = letters / words, rounded to two decimals.
input_df['topic_aver_word_len'] = input_df['topic_letters'].div(input_df['topic_words']).round(2)
input_df['goal_aver_word_len'] = input_df['goal_letters'].div(input_df['goal_words']).round(2)

In [297]:
def pos_counter(x, pos_to_comp):
    """Count tag matches for the given parts of speech across the words in `x`.

    Each word is parsed with `morph` and only its first parse is used. The
    count grows by one for every entry of `pos_to_comp` found in that tag.
    """
    total = 0
    for token in x:
        tag = morph.parse(token)[0].tag
        total += sum(1 for wanted in pos_to_comp if wanted in tag)
    return total

In [298]:
# Number of verbs (finite or infinitive) in the goal text.
input_df['goal_verbs_counter'] = input_df['name_type_LoW'].apply(pos_counter, args=(['VERB', 'INFN'],))

In [299]:
# Number of nouns in the goal text.
input_df['goal_nouns_counter'] = input_df['name_type_LoW'].apply(pos_counter, args=(['NOUN'],))

In [300]:
# Number of numerals in the goal text.
input_df['goal_numr_counter'] = input_df['name_type_LoW'].apply(pos_counter, args=(['NUMR'],))

In [301]:
# Number of adjectives (full or short form) in the goal text.
input_df['goal_adj_counter'] = input_df['name_type_LoW'].apply(pos_counter, args=(['ADJF', 'ADJS'],))

In [302]:
def digit_counter(x):
    """Return how many items of `x` consist solely of digit characters."""
    return sum(1 for token in x if token.isdigit())

In [303]:
# Number of purely numeric tokens in the goal text.
input_df['goal_digit_counter'] = input_df['name_type_LoW'].apply(digit_counter)

In [304]:
# Create 3 datasets

In [305]:
# Display the column labels available after feature engineering.
input_df.columns

Index(['goal_domain', 'are_first_steps_known', 'is_time_certain',
       'is_certainly_imagined', 'are_obstackles_expected', 'name_type',
       'goal_domain_LoW', 'name_type_LoW', 'goal_domain_clean_NV_LoW',
       'name_type_clean_NV_LoW', 'topic_words', 'goal_words', 'topic_letters',
       'goal_letters', 'topic_aver_word_len', 'goal_aver_word_len',
       'goal_verbs_counter', 'goal_nouns_counter', 'goal_numr_counter',
       'goal_adj_counter', 'goal_digit_counter'],
      dtype='object')

In [306]:
from sklearn.preprocessing import MinMaxScaler

In [307]:
# Features

In [308]:
# Numeric feature columns, in the order they are passed to the scaler.
feature_columns = [
    'are_first_steps_known',
    'is_time_certain',
    'is_certainly_imagined',
    'are_obstackles_expected',
    'topic_words',
    'goal_words',
    'topic_letters',
    'goal_letters',
    'topic_aver_word_len',
    'goal_aver_word_len',
    'goal_verbs_counter',
    'goal_nouns_counter',
    'goal_numr_counter',
    'goal_adj_counter',
    'goal_digit_counter',
]
df_features = input_df[feature_columns]

In [309]:
# df_features

In [310]:
# Scale the numeric features with a freshly created MinMaxScaler.
# NOTE(review): the scaler is fitted on the frame being predicted, which here
# has a single row (features.shape is (1, 15)). With one row every column's
# min equals its max, so the scaled values presumably carry no information
# (all zeros under sklearn's zero-range handling). The scaler fitted at
# training time should probably be loaded and only .transform() called here;
# TODO confirm where that artifact is stored.
mms = MinMaxScaler()
features = mms.fit_transform(df_features.values)

In [338]:
# Display the shape of the scaled feature array.
features.shape

(1, 15)

In [311]:
# Vectors

In [312]:
# Only the name+type text is vectorised for now.
df_vectors = input_df.loc[:, ['name_type_clean_NV_LoW']]

In [313]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [314]:
# Series of cleaned texts handed to the vectoriser.
text = df_vectors.loc[:, 'name_type_clean_NV_LoW']

In [315]:
# `pickle` is first imported much further down the notebook, so import it
# here; otherwise this cell raises NameError on a fresh kernel.
import pickle

# Load the pickled vectoriser from models/tfidf.pkl.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts that come from a trusted source.
with open('models/tfidf.pkl', 'rb') as f:
    tfidf = pickle.load(f)

In [316]:
# Vectorise the cleaned text with the loaded vectoriser (transform only, no refit).
vectors = tfidf.transform(text)

In [317]:
# Display the shape of the vectorised text.
vectors.shape

(1, 4932)

In [318]:
# Neural net

In [319]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [320]:
# Same selection as in the "Vectors" section above, repeated for the neural-net input.
df_vectors = input_df[['name_type_clean_NV_LoW']]

In [321]:
# Series of cleaned texts that will be tokenised for the neural net.
text = df_vectors['name_type_clean_NV_LoW']

In [322]:
# Build a Keras tokenizer and fit its vocabulary on the text.
# NOTE(review): the tokenizer is fitted on the inference text itself, so the
# word indices depend only on this input. Presumably the network loaded
# further down was trained with indices from a tokenizer fitted on the
# training corpus; confirm, and load that tokenizer here instead of refitting.
token = Tokenizer()
token.fit_on_texts(text)

In [323]:
# Convert each text into a sequence of integer word indices.
encoded_text = token.texts_to_sequences(text)

In [324]:
# Pad every sequence at the end ('post') to a fixed length.
# presumably 142 is the input length the network was trained with; TODO confirm
max_length = 142
X = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [325]:
import numpy as np

In [326]:
# Make sure the padded sequences are a NumPy array before prediction.
X = np.asarray(X)

In [327]:
from tensorflow.keras.models import load_model

In [328]:
##########################################

In [329]:
# Load the saved Keras model from models/topic_tool_vect_nn.h5.
topic_tool_vect_nn = load_model('models/topic_tool_vect_nn.h5')

In [330]:
# Predicted class = index of the highest score along the last axis.
nn_class_scores = topic_tool_vect_nn.predict(X)
np.argmax(nn_class_scores, axis=-1)

array([0], dtype=int64)

In [331]:
import pickle

In [332]:
# Load the pickled model that predicts from the text vectors.
with open('models/topic_tool_vect_xgb.pkl', 'rb') as model_file:
    tool_vect_xgb = pickle.load(model_file)

In [333]:
# Predict from the vectorised text with the loaded model.
tool_vect_xgb.predict(vectors)

array([0], dtype=int64)

In [334]:
# Load the pickled model that predicts from the numeric features.
with open('models/specific_feat_xgb.pkl', 'rb') as model_file:
    specific_feat_xgb = pickle.load(model_file)

In [335]:
# Predict from the scaled numeric features with the loaded model.
specific_feat_xgb.predict(features)

array([1], dtype=int64)

In [336]:
# Load the pickled thesaurus object.
with open('models/topics_tesaurus.pickle', 'rb') as thesaurus_file:
    topics_tesaurus = pickle.load(thesaurus_file)

In [337]:
# Display the loaded thesaurus (a dict mapping label names to lists of word stems).
topics_tesaurus

{'label_attractor_knowledge': ['знан',
  'зако',
  'прав',
  'нау',
  'образов',
  'биолог',
  'философ',
  'социолог',
  'изуч',
  'литерат',
  'язык',
  'учеб',
  'экономика',
  'экология',
  'кулинар',
  'энергети',
  'электро',
  'обуч',
  'безопасн',
  'хими',
  'стоматол',
  'педиат',
  'вет',
  'логопед',
  'строител',
  'ремонт',
  'культур',
  'учёб',
  'гомеопат'],
 'label_attractor_hard_skill': ['математи',
  'программирова',
  'ии',
  'ит',
  'it',
  'дизайн',
  'графи',
  'создан',
  'юрис',
  'медицин',
  'психо',
  'педагоги',
  'предпринимател',
  'воспита',
  'разработ',
  'по',
  'прикладн',
  'инженер',
  'hard',
  'информац',
  'иннов',
  'ритор',
  'автомех',
  'флори',
  'фарма',
  'косметол',
  'логист',
  'цифр',
  'машин',
  'металл',
  'механ',
  'гео',
  'экос',
  'агро',
  'меха',
  'дефект',
  'свар',
  'хлебопеч',
  'шахмат',
  'освоен'],
 'label_attractor_soft_skill': ['soft',
  'саморазвит',
  'дисциплин',
  'общен',
  'дипломат',
  'коммуни',
  'soft',
