## Test trained model

In [1]:
import tensorflow as tf 
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Embedding,Bidirectional,LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tree_stem import stem_word, word_to_vec
from tokenize_uk import tokenize_words
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


In [2]:
tf.__version__

'2.14.0'

In [3]:
df = pd.read_csv('messages_dataset.csv') # load dataset
class_name = ['economics', 'politics', 'sports', 'entertainment', 'technology']
ukrainian_stopwords = list(pd.read_csv("stopwords_ua.txt", header=None, names=['stopwords']).stopwords)

In [None]:
def get_dataset_equally(df):
    grouped = df.groupby(['topic'])
    smallest = grouped.count().min().values
    try: # Pandas 1.1.0+
        return grouped.sample(smallest)
    except AttributeError: # Pre-Pandas 1.1.0
        return grouped.apply(lambda df: df.sample(smallest))
df = get_dataset_equally(df)
df.topic.value_counts()

In [11]:

# clean dataset messages
def clean_message(message):
    text = re.sub("https?:\/\/[\w+.\/]+", " ", str(message))
    text = re.sub('[^a-z–∞–±–≤–≥“ë–¥–µ—î–∂–∑–∏—ñ—ó–π–∫–ª–º–Ω–æ–ø—Ä—Å—Ç—É—Ñ—Ö—Ü—á—à—â—å—é—è]', ' ', str(text)).lower()
    sen = re.sub(' +',' ',str(text))
    words = tokenize_words(sen)
    sen = []
    for word in words:
        if word not in ukrainian_stopwords:
            try:
                sen.append(stem_word(word))
            except:
                sen.append(word)

    text = ' '.join(sen)
    return text.strip()


In [21]:
corpus = [] 
x = df.message
y = df.topic

for msg in x:
    corpus.append(clean_message(msg))
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

In [22]:
#load model
model = tf.keras.models.load_model("lstm-bidir-250words-16batch-07-0.95.keras")


In [23]:
text_input = ["""–ù–æ–≤–µ –∫–∞—Ñ–µ Cafe mariia ‚ú®
üìç–î–µ?
–ü–∞–≤–ª—ñ–≤—Å—å–∫–∞, 26/41 (10 —Ö–≤ –≤—ñ–¥ –¶–∏—Ä–∫—É)
–¢—É—Ç —à—É–º–Ω–æ, –ª—é–¥–Ω–æ —ñ –¥—É–∂–µ –∂–∏–≤–æ. –¢–∞–∫–∏–º —ñ –º–∞—î –±—É—Ç–∏ –º—ñ—Å—å–∫–µ –∫–∞—Ñ–µ.
–í—ñ–¥—á—É—Ç—Ç—è, –Ω—ñ–±–∏ –≤—Å—ñ –Ω–∞–π–ø—Ä–µ–∫—Ä–∞—Å–Ω—ñ—à—ñ –ª—é–¥–∏ –ö–∏—î–≤–∞ –∑—ñ–±—Ä–∞–ª–∏—Å—å –≤ –æ–¥–Ω–æ–º—É –º—ñ—Å—Ü—ñ. –¢–≤–æ—Ä—á—ñ, –ª–µ–≥–∫—ñ, —Ç–∞–ª–∞–Ω–æ–≤–∏—Ç—ñ, –∑ –ª–µ—Ç—é—á–∏–º —â–∞—Å–ª–∏–≤–∏–º –ø–æ–≥–ª—è–¥–æ–º —Ç–∞ –º—Ä—ñ—è–º–∏ –ø—Ä–æ –≤–µ–ª–∏–∫–µ –≤ —Ä–æ–∑–º–æ–≤–∞—Ö. –ù–∞–ø—Ä–æ—á—É–¥ –ø—Ä–µ–∫—Ä–∞—Å–Ω–æ –ú–∞—Ä—ñ—ó –≤–¥–∞–ª–æ—Å—å –∑—ñ–±—Ä–∞—Ç–∏ –≤–ª–∞—Å–Ω–µ –∫–æ–º‚Äô—é–Ω—ñ—Ç—ñ, –ø—Ä–æ —è–∫–µ, –º–∞–±—É—Ç—å, –º—Ä—ñ—è–≤ –±–∏ –∫–æ–∂–Ω–∏–π –∑–∞–∫–ª–∞–¥‚ú®
–ü—Ä–∏–º—ñ—â–µ–Ω–Ω—è —Ç–∞ –º–µ–±–ª—ñ –ø–µ—Ä–µ–π—à–ª–∏ —É —Å–ø–∞–¥–æ–∫ –≤—ñ–¥ –∫–∞—Ñ–µ –ó–æ—Ä—è, —è–∫–∞ —Ç—É—Ç –ø—Ä–æ–∂–∏–ª–∞ –º–µ–Ω—à —è–∫ —Ä—ñ–∫. –ó –Ω–æ–≤–æ–≥–æ: –∑‚Äô—è–≤–∏–ª–∞—Å—å –≤–µ–ª–∏–∫–∞ –ø–µ–∫–∞—Ä—Å—å–∫–∞ –≤—ñ—Ç—Ä–∏–Ω–∞ —Ç–∞ —Ç—Ä–æ—Ö–∏ –∑–º—ñ–Ω–∏–ª–æ—Å—å —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è –ø–æ—Å–∞–¥–∫–∏.
–ú–µ–Ω—é –ú–∞—Ä—ñ—ó ‚Äî –∫–æ–º—Ñ–æ—Ä—Ç—ñ–∫. –¢—É—Ç –ª–µ–≥–∫–æ –∑–æ—Ä—ñ—î–Ω—Ç—É–≤–∞—Ç–∏—Å—å, –±–æ –≤—Å—ñ –ø–æ–∑–∏—Ü—ñ—ó –¥–∞–≤–Ω–æ –∑–Ω–∞–π–æ–º—ñ. –ê–ª–µ –∫–ª–∞—Å–Ω–æ, —â–æ —É —Å—Ç–≤–æ—Ä–µ–Ω–Ω—ñ —Å—Ç—Ä–∞–≤ –¥–æ–ø–æ–º–∞–≥–∞–ª–∏ —à–µ—Ñ–∏ —ñ —Ç–æ–º—É –Ω–∞–≤—ñ—Ç—å –∑–≤–∏—á–Ω—ñ —Ä–µ—Ü–µ–ø—Ç–∏ –∑–∞–≥—Ä–∞–ª–∏ –Ω–æ–≤–∏–º–∏ —Ñ–∞—Ä–±–∞–º–∏. –ù–∞–ø—Ä–∏–∫–ª–∞–¥, –Ω–∞ —Å–Ω—ñ–¥–∞–Ω–æ–∫ –ø–æ–¥–∞—é—Ç—å —Ç–µ–ª—è—á–∏–π —è–∑–∏–∫, –∞ –≤–≤–µ—á–µ—Ä—ñ –≥–æ—Ç—É—é—Ç—å –∫—Ä—É–¥–æ –∑ —Ñ–µ–Ω—Ö–µ–ª–µ–º.
"""]
opt_len = 250

# clean input message
text = clean_message(text_input)

print(text)

sequence = tokenizer.texts_to_sequences([text])
print(sequence)
embedding_text = pad_sequences(sequence,padding='post',maxlen=opt_len)
print(text_input)
print('topic is: {}'.format(class_name[np.argmax(model.predict([embedding_text]))]))

–æ–≤ –∫–∞—Ñ afe mariia n n –∞–≤–ª—ñ–≤—Å—å–∫ —Ö –∏—Ä n —É—Ç —à—É–º–Ω–æ –ª—é–¥–Ω–æ –∂–∏–≤ –∞–∫ –º–∞ –º—ñ—Å—å–∫ –∫–∞—Ñ n —ñ–¥—á—É—Ç—Ç –Ω–∞–π–ø—Ä–µ–∫—Ä–∞—Å–Ω—ñ—à –∏—î–≤ –∑ –º—ñ—Å –≤–æ—Ä—á –ª–µ–≥–∫ —Ç–∞–ª–∞–Ω–æ–≤–∏—Ç –ª–µ—Ç—é—á —â–∞—Å–ª–∏–≤ –ø–æ–≥–ª—è–¥ –º—Ä—ñ –≤–µ–ª–∏–∫ —Ä–æ–∑–º–æ–≤ –∞–ø—Ä–æ—á—É–¥ –∞—Ä—ñ –≤–¥–∞ –∑—ñ–±—Ä–∞ –∫ —é–Ω—ñ—Ç –º—Ä—ñ –∑–∞–∫–ª–∞–¥ n —Ä–∏–º—ñ—â–µ–Ω–Ω –º–µ–±–ª –ø–µ—Ä–µ–π —Å–ø–∞–¥ –∫–∞—Ñ  –ø—Ä–æ–∂ –º–µ–Ω—à –Ω–æ–≤ —è–≤ –≤–µ–ª–∏–∫ –ø–µ–∫–∞—Ä—Å—å–∫ –≤—ñ—Ç—Ä–∏–Ω –∑–º—ñ–Ω —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω –ø–æ—Å–∞–¥ n –µ–Ω –∞—Ä—ñ –∫–æ–º—Ñ–æ—Ä—Ç—ñ–∫ —É—Ç –ª–µ–≥ –∑–æ—Ä—ñ—î–Ω—Ç—É –ø–æ–∑–∏—Ü—ñ –∑–Ω–∞–π–æ–º –∫–ª–∞—Å–Ω–æ —Å—Ç–≤–æ—Ä–µ–Ω–Ω —Å—Ç—Ä–∞ –¥–æ–ø–æ–º–∞–≥–∞ —à–µ—Ñ –∑–≤–∏—á–Ω —Ä–µ—Ü–µ–ø—Ç –∑–∞–≥—Ä–∞ –Ω–æ–≤ —Ñ–∞—Ä–± –∞–ø—Ä–∏–∫–ª–∞–¥ —Å–Ω—ñ–¥–∞–Ω –ø–æ–¥–∞ —Ç–µ–ª—è—á —è–∑–∏–∫ –≤–≤–µ—á–µ—Ä –≥–æ—Ç—É –∫—Ä—É–¥ —Ñ–µ–Ω n
[[6688, 1705, 1, 56509, 9278, 9278, 1, 448, 1, 9278, 7992, 1, 1, 804, 12721, 20, 803, 1705, 9278, 1, 37019, 1, 14, 49, 1, 1163, 2765, 10516, 3337, 1109, 974, 46, 1128, 1, 19619,