In [1]:
import pandas as pd

import json
import string
import spacy

from tensorflow import keras
from keras_preprocessing.text import tokenizer_from_json

import data_analysis

### Load input data

In [13]:
# Read the input records from data_short.json; the rendered output below shows
# 'title' and 'text' columns, which the preprocessing functions rely on.
df_input = pd.read_json('data_short.json')
# Bare last expression so the notebook renders the frame.
df_input

Unnamed: 0,title,text
50,The Darkling Thrush,I leant upon a coppice gate \n When Frost w...
22,For a Daughter Who Leaves,"""More than gems in my comb box shaped by the\n..."
20,Fons,"Reanimated, spirit restored, \nreincorporated,..."
209,"Howl, Parts I & II",For Carl Solomon\n I\n I saw the best minds ...
23,The Black Bass,"My hand became my father's hand \nthat day, \n..."


### Preprocessing input data

In [4]:
def full_form(word):
    """Expand a contraction fragment into the full word it stands for.

    Fragments such as ``'nt'``, ``'re'`` or ``'ll'`` are replaced by ``'not'``,
    ``'are'`` and ``'will'`` respectively; any other value is returned unchanged.
    """
    expansions = (
        ('nt', 'not'),
        ('re', 'are'),
        ('s', 'is'),
        ('d', 'would'),
        ('ll', 'will'),
        ('t', 'not'),
        ('ve', 'have'),
        ('m', 'am'),
    )
    for fragment, expanded in expansions:
        if word == fragment:
            # No expanded form is itself a fragment, so the first hit is final.
            return expanded
    return word

def to_lower_case(df_input):
    """Lower-case the ``text`` and ``title`` columns of ``df_input``.

    The columns are overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    for column in ('text', 'title'):
        df_input[column] = df_input[column].str.lower()

    return df_input
    
def rem_punctuation(df_input):
    """Delete every ``string.punctuation`` character from ``text`` and ``title``.

    Both columns must hold strings. The columns are overwritten on the frame
    that was passed in, and that same frame is returned.
    """
    # Mapping a character to None makes str.translate drop it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))

    for column in ('text', 'title'):
        df_input[column] = [value.translate(strip_table) for value in df_input[column]]

    return df_input

def lemmatize(df_input):
    """Replace ``text`` and ``title`` with lists of spaCy lemmas.

    Each cell of the two columns is run through the ``en_core_web_sm`` pipeline
    and replaced by the list of ``lemma_`` strings of its tokens. The columns
    are overwritten on the frame that was passed in, which is also returned.
    """
    nlp = spacy.load("en_core_web_sm")

    def lemmas_of(document):
        # One pipeline call per cell, keeping the tokens in document order.
        return [token.lemma_ for token in nlp(document)]

    for column in ('text', 'title'):
        df_input[column] = [lemmas_of(document) for document in df_input[column]]

    return df_input
    

def tokenize(df_input, tokenizer_path='tokenizer.json'):
    """Convert the ``text`` and ``title`` columns to integer sequences.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose ``text`` and ``title`` columns are passed to the
        tokenizer's ``texts_to_sequences``.
    tokenizer_path : str, optional
        Path of the saved tokenizer. Defaults to ``'tokenizer.json'``, the
        location that was previously hard-coded. The parsed content of the
        file is handed to ``tokenizer_from_json`` as-is, so the file is
        expected to hold the tokenizer config wrapped as a JSON string.

    Returns
    -------
    pandas.DataFrame
        The same frame, with both columns overwritten by the sequences.
    """
    # Load the tokenizer; read explicitly as UTF-8 so the result does not depend
    # on the platform's default encoding.
    with open(tokenizer_path, encoding='utf-8') as f:
        tok = tokenizer_from_json(json.load(f))

    for column in ('text', 'title'):
        df_input[column] = tok.texts_to_sequences(df_input[column])

    return df_input
    

def preprocess_input(df_input, max_len=150):
    """Turn raw ``title``/``text`` records into a padded array of token ids.

    Steps, in order: cast both columns to ``str``, lower-case, strip
    punctuation, drop spaCy stop words from ``text``, lemmatize both columns,
    expand contraction fragments in ``text``, tokenize, then concatenate
    title + text per row and pad the result to ``max_len``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame with ``title`` and ``text`` columns.
    max_len : int, optional
        Length every sequence is padded (at the end) or cut to. Defaults to
        150, the value that was previously hard-coded.

    Returns
    -------
    The array produced by ``pad_sequences``, one row per input record.
    """
    # Kept local so this function carries its own stop-word dependency.
    from spacy.lang.en.stop_words import STOP_WORDS

    # Cast BOTH columns: a non-string title would otherwise turn into NaN in
    # .str.lower() and then crash in rem_punctuation. astype returns a new
    # frame, so the in-place helpers below do not touch the caller's frame.
    df_input = df_input.astype({'text': 'str', 'title': 'str'})

    df_input = to_lower_case(df_input)
    df_input = rem_punctuation(df_input)

    # Remove stop words (text only; titles keep every word).
    df_input['text'] = df_input['text'].apply(
        lambda text: " ".join(word for word in text.split() if word not in STOP_WORDS)
    )

    df_input = lemmatize(df_input)

    # Lemmatization leaves lists of tokens; expand leftover contraction pieces.
    df_input['text'] = [
        [full_form(w) for w in lemmas]
        for lemmas in df_input['text']
    ]

    df_input = tokenize(df_input)

    # Element-wise list concatenation: title ids first, then text ids.
    joined_text = df_input['title'] + df_input['text']
    # Truncation of over-long rows uses the pad_sequences default.
    X = keras.preprocessing.sequence.pad_sequences(list(joined_text), maxlen=max_len, padding='post')

    return X

In [5]:
# Run the full preprocessing pipeline; the result is the padded array returned
# by preprocess_input, which is what predict() feeds to the models.
df_input_prp = preprocess_input(df_input)

### Make predictions

In [11]:
def load_models():
    """Load one saved Keras model per theme.

    Returns a dict keyed ``'model_<theme>'`` whose values are the models read
    from ``baby_models/model_<theme>.h5``, in the order the themes are listed.
    """
    themes_to_predict = ['nature', 'family', 'love', 'body', 'animals']

    return {
        'model_' + theme: keras.models.load_model('baby_models/model_' + theme + '.h5')
        for theme in themes_to_predict
    }


def predict(df_input_prp, bin_models=None):
    """Run every theme model on the preprocessed input.

    Parameters
    ----------
    df_input_prp :
        Preprocessed input, passed unchanged to each model's ``predict``.
    bin_models : dict, optional
        Mapping of model name to an already loaded model. When omitted, the
        models are loaded with ``load_models()``. Passing them in avoids
        re-reading every model from disk on each call.

    Returns
    -------
    dict
        ``{model_name: predictions}`` with the same keys and order as the
        models mapping.
    """
    if bin_models is None:
        bin_models = load_models()

    # Iterate over what was actually loaded instead of repeating the theme
    # list here, so the two can never drift apart.
    return {
        model_name: model.predict(df_input_prp)
        for model_name, model in bin_models.items()
    }

In [12]:
# Bare expression so the notebook renders the dict of per-model predictions
# (one array per 'model_<theme>' key, as shown in the output below).
predict(df_input_prp)

{'model_nature': array([[0.13374472],
        [0.05692529],
        [0.2049426 ],
        [0.02594746],
        [0.09037229]], dtype=float32),
 'model_family': array([[0.11155637],
        [0.13056737],
        [0.11589477],
        [0.07769781],
        [0.16128244]], dtype=float32),
 'model_love': array([[0.06430162],
        [0.05631455],
        [0.12095229],
        [0.16223776],
        [0.04192364]], dtype=float32),
 'model_body': array([[0.03635923],
        [0.10865979],
        [0.07035328],
        [0.07772031],
        [0.32045162]], dtype=float32),
 'model_animals': array([[0.07946671],
        [0.09269187],
        [0.15948144],
        [0.0443071 ],
        [0.0771794 ]], dtype=float32)}