In [2]:
import string
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import os
import re
from datetime import date
from fastnumbers import isfloat, isint
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Embedding
from keras.layers import BatchNormalization, SpatialDropout1D, Conv1D, Dense, Dropout, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from keras.utils import to_categorical


# Lowercase Russian alphabet (33 letters, including 'ё'), one character per item.
rus_alphabet = list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

def create_vocab_set():
    """Build the character-level vocabulary.

    The alphabet is, in order: the Russian letters, ASCII lowercase letters,
    digits, ASCII punctuation, then the space and newline characters.

    Returns:
        tuple: ``(vocab, vocab_size)`` where ``vocab`` maps each character to
        a 1-based integer id (0 is never assigned here) and ``vocab_size`` is
        the number of characters in the alphabet.
    """
    symbols = [
        *rus_alphabet,
        *string.ascii_lowercase,
        *string.digits,
        *string.punctuation,
        ' ',
        '\n',
    ]
    # Ids start at 1 so that 0 stays free for "unknown"/padding.
    char_to_id = {symbol: position for position, symbol in enumerate(symbols, start=1)}
    return char_to_id, len(symbols)

def text2sequence(text, vocab):
    """Encode each text as a list of character ids.

    Args:
        text: iterable of strings (one item per document).
        vocab: mapping from a single character to its integer id.

    Returns:
        list[list[int]]: one list of ids per input item, in input order.
        Characters that are missing from ``vocab`` (or mapped to 0) are
        dropped, so an encoded sequence may be shorter than its source text.
    """
    return [
        [code for code in (vocab.get(symbol, 0) for symbol in document) if code != 0]
        for document in text
    ]

# Directory that holds the training CSV.
dir_train = '../data'

# Channel name -> integer class label; the label is the position in this list.
mappings = {
    channel: label
    for label, channel in enumerate([
        'career',
        'theory_and_practice',
        'deep_learning',
        'lang_python',
        '_meetings',
        'kaggle_crackers',
        'big_data',
        'lang_r',
        'nlp',
        'welcome',
        'datasets',
        'bayesian',
    ])
}


# Run parameters.
# NOTE(review): VALIDATION_SPLIT and RANDOM_SEED are not referenced in the
# visible cells — confirm they are still needed.
VALIDATION_SPLIT = 0.1
RANDOM_SEED = 42

# Length every encoded message is padded/truncated to (the `maxlen` passed to
# pad_sequences).
MAX_SEQUENCE_LENGTH = 150

# --- Load -------------------------------------------------------------------
# Read column positions 1..10 of the CSV and parse both timestamp columns
# into datetimes.
data = pd.read_csv(
    os.path.join(dir_train, 'train_set.csv'),
    usecols=range(1, 11),
    parse_dates=['timestamp', 'thread_timestamp'],
)
# Keep rows whose channel has a label in `mappings` and whose `main_msg`
# flag is set.
data = data[data.channel.isin(list(mappings)) & data.main_msg]

# --- Time-based train/validation split ---------------------------------------
date_before = date(2017, 4, 1)
# Compare against a Timestamp: ordering a datetime64 column against a plain
# datetime.date raises TypeError on current pandas releases.
split_ts = pd.Timestamp(date_before)
train = data[data['timestamp'] <= split_ts]
val = data[data['timestamp'] > split_ts]

# --- Shared helpers (train and validation go through the same steps) ---------
# Texts shorter than this many characters are discarded.
MIN_TEXT_LENGTH = 20

# Tokens replaced by a single space before encoding: <...> tokens with an
# optional trailing colon, :...: tokens, the HTML entity "&gt;", and any
# word/dot run containing "@".
# NOTE(review): these look like Slack mentions/links, emoji codes and e-mail
# addresses — confirm against the raw data.
_NOISE_RE = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
_WHITESPACE_RE = re.compile(r'\s+')


def _is_unusable_text(value):
    """Return True when `value` parses as a number or is shorter than MIN_TEXT_LENGTH."""
    # The numeric checks run first, so non-string cells that fastnumbers
    # accepts as numbers never reach len().
    return isfloat(value) or isint(value) or len(value) < MIN_TEXT_LENGTH


def _prepare_split(frame):
    """Return `frame` reduced to (channel, text), label-encoded, sorted by label and filtered."""
    prepared = frame[['channel', 'text']].reset_index(drop=True)
    prepared['channel'] = prepared.channel.map(mappings)
    prepared = prepared.sort_values('channel').reset_index(drop=True)
    return prepared[~prepared.text.apply(_is_unusable_text)]


def _clean_text(texts):
    """Lowercase every text, blank out noise tokens and collapse whitespace runs."""
    return texts.astype(str).apply(
        lambda raw: _WHITESPACE_RE.sub(' ', _NOISE_RE.sub(' ', raw.lower()))
    )


# --- Build model inputs -------------------------------------------------------
train_data = _prepare_split(train)
val_data = _prepare_split(val)

train_text = _clean_text(train_data['text'])
val_text = _clean_text(val_data['text'])

vocab, vocab_size = create_vocab_set()

# Character ids, left-padded/truncated to a fixed length with 0.
X_train = pad_sequences(text2sequence(train_text, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
X_val = pad_sequences(text2sequence(val_text, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)

# One-hot labels; the number of classes follows the label mapping instead of
# a hard-coded literal.
num_classes = len(mappings)
train_labels = to_categorical(np.asarray(train_data['channel'], dtype='int8'), num_classes=num_classes)
val_labels = to_categorical(np.asarray(val_data['channel'], dtype='int8'), num_classes=num_classes)

# Restore the saved model from its HDF5 file.
model = load_model("../models/model_simple ohe lstm.hdf5")

# Attach loss, optimizer and metric to the restored model, then print its
# layer table.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 150)          15600     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 150)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 148, 150)          67650     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 148, 150)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 148, 150)          600       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 146, 150)          67650     
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 146, 150)          0         
__________

TypeError: only length-1 arrays can be converted to Python scalars

In [3]:
# Run the loaded model over the padded validation sequences.
predictions = model.predict(X_val)
# Backward-compatible alias: the next cell still reads the original,
# misspelled name.
preidictions = predictions



In [4]:
# Display the array returned by `model.predict(X_val)` in the previous cell.
preidictions

array([[ 0.10807279,  0.09659974,  0.15809806, ...,  0.0083871 ,
         0.05055336,  0.03365131],
       [ 0.35731563,  0.13107063,  0.07431436, ...,  0.04046116,
         0.02070983,  0.00851377],
       [ 0.80647564,  0.06439281,  0.02502912, ...,  0.02641875,
         0.00402849,  0.00186706],
       ..., 
       [ 0.47686476,  0.09569091,  0.13462757, ...,  0.03887894,
         0.01436156,  0.00613013],
       [ 0.01047858,  0.5346843 ,  0.09082428, ...,  0.00151439,
         0.00382038,  0.01028831],
       [ 0.10807279,  0.09659974,  0.15809806, ...,  0.0083871 ,
         0.05055336,  0.03365131]], dtype=float32)