In [1]:
from keras.layers import Input, Dense, Embedding, merge, Convolution2D, MaxPooling2D, Dropout, concatenate
from sklearn.cross_validation import train_test_split
from keras.layers.core import Reshape, Flatten
from keras.callbacks import ModelCheckpoint
from data_helpers import load_data
from keras.models import model_from_json
from keras.optimizers import Adam
from keras.models import Model
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.utils import np_utils, to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import helpers

Using TensorFlow backend.


In [2]:
from keras.callbacks import TensorBoard  
tensorboard=TensorBoard(log_dir='./logs', write_graph=True)

In [3]:
from pygoose import *

In [4]:
project = kg.Project.discover()

In [5]:
embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')

In [6]:
X_train_descriptions = kg.io.load(project.preprocessed_data_dir + 'sequences_fasttext_train.pickle')
X_test_descriptions = kg.io.load(project.preprocessed_data_dir + 'sequences_fasttext_test.pickle')

In [7]:
X_train_titles = kg.io.load(project.preprocessed_data_dir + 'sequences_fasttext_titles_train.pickle')
X_test_titles = kg.io.load(project.preprocessed_data_dir + 'sequences_fasttext_titles_test.pickle')

In [8]:
y_train = kg.io.load(project.features_dir + 'y_train.pickle')

In [9]:
y_test = kg.io.load(project.features_dir + 'y_test.pickle')

### Preprocess labels

In [11]:
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
encoded_y_test = encoder.transform(y_test)
y_train_encoded = np_utils.to_categorical(encoded_y_train)
y_test_encoded = np_utils.to_categorical(encoded_y_test)

In [12]:
y_train_encoded.shape

(500000, 14)

## Define models

In [13]:
RANDOM_SEED = 42

In [14]:
np.random.seed(RANDOM_SEED)

In [15]:
sequence_length_descr = X_train_descriptions.shape[-1]
sequence_length_titles = X_train_titles.shape[-1]
sequence_length = sequence_length_descr + sequence_length_titles
vocabulary_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[-1]

In [16]:
print(embedding_dim, vocabulary_size, sequence_length)

300 207481 45


In [17]:
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

nb_epoch = 5
batch_size = 30
output_1_dim = 14
output_2_dim = 166

### CNN lvl1

In [20]:
# this returns a tensor

inputs_descr_1 = Input(shape=(sequence_length_descr,), dtype='int32')
inputs_titles_1 = Input(shape=(sequence_length_titles,), dtype='int32')

embedding_descr = Embedding(
        vocabulary_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=sequence_length_descr,
        trainable=False,
    )

embedding_titles = Embedding(
        vocabulary_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=sequence_length_titles,
        trainable=False,
    )


def conv_part(embedding_1, sequence_length):
    reshape_1 = Reshape((sequence_length, embedding_dim, 1))(embedding_1)

    conv_1_0 = Convolution2D(num_filters, filter_sizes[0], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)
    conv_1_1 = Convolution2D(num_filters, filter_sizes[1], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)
    conv_1_2 = Convolution2D(num_filters, filter_sizes[2], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)

    maxpool_1_0 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_0)
    maxpool_1_1 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_1)
    maxpool_1_2 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_2)

    merged_tensor_1 = merge([maxpool_1_0, maxpool_1_1, maxpool_1_2], mode='concat', concat_axis=1)
    flatten_1 = Flatten()(merged_tensor_1)
    return flatten_1


embedding_descr_1 = embedding_descr(inputs_descr_1)
embedding_titles_1 = embedding_titles(inputs_titles_1)


flatten_descr_1 = conv_part(embedding_descr_1, sequence_length_descr)
flatten_titles_1 = conv_part(embedding_titles_1, sequence_length_titles)
flatten_1 = concatenate([flatten_descr_1, flatten_titles_1])

dropout_1 = Dropout(drop)(flatten_1)
output_1 = Dense(output_dim=output_1_dim , activation='softmax')(dropout_1)


# flatten_descr_2 = conv_part(embedding_descr_1, sequence_length_descr)
# flatten_titles_2 = conv_part(embedding_titles_1, sequence_length_titles)

# merged = concatenate([flatten_descr_2, flatten_titles_2, output_descr_1, output_titles_1])
# dropout_2 = Dropout(drop)(merged)
# output_2 = Dense(output_dim=output_2_dim , activation='softmax')(dropout_2)

# this creates a model that includes
model = Model(input=[inputs_descr_1, inputs_titles_1], output=output_1)

checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

  name=name)


In [113]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
inputs_descr_1 (InputLayer)      (None, 30)            0                                            
____________________________________________________________________________________________________
inputs_titles_1 (InputLayer)     (None, 15)            0                                            
____________________________________________________________________________________________________
embedding_descr (Embedding)      (None, 30, 300)       62244300    inputs_descr_1[0][0]             
____________________________________________________________________________________________________
embedding_titles (Embedding)     (None, 15, 300)       62244300    inputs_titles_1[0][0]            
___________________________________________________________________________________________

In [114]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

plot_model(model, to_file='model.png', show_shapes=True)

In [77]:
model.fit([X_train_descriptions, X_train_titles], y_train_encoded, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1, callbacks=[tensorboard], validation_data=([X_test_descriptions, X_test_titles], y_test_encoded))  # starts training1

  """Entry point for launching an IPython kernel.


Train on 500000 samples, validate on 216675 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f32dba515f8>

In [115]:
helpers.serialize_model(model, './CNN_lvl1_model/')

Saved model to disk


In [56]:
json_file = open('./CNN_lvl1_model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("./CNN_lvl1_model/model.h5")
print("Loaded model from disk")

  return cls(**config)


Loaded model from disk


In [57]:
model = loaded_model

In [58]:
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [59]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

plot_model(model, to_file='model.png', show_shapes=True)

In [60]:
score = model.evaluate([X_test_descriptions, X_test_titles], y_test_encoded,
                       batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 2.8114139884
Test accuracy: 0.112638748288


In [118]:
lvl1_answers_train = model.predict([X_train_descriptions, X_train_titles])

In [218]:
lvl1_answers_test = model.predict([X_test_descriptions, X_test_titles])

In [32]:
lvl1_answers_train = helpers.get_file('./CNN_lvl1_model/answers_lvl1')

In [38]:
lvl1_answers_test = helpers.get_file('./CNN_lvl1_model/answers_test_lvl1')

In [219]:
np.save('./CNN_lvl1_model/answers_test_lvl1', lvl1_answers_test)

In [33]:
np.save('./CNN_lvl1_model/answers_train_lvl1', lvl1_answers_train)

In [47]:
lvl1_answers_test = np.load('./CNN_lvl1_model/answers_test_lvl1.npy')

In [48]:
len(lvl1_answers_test)

216675

### CNN lvl2

In [37]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('none')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('none')

In [38]:
y_train_lvl2 = list(df_train['lvl2'])
y_test_lvl2 = list(df_test['lvl2'])

In [39]:
encoder_lvl2 = LabelEncoder()
encoder_lvl2.fit(y_train_lvl2)
encoded_y_train_lvl2 = encoder_lvl2.transform(y_train_lvl2)
encoded_y_test_lvl2 = encoder_lvl2.transform(y_test_lvl2)
y_train_encoded_lvl2 = np_utils.to_categorical(encoded_y_train_lvl2)
y_test_encoded_lvl2 = np_utils.to_categorical(encoded_y_test_lvl2)

In [31]:
inputs_descr_1 = Input(shape=(sequence_length_descr,), dtype='int32')
inputs_titles_1 = Input(shape=(sequence_length_titles,), dtype='int32')

embedding_descr = Embedding(
        vocabulary_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=sequence_length_descr,
        trainable=False,
    )

embedding_titles = Embedding(
        vocabulary_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=sequence_length_titles,
        trainable=False,
    )

def conv_part(embedding_1, sequence_length):
    reshape_1 = Reshape((sequence_length, embedding_dim, 1))(embedding_1)

    conv_1_0 = Convolution2D(num_filters, filter_sizes[0], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)
    conv_1_1 = Convolution2D(num_filters, filter_sizes[1], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)
    conv_1_2 = Convolution2D(num_filters, filter_sizes[2], embedding_dim, border_mode='valid', init='normal', activation='relu', dim_ordering='tf')(reshape_1)

    maxpool_1_0 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_0)
    maxpool_1_1 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_1)
    maxpool_1_2 = MaxPooling2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), border_mode='valid', dim_ordering='tf')(conv_1_2)

    merged_tensor_1 = merge([maxpool_1_0, maxpool_1_1, maxpool_1_2], mode='concat', concat_axis=1)
    flatten_1 = Flatten()(merged_tensor_1)
    return flatten_1


embedding_descr_1 = embedding_descr(inputs_descr_1)
embedding_titles_1 = embedding_titles(inputs_titles_1)

flatten_descr_2 = conv_part(embedding_descr_1, sequence_length_descr)
flatten_titles_2 = conv_part(embedding_titles_1, sequence_length_titles)
flatten_2 = concatenate([flatten_descr_2, flatten_titles_2])

dropout_2 = Dropout(drop)(flatten_2)

output_2 = Dense(output_dim=output_2_dim , activation='softmax')(dropout_2)

model_2 = Model(input=[inputs_descr_1, inputs_titles_1], output=output_2)

checkpoint = ModelCheckpoint('CNN-text-classification-keras/logs/weights.{epoch:03d}-{val_acc:.4f}.hdf5', 
                             monitor='val_acc', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='auto')

adam = Adam(lr=1e-4, 
            beta_1=0.9, 
            beta_2=0.999, 
            epsilon=1e-08)

model_2.compile(optimizer=adam, 
                loss='categorical_crossentropy', 
                metrics=['top_k_categorical_accuracy'])

  name=name)


In [32]:
model_2.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_11 (InputLayer)            (None, 30)            0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 15)            0                                            
____________________________________________________________________________________________________
embedding_11 (Embedding)         (None, 30, 300)       62244300    input_11[0][0]                   
____________________________________________________________________________________________________
embedding_12 (Embedding)         (None, 15, 300)       62244300    input_12[0][0]                   
___________________________________________________________________________________________

In [33]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

plot_model(model_2, to_file='model.png', show_shapes=True)

In [97]:
# import h5py
# filename = './CNN_lvl2_model/model.h5'
# f = h5py.File(filename, 'r')

# # List all groups
# print("Keys: %s" % f.keys())
# a_group_key = list(f.keys())

# for index,key in enumerate(a_group_key[:10]):
#     print(index, key)
#     data = np.array(f[key].values())
    
# list(f['flatten_descr_1_conv_1_0'].values())

Keys: KeysView(<HDF5 file "model.h5" (mode r)>)


In [None]:
model_2.fit([X_train_descriptions, X_train_titles], y_train_encoded_lvl2, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1, callbacks=[tensorboard], validation_data=([X_test_descriptions, X_test_titles], y_test_encoded_lvl2))  # starts training1

  """Entry point for launching an IPython kernel.


Train on 500000 samples, validate on 216675 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [42]:
weights = model_2.get_weights()

In [44]:
helpers.save_file(weights, './CNN_lvl2_model/weights_final')

In [45]:
model_json = model_2.to_json()
with open("./CNN_lvl2_model/model_final.json", "w") as json_file:
    json_file.write(model_json)

### Save

In [189]:
helpers.serialize_model(model_2, './CNN_lvl2_model/')

Saved model to disk


In [77]:
score = loaded_model.evaluate([X_test_descriptions, X_test_titles, lvl1_answers_test], y_test_encoded_lvl2,
                       batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])


Test score: 5.50227687247
Test accuracy: 0.0356801676875


In [213]:
encoder.classes_

array([  1,   3,   4,   5,   6,   7,   8,   9,  27,  40,  47,  59, 110, 140])

In [212]:
encoder_lvl2.classes_

array([ 11,  12,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  29,  30,  31,  33,  34,  35,  36,  37,  38,  40,  42,
        43,  44,  45,  46,  51,  53,  55,  56,  57,  60,  61,  62,  64,
        65,  66,  67,  70,  71,  72,  73,  74,  75,  76,  78,  79,  80,
        81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,
        94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
       134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147,
       148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
       162, 165, 166, 167, 168, 169, 172, 249, 250, 251, 252, 253, 254,
       255, 256, 257, 258, 259, 265, 266, 267, 268, 269, 270, 272, 273,
       274, 275, 278, 279, 280, 281, 282, 283, 284, 285])

In [193]:
lvl2_answers_test = loaded_model.predict([X_test_descriptions, X_test_titles, lvl1_answers_test])

In [211]:
np.save('./CNN_lvl2_model/lvl2_answers_test', lvl1_answers_test)

In [208]:
def top_3_accuracy(y_test, predict_proba):
    predictions = helpers.get_prediction_with_precision(encoder_lvl2.classes_, predict_proba, 1, True)
    answer = [1 if y_test[i] in predictions[i] else 0 for i in range(len(predictions))]
    return answer

In [209]:
answer = top_3_accuracy(y_test_lvl2, lvl2_answers_test)

In [210]:
sum(answer)/len(answer)

0.861271489558094

### Evaluate model

In [53]:
score = model.evaluate(X_test, y_test_encoded,
                       batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])


Test score: 0.2452411565
Test accuracy: 0.935054794832


In [56]:
predicted = model.predict(X_test)

In [61]:
np.save('../data/features/CNN_lvl1_proba_test', predicted)

In [62]:
def get_prediction_with_precision(classes, predict_proba, cat_num=3, with_proba=False):
    predictions_proba = predict_proba
    return [
        [
            (
                classes[pos], proba[pos]
            )
            for pos, proba in sorted(
                enumerate(predictions),
                key=lambda arg: arg[1], reverse=True
            )[:cat_num]
        ]
        if with_proba is False
        else
        [
            (
                classes[pos]
            )
            for pos, proba in sorted(
                enumerate(predictions),
                key=lambda arg: arg[1], reverse=True
            )[:cat_num]
        ]
        for i, predictions in enumerate(predictions_proba)
    ]

In [65]:
classes = encoder.classes_

In [67]:
predicted = get_prediction_with_precision(classes, predicted, 1, True)

In [68]:
predicted = [item for sublist in predicted for item in sublist]

In [75]:
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          1       0.92      0.94      0.93     15503
          3       0.90      0.89      0.89     15483
          4       0.87      0.90      0.88     21531
          5       0.98      0.97      0.98     56622
          6       0.98      0.96      0.97     24020
          7       0.90      0.87      0.89       882
          8       0.75      0.75      0.75      2660
          9       0.98      0.98      0.98     16454
         27       0.91      0.93      0.92     12137
         40       0.76      0.27      0.40       239
         47       0.98      0.96      0.97     28935
         59       0.85      0.76      0.81      3649
        110       0.88      0.97      0.93     11497
        140       0.72      0.69      0.70      7063

avg / total       0.94      0.94      0.93    216675



In [78]:
M = confusion_matrix(y_test, predicted)

In [79]:
print(M)

[[14501   551   127   103    63     1    28    16    20     0     2    13
     32    46]
 [  670 13738   478    78    73     1   127     9    46     0    12     5
     44   202]
 [  143   447 19293   196   176    32    99    75   324     1    31    63
     76   575]
 [   83    79   303 55144    27     4    98    20   269     1    10   297
    104   183]
 [  195   110   319    37 23065     2    37    53    16     1     7    17
     42   119]
 [    5     1    49    13     2   766     0    12     9     0     0     3
      8    14]
 [   27   136   165    69    20     3  1998     5    41     3    12    20
     50   111]
 [    8     8    95    23    19     3     2 16148     1     0     7     1
     45    94]
 [   23    24   282   180    17     1    53    12 11333    14     9    33
     35   121]
 [    1     3    14     1     0     0    91     0    56    65     1     2
      0     5]
 [    4     8    48    14    10     4    13    37    19     0 27713     2
    763   300]
 [   34    26   191  

### serialize model to JSON

In [54]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


### load model

In [None]:
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))