In [2]:
%matplotlib inline

import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'tensorflow'

In [2]:
# --- Text preprocessing -------------------------------------------------
MAX_DOCUMENT_LENGTH = 100   # maxlen passed to pad_sequences
VOCAB_SIZE = 20000          # num_words passed to the Tokenizer
EMBEDDING_SIZE = 50         # width of each word vector

# --- Convolutional model ------------------------------------------------
N_FILTERS = 10              # output channels of each convolution
N_CLASSES = 8               # depth of the one-hot target / size of the logits
WINDOW_SIZE = 20            # number of consecutive positions each filter spans
FILTER_SHAPE1 = [WINDOW_SIZE, EMBEDDING_SIZE]   # first conv: window x embedding
FILTER_SHAPE2 = [WINDOW_SIZE, N_FILTERS]        # second conv: window x filters
POOLING_WINDOW = 4          # max-pool kernel extent along axis 1
POOLING_STRIDE = 2          # max-pool stride along axis 1

# --- Optimisation -------------------------------------------------------
LEARNING_RATE = 0.05        # learning rate handed to optimize_loss
STEPS = 200                 # training steps passed to Estimator.fit

In [3]:
# Read the header-less CSV: column 0 is used as the label, column 1 is the
# text that gets tokenised below.
df = pd.read_csv('../data/CS503/labeled_news.csv', header=None)
X = df[1]
y = df[0] - 1   # shift every label down by one
print(y.unique())

# Hold out a third of the rows for testing; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# The vocabulary is fitted on the training split only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)

# Convert each text to a list of word ids, then pad/truncate to a fixed length.
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_train), maxlen=MAX_DOCUMENT_LENGTH)
x_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(x_test), maxlen=MAX_DOCUMENT_LENGTH)

[0 1 3 6 5 2 7 4]


In [4]:
def cnn2_model(features, target):
    """Model function: two stacked convolutions over `embed_sequence` embeddings.

    Builds embedding -> conv -> max-pool -> conv -> max over axis 1 ->
    linear layer, together with a softmax cross-entropy loss and an Adam
    training op.

    Args:
        features: tensor of word ids to embed (presumably the padded
            sequences of shape (batch, MAX_DOCUMENT_LENGTH) -- confirm
            against the caller).
        target: integer class ids; one-hot encoded here with depth N_CLASSES.

    Returns:
        A tuple ``(predictions, loss, train_op)``; ``predictions`` is a dict
        holding the arg-max class under ``'class'`` and the softmax output
        under ``'prob'``.
    """
    layers = tf.contrib.layers
    onehot_target = tf.one_hot(target, N_CLASSES, 1, 0)

    embedded = layers.embed_sequence(
        features, vocab_size=VOCAB_SIZE, embed_dim=EMBEDDING_SIZE, scope='words')
    # Append a trailing axis so the 2-D convolution can be applied.
    embedded = tf.expand_dims(embedded, 3)

    first_conv = layers.convolution2d(
        embedded, N_FILTERS, FILTER_SHAPE1, padding='VALID')
    first_pool = tf.nn.max_pool(
        first_conv,
        ksize=[1, POOLING_WINDOW, 1, 1],
        strides=[1, POOLING_STRIDE, 1, 1],
        padding='SAME')
    # Swap the last two axes so the filter outputs sit on the axis that
    # FILTER_SHAPE2's second dimension (N_FILTERS) spans.
    first_pool = tf.transpose(first_pool, [0, 1, 3, 2])

    second_conv = layers.convolution2d(
        first_pool, N_FILTERS, FILTER_SHAPE2, padding='VALID')
    # Max over axis 1, then drop axis 1 of the reduced tensor.
    pooled = tf.squeeze(tf.reduce_max(second_conv, 1), squeeze_dims=[1])

    logits = layers.fully_connected(pooled, N_CLASSES, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_target)

    train_op = layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=LEARNING_RATE)

    predictions = {
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits),
    }
    return (predictions, loss, train_op)

In [5]:
# Wrap the CNN model function in an Estimator and train it for STEPS steps.
classifier = tf.contrib.learn.Estimator(model_fn=cnn2_model)
classifier.fit(x_train, y_train, steps=STEPS)

# Pull the predicted class out of every per-example prediction dict.
y_predicted = [
    prediction['class']
    for prediction in classifier.predict(x_test, as_iterable=True)
]
print(y_predicted)

# Fraction of test examples whose predicted label equals the true label.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faffcf03eb8>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpe1ogt2vz'}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for 

In [6]:
keras = tf.keras

# Sequential model: embedding -> 1-D convolution -> max-pooling -> LSTM ->
# softmax classifier.
model = keras.Sequential()
model.add(keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_SIZE, input_length=MAX_DOCUMENT_LENGTH))
# NOTE(review): the convolution kernel size is EMBEDDING_SIZE, i.e. each
# filter spans 50 consecutive positions. WINDOW_SIZE may have been intended;
# left unchanged so results stay comparable -- confirm.
model.add(keras.layers.Conv1D(64, EMBEDDING_SIZE, activation='relu'))
model.add(keras.layers.MaxPooling1D(pool_size=4))
model.add(keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per class: use the shared N_CLASSES constant rather than a
# hard-coded 8 so this model cannot drift from the rest of the notebook.
model.add(keras.layers.Dense(N_CLASSES, activation='softmax'))
# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

model.fit(x_train, y_train, epochs=3)

# evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
accr = model.evaluate(x_test, y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 51, 64)            160064    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 808       
Total params: 1,226,872
Trainable params: 1,226,872
Non

In [7]:
# Directory containing the GloVe embedding text files (relative path).
data_directory = '../data/glove.6B'
# File-name template; the placeholder is filled with the embedding dimension
# (it is formatted with EMBEDDING_SIZE where the file is loaded).
file_name = 'glove.6B.{}d.txt'

def create_embed_dict(embed_file):
    """Parse a whitespace-separated embedding text file into a dict.

    Every non-blank line is expected to look like ``<word> <v1> <v2> ...``.
    Blank lines are skipped.

    Args:
        embed_file: path of the embedding text file to read.

    Returns:
        dict mapping each word (str) to its vector as a list of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embed_dict = {}
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default encoding.
    with open(embed_file, encoding='utf-8') as file:
        # Stream the file line by line instead of materialising it with
        # readlines(); embedding files can be very large.
        for line in file:
            row = line.split()
            if not row:
                # A blank (or whitespace-only) line carries no entry.
                continue
            embed_dict[row[0]] = [float(value) for value in row[1:]]
    return embed_dict

def create_embed_matrix(embed_dict):
    """Build a (VOCAB_SIZE, EMBEDDING_SIZE) matrix of pre-trained vectors.

    Row ``i`` receives the vector of the word whose id in
    ``tokenizer.word_index`` is ``i``. Words without an entry in
    ``embed_dict`` keep an all-zero row, and one summary line reports how
    many there were. Row 0 is set to the vector stored under ``'unk'``.

    Args:
        embed_dict: mapping from word to a vector of length EMBEDDING_SIZE.

    Returns:
        numpy array of shape (VOCAB_SIZE, EMBEDDING_SIZE).

    Raises:
        KeyError: if ``embed_dict`` has no ``'unk'`` entry.
    """
    embed_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
    missing = 0
    for word, index in tokenizer.word_index.items():
        # word_index lists every word seen while fitting, but the matrix has
        # only VOCAB_SIZE rows; ids past the end would raise IndexError.
        if index >= VOCAB_SIZE:
            continue
        embed_vect = embed_dict.get(word)
        if embed_vect is None:
            missing += 1
        else:
            embed_matrix[index] = embed_vect

    if missing:
        # One summary line instead of one line per word keeps output readable.
        print('Not found:', missing, 'words have no pre-trained vector (rows left as zeros)')

    embed_matrix[0] = embed_dict['unk']
    return embed_matrix

In [14]:
def glove50d_cnn_model(features, target):
    """Model function: two stacked convolutions over fixed GloVe embeddings.

    Same convolutional stack as the other CNN model function in this
    notebook, except that word vectors come from a lookup into a matrix
    built from the GloVe file rather than from `embed_sequence`.

    Note that the embedding file is read and the matrix rebuilt every time
    this function is invoked.

    Args:
        features: tensor of word ids used as lookup indices (presumably the
            padded sequences of shape (batch, MAX_DOCUMENT_LENGTH) --
            confirm against the caller).
        target: integer class ids; one-hot encoded here with depth N_CLASSES.

    Returns:
        A tuple ``(predictions, loss, train_op)``; ``predictions`` is a dict
        holding the arg-max class under ``'class'`` and the softmax output
        under ``'prob'``.
    """
    layers = tf.contrib.layers
    onehot_target = tf.one_hot(target, N_CLASSES, 1, 0)

    # Load the pre-trained vectors and arrange them by tokenizer word id.
    glove_path = data_directory + '/' + file_name.format(EMBEDDING_SIZE)
    glove_matrix = create_embed_matrix(create_embed_dict(glove_path))
    glove_matrix = tf.cast(glove_matrix, tf.float32)

    embedded = tf.nn.embedding_lookup(glove_matrix, features)
    # Append a trailing axis so the 2-D convolution can be applied.
    embedded = tf.expand_dims(embedded, 3)

    first_conv = layers.convolution2d(
        embedded, N_FILTERS, FILTER_SHAPE1, padding='VALID')
    first_pool = tf.nn.max_pool(
        first_conv,
        ksize=[1, POOLING_WINDOW, 1, 1],
        strides=[1, POOLING_STRIDE, 1, 1],
        padding='SAME')
    # Swap the last two axes so the filter outputs sit on the axis that
    # FILTER_SHAPE2's second dimension (N_FILTERS) spans.
    first_pool = tf.transpose(first_pool, [0, 1, 3, 2])

    second_conv = layers.convolution2d(
        first_pool, N_FILTERS, FILTER_SHAPE2, padding='VALID')
    # Max over axis 1, then drop axis 1 of the reduced tensor.
    pooled = tf.squeeze(tf.reduce_max(second_conv, 1), squeeze_dims=[1])

    logits = layers.fully_connected(pooled, N_CLASSES, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_target)

    train_op = layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=LEARNING_RATE)

    predictions = {
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits),
    }
    return (predictions, loss, train_op)

In [15]:
# Wrap the GloVe-based model function in an Estimator and train it for STEPS steps.
classifier = tf.contrib.learn.Estimator(model_fn=glove50d_cnn_model)
classifier.fit(x_train, y_train, steps=STEPS)

# Pull the predicted class out of every per-example prediction dict.
y_predicted = [
    prediction['class']
    for prediction in classifier.predict(x_test, as_iterable=True)
]
print(y_predicted)

# Fraction of test examples whose predicted label equals the true label.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faf903bfa90>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/tmpr2wp3425'}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpr2wp3425/model.ckpt.
INFO:tensorflow:loss = 2.04163, step = 1
INFO:tensorflow:global_step/sec: 4.73515
INFO:tensorflow:loss = 1.56044, step = 101 (21.119 sec)
INFO:tensorflow:Saving checkpoints for 200 into /tmp/tmpr2wp3425