In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.metrics import classification_report


# mnist = input_data.read_data_sets('./MNIST_data/', one_hot=False)

In [27]:
from scipy import sparse

def xavier_init(n_features, n_components, const=1):
    low = -const*np.sqrt(6.0/(n_features + n_components))
    high = -low
    return tf.random_uniform((n_features, n_components), minval=low, maxval=high)

def gen_batches(data, data_corrupted, batch_size, data_label=None, random=True):
    """ Divide input data into batches.

    :param data: input scipy sparse matrix / numpy array / pd DataFrame
    :param data_corrupted: input corrupted data, same type as data
    :param batch_size: size of each batch, (0,1] or integer >=1
    :param data_label: label of data, pd DataFrame / pd Series / numpy 1-d, or 2-d array

    :return: data divided into batches

    ..  note: data & data_corrupted must be of same type, but data_label can be any data type
    """
    assert batch_size > 0.
    assert data.shape[0] == data_corrupted.shape[0]
    assert type(data) == type(data_corrupted),(type(data), type(data_corrupted))
    if isinstance(data, pd.DataFrame): assert (data.index == data_corrupted.index).all()
    if data_label is not None: assert data_label.ndim == 1 or data_label.shape[1] == 1

    if batch_size < 1.: batch_size = max(round(data.shape[0] * batch_size), 1)
    batch_size = int(batch_size)

    index = list(range(0, data.shape[0]))
    if random: np.random.shuffle(index)

    for i in range(0, data.shape[0], batch_size):
        if isinstance(data, pd.DataFrame):
            batch_data = data.iloc[index[i:i + batch_size]]
            batch_data_corrupted = data_corrupted.iloc[index[i:i + batch_size]]

        else:
            batch_data = data[index[i:i + batch_size]]
            batch_data_corrupted = data_corrupted[index[i:i + batch_size]]

        if data_label is not None:
            if isinstance(data_label, pd.DataFrame) or isinstance(data_label, pd.Series):
                batch_label = data_label.iloc[index[i:i + batch_size]]
            else:
                batch_label = data_label[index[i:i + batch_size]]
            yield (batch_data, batch_data_corrupted, batch_label)

        else:
            yield (batch_data, batch_data_corrupted)
            
def salt_and_pepper_noise(X, v):
    """ Apply salt and pepper noise to data in X, in other words a fraction v of elements of X
    (chosen at random) is set to its maximum or minimum value according to a fair coin flip.
    If minimum or maximum are not given, the min (max) value in X is taken.

    :param X: array_like, Input data
    :param v: int, fraction of elements to distort

    :return: transformed data
    """
    X_noise = X.tolil(True) if not isinstance(X, np.ndarray) else X.copy()
    n_features = X.shape[1]

    mn = X.min()
    mx = X.max()

    for i, sample in enumerate(X):
        mask = np.random.randint(0, n_features, v)

        for m in mask:

            if np.random.random() < 0.5:
                X_noise[i,m] = mn
            else:
                X_noise[i,m] = mx

    return X_noise.tocsr() if not isinstance(X, np.ndarray) else X_noise

def get_sparse_ind_val_shape(sparse_m):
    """ get indices, values, shape of a sparse matrix for feeding tf sparse placeholder

    :param sparse_m: input sparse matrix

    :type any scipy sparse matrix, csr/csc/coo/lil

    :return: tuple of indices, values, shape
    """
    if not isinstance(sparse_m, sparse.csr_matrix):
        sparse_m = sparse.csr_matrix(sparse_m)
    sparse_m.sort_indices()

    sparse_m = sparse.coo_matrix(sparse_m)
    indices = np.column_stack((sparse_m.row, sparse_m.col))
    values = sparse_m.data
    shape = sparse_m.shape

    return (indices, values, shape)

In [34]:
class DenoisingAutoencoder(object):
    def __init__(self):
        self.compress_factor = 100
        self.xavier_init = 1
        self.alpha = 1
        self.learning_rate = 0.01
        self.momentum = 0.5
        self.loss_func = 'cross_entropy'
        self.num_epochs = 10
        self.corr_frac = 0.0
        self.batch_size = 10
        self.model_dir = 'dae'
        self.model_name = './mnist/'
        self.tf_saver = None

    def fit(self, train_set, validation_set=None, train_set_label=None, validation_set_label=None):
#         self.sparse_input = False if isinstance(train_set, np.ndarray) else True
        self.sparse_input = True
    
        n_features = train_set.shape[1]
        self.n_components = np.floor(n_features/self.compress_factor).astype(int)
        self._build_model(n_features)
        
        with tf.Session() as self.tf_session:
            self._initialize_tf_utilities_and_ops()
            self._train_model(train_set, validation_set, train_set_label=train_set_label, validation_set_label=validation_set_label)
            self.tf_saver.save(self.tf_session, self.model_name)

    def _initialize_tf_utilities_and_ops(self):
        init_op = tf.global_variables_initializer()
        self.tf_saver = tf.train.Saver()
        self.tf_session.run(init_op)
        
    def _train_model(self, train_set, validation_set, train_set_label, validation_set_label):
        corruption_ratio = np.round(self.corr_frac*train_set.shape[1]).astype(np.int)
        
        for i in range(self.num_epochs):
            self.train_cost_batch = [], [], []
            self.fraction_triplet_batch = []
            self.num_triplet_batch = []
            
            self._run_train_step(train_set, train_set_label, corruption_ratio, i+1)
            
    def _run_train_step(self, train_set, train_set_label, corruption_ratio, epoch):
        x_corrupted = self._corrupt_input(train_set, corruption_ratio)
        batches = [_ for _ in gen_batches(train_set, x_corrupted, self.batch_size, data_label=train_set_label)]
        
        i = 1
        for batch in batches:
            if train_set_label is not None:
                x_batch, x_corr_batch, x_batch_label = batch
            else:
                print('test')
                x_batch, x_corr_batch = batch

            if self.sparse_input:
                train_feed = {self.input_data: get_sparse_ind_val_shape(x_batch),
                              self.input_data_corr: get_sparse_ind_val_shape(x_corr_batch),
                              self.input_label: x_batch_label}
            else:
                train_feed = {self.input_data: x_batch,
                              self.input_data_corr: x_corr_batch,
                              self.input_label: x_batch_label}

            step, train_autoencoder_loss, train_triplet_loss, train_cost, fraction_triplet, num_triplet = self.tf_session.run(
                [self.train_step, self.autoencoder_loss, self.triplet_loss, self.cost, self.fraction_triplet, self.num_triplet],
                feed_dict=train_feed
            )

            self.train_cost_batch[0].append(train_cost)
            self.train_cost_batch[1].append(train_autoencoder_loss)
            self.train_cost_batch[2].append(train_triplet_loss)
            self.fraction_triplet_batch.append(fraction_triplet)
            self.num_triplet_batch.append(num_triplet)

            i += 1
            
    def _corrupt_input(self, data, v):
        x_corrupted = salt_and_pepper_noise(data, v)
#         x_corrupted = None
        return x_corrupted

    def _build_model(self, n_features):
        self.input_data, self.input_data_corr, self.input_label = self._create_placeholders()
        self.W_, self.bh_, self.bv_ = self._create_variables(n_features)
        
        self._create_encode_layer()
        self._create_decode_layer()
        self._create_cost_function_node()
        self._create_train_step_node()

    def _create_placeholders(self):
        _placeholder = tf.sparse.placeholder if self.sparse_input else tf.placeholder
        input_data = _placeholder('float', name='x-input')
        input_data_corr = _placeholder('float', name='x-input-corr')
        input_label = tf.placeholder('float', name='x-input-label')
        
        return input_data, input_data_corr, input_label
        
    def _create_variables(self, n_features):
        W_ = tf.Variable(xavier_init(n_features, self.n_components, self.xavier_init), name='enc-W')
        bh_ = tf.Variable(tf.zeros([self.n_components]), name='hidden-bias')
        bv_ = tf.Variable(tf.zeros([n_features]), name='visible-bias')
        
        return W_, bh_, bv_
    
    def _create_encode_layer(self):
        _matmul = tf.sparse.matmul if self.sparse_input else tf.matmul
        
        with tf.name_scope('encode'):
            _enc_act_func = tf.nn.tanh            
            self.encode = _enc_act_func(_matmul(self.input_data_corr, self.W_) + self.bh_) - _enc_act_func(self.bh_)
    
    def _create_decode_layer(self):
        with tf.name_scope('decode'):
            _dec_act_func = tf.nn.tanh
            self.decode = _dec_act_func(tf.matmul(self.encode, tf.transpose(self.W_)) + self.bv_)
    
    def _create_cost_function_node(self):
        with tf.name_scope('cost'):
            _triplet_loss = batch_hard_triplet_loss
            
            self.triplet_loss, data_weight, self.fraction_triplet, self.num_triplet = _triplet_loss(self.sparse_input, self.input_label, self.encode)
            self.autoencoder_loss = weighted_loss(self.sparse_input, self.input_data, self.decode, loss_func=self.loss_func, weight=data_weight)
            self.cost = self.autoencoder_loss + self.alpha*self.triplet_loss
     
    def _create_train_step_node(self):
        with tf.name_scope('train'):
            self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

    def transform(self, data, name='train'):
        with tf.Session() as self.tf_session:
            self.tf_saver.restore(self.tf_session, self.model_name)
            
            if isinstance(data, np.ndarray):
                encoded_data = self.encode.eval({self.input_data_corr: data})
            else:
                encoded_data = self.encode.eval({self.input_data_corr: get_sparse_ind_val_shape(data)})
                
#             weights = self.W_.eval()
            
            return encoded_data

In [21]:
"""Define functions to create the triplet loss with online triplet mining."""

import tensorflow as tf


def _get_anchor_positive_triplet_mask(labels):
    """Return a 2D mask where mask[a, p] is True iff a and p are distinct and have same label.

    Args:
        labels: tf.int32 `Tensor` with shape [batch_size]

    Returns:
        mask: tf.bool `Tensor` with shape [batch_size, batch_size]
    """
    # Check that i and j are distinct
    indices_equal = tf.cast(tf.eye(tf.shape(labels)[0]), tf.bool)
    indices_not_equal = tf.logical_not(indices_equal)

    # Check if labels[i] == labels[j]
    # Uses broadcasting where the 1st argument has shape (1, batch_size) and the 2nd (batch_size, 1)
    labels_equal = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))

    # Combine the two masks
    mask = tf.logical_and(indices_not_equal, labels_equal)

    return mask


def _get_anchor_negative_triplet_mask(labels):
    """Return a 2D mask where mask[a, n] is True iff a and n have distinct labels.

    Args:
        labels: tf.int32 `Tensor` with shape [batch_size]

    Returns:
        mask: tf.bool `Tensor` with shape [batch_size, batch_size]
    """
    # Check if labels[i] != labels[k]
    # Uses broadcasting where the 1st argument has shape (1, batch_size) and the 2nd (batch_size, 1)
    labels_equal = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))

    mask = tf.logical_not(labels_equal)

    return mask


def _get_triplet_mask(labels):
    """Return a 3D mask where mask[a, p, n] is True iff the triplet (a, p, n) is valid.

    A triplet (i, j, k) is valid if:
        - i, j, k are distinct
        - labels[i] == labels[j] and labels[i] != labels[k]

    Args:
        labels: tf.int32 `Tensor` with shape [batch_size]
    """
    # Check that i, j and k are distinct
    indices_equal = tf.cast(tf.eye(tf.shape(labels)[0]), tf.bool)
    indices_not_equal = tf.logical_not(indices_equal)
    i_not_equal_j = tf.expand_dims(indices_not_equal, 2)
    i_not_equal_k = tf.expand_dims(indices_not_equal, 1)
    j_not_equal_k = tf.expand_dims(indices_not_equal, 0)

    distinct_indices = tf.logical_and(tf.logical_and(i_not_equal_j, i_not_equal_k), j_not_equal_k)

    # Check if labels[i] == labels[j] and labels[i] != labels[k]
    label_equal = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))
    i_equal_j = tf.expand_dims(label_equal, 2)
    i_equal_k = tf.expand_dims(label_equal, 1)

    valid_labels = tf.logical_and(i_equal_j, tf.logical_not(i_equal_k))

    # Combine the two masks
    mask = tf.logical_and(distinct_indices, valid_labels)

    return mask


def batch_all_triplet_loss(sparse_input, input_label, encode, pos_triplets_only = False):
    """Build the triplet loss over a batch of embeddings.

    We generate all the valid triplets and average the loss over the positive ones.

    Args:
        input_label: labels of the batch, of size (batch_size,)
        encode: tensor of shape (batch_size, embed_dim)

    Returns:
        triplet_loss: scalar tensor containing the triplet loss
    """

    # Get the dot product
    dotproduct = tf.matmul(encode, tf.transpose(encode))

    # shape (batch_size, batch_size, 1)
    anchor_positive_dotproduct = tf.expand_dims(dotproduct, 2)
    assert anchor_positive_dotproduct.shape[2] == 1
    # shape (batch_size, 1, batch_size)
    anchor_negative_dotproduct = tf.expand_dims(dotproduct, 1)
    assert anchor_negative_dotproduct.shape[1] == 1

    # Compute a 3D tensor of size (batch_size, batch_size, batch_size)
    # triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k
    # Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1)
    # and the 2nd (batch_size, 1, batch_size)
    triplet_distance = - anchor_positive_dotproduct + anchor_negative_dotproduct

    # Put to zero the invalid triplets
    # (where label(a) != label(p) or label(n) == label(a) or a == p)
    valid_triplet_mask = tf.to_float(_get_triplet_mask(input_label))
    num_valid_triplets = tf.reduce_sum(valid_triplet_mask)

    # Count number of positive triplets (where triplet_distance > 0)
    pos_valid_triplet_mask = tf.to_float(tf.greater(tf.multiply(valid_triplet_mask, triplet_distance), 1e-16))
    num_pos_valid_triplets = tf.reduce_sum(pos_valid_triplet_mask)

    # Set final mask
    if pos_triplets_only:
        mask = pos_valid_triplet_mask
        num_triplet = num_pos_valid_triplets
    else:
        mask = valid_triplet_mask
        num_triplet = num_valid_triplets

    # Get final mean triplet loss over the (positive) valid triplets
    triplet_loss = - tf.log_sigmoid(-triplet_distance) * mask
    triplet_loss = tf.reduce_sum(triplet_loss) / (num_triplet + 1e-16)

    data_weight = tf.reduce_sum(mask, [1, 2]) + tf.reduce_sum(mask, [0, 1]) + tf.reduce_sum(mask, [0, 2])

    return triplet_loss, data_weight, num_pos_valid_triplets / (num_valid_triplets + 1e-16), num_pos_valid_triplets


def batch_all_triplet_loss_org(sparse_input, input_label, encode, input_data, decode, pos_triplets_only = False, autoencoder_loss_func='cross_entropy'):
    """Build the triplet loss over a batch of embeddings.

    We generate all the valid triplets and average the loss over the positive ones.

    Args:
        input_label: labels of the batch, of size (batch_size,)
        encode: tensor of shape (batch_size, embed_dim)

    Returns:
        triplet_loss: scalar tensor containing the triplet loss
    """

    # Get the dot product
    dotproduct = tf.matmul(encode, tf.transpose(encode))

    # shape (batch_size, batch_size, 1)
    anchor_positive_dotproduct = tf.expand_dims(dotproduct, 2)
    assert anchor_positive_dotproduct.shape[2] == 1
    # shape (batch_size, 1, batch_size)
    anchor_negative_dotproduct = tf.expand_dims(dotproduct, 1)
    assert anchor_negative_dotproduct.shape[1] == 1

    # Compute a 3D tensor of size (batch_size, batch_size, batch_size)
    # triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k
    # Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1)
    # and the 2nd (batch_size, 1, batch_size)
    triplet_distance = - anchor_positive_dotproduct + anchor_negative_dotproduct

    # Put to zero the invalid triplets
    # (where label(a) != label(p) or label(n) == label(a) or a == p)
    valid_triplet_mask = tf.to_float(_get_triplet_mask(input_label))
    num_valid_triplets = tf.reduce_sum(valid_triplet_mask)

    # Count number of positive triplets (where triplet_distance > 0)
    pos_valid_triplet_mask = tf.to_float(tf.greater(tf.multiply(valid_triplet_mask, triplet_distance), 1e-16))
    num_pos_valid_triplets = tf.reduce_sum(pos_valid_triplet_mask)

    # Set final mask
    if pos_triplets_only:
        mask = pos_valid_triplet_mask
        num_triplet = num_pos_valid_triplets
    else:
        mask = valid_triplet_mask
        num_triplet = num_valid_triplets

    # Get final mean triplet loss over the (positive) valid triplets
    triplet_loss = - tf.log_sigmoid(-triplet_distance) * mask
    triplet_loss = tf.reduce_sum(triplet_loss) / (num_triplet + 1e-16)

    data_weight = tf.reduce_sum(mask, [1, 2]) + tf.reduce_sum(mask, [0, 1]) + tf.reduce_sum(mask, [0, 2])

    # Autoencoder element wise cross entropy loss / mean squared loss
    _reduce_sum = tf.sparse.reduce_sum if sparse_input else tf.reduce_sum
    _to_dense = tf.sparse.to_dense if sparse_input else lambda x: x

    if autoencoder_loss_func == 'cross_entropy':
        autoencoder_loss = - tf.reduce_sum(_to_dense(input_data) * tf.log(decode+1e-16) + (1.-_to_dense(input_data)) * tf.log(1.-decode+1e-16), 1)
    elif autoencoder_loss_func == 'mean_squared':
        autoencoder_loss = tf.reduce_sum(tf.squared_difference(_to_dense(input_data),decode), 1)
    elif autoencoder_loss_func == 'cosine_proximity':
        autoencoder_loss = - tf.reduce_sum(tf.nn.l2_normalize(_to_dense(input_data),1) * tf.nn.l2_normalize(decode,1), 1)
    autoencoder_loss = tf.reduce_sum(autoencoder_loss * data_weight) / (tf.reduce_sum(data_weight) + 1e-16)
    # autoencoder_loss = tf.reduce_mean(autoencoder_loss)  # using this will make it becomes normal autoencoder loss

    return triplet_loss, autoencoder_loss, num_pos_valid_triplets / (num_valid_triplets + 1e-16), num_pos_valid_triplets


def batch_hard_triplet_loss(sparse_input, input_label, encode):
    """Build the triplet loss over a batch of embeddings.

    For each anchor, we get the hardest positive and hardest negative to form a triplet.

    Args:
        labels: labels of the batch, of size (batch_size,)
        embeddings: tensor of shape (batch_size, embed_dim)
        margin: margin for triplet loss
        squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
                 If false, output is the pairwise euclidean distance matrix.

    Returns:
        triplet_loss: scalar tensor containing the triplet loss
    """

    # Get the pairwise distance matrix
    dotproduct = tf.matmul(encode, tf.transpose(encode))

    # For each anchor, get the hardest positive (similar items with smallest dotproduct)
    # First, we need to get a mask for every valid positive (they should have same label)
    mask_anchor_positive = _get_anchor_positive_triplet_mask(input_label)
    mask_anchor_positive = tf.to_float(mask_anchor_positive)

    # We add the maximum value in each row to the invalid positives (label(a) != label(n))
    max_anchor_dotproduct = tf.reduce_max(dotproduct, axis=1, keepdims=True)
    anchor_positive_dotproduct = dotproduct + max_anchor_dotproduct * (1.0 - mask_anchor_positive)

    # shape (batch_size, 1)
    hardest_positive_dotproduct = tf.reduce_min(anchor_positive_dotproduct, axis=1, keepdims=True)
    tf.summary.scalar("hardest_positive_dotproduct", tf.reduce_mean(hardest_positive_dotproduct))

    # For each anchor, get the hardest negative (dissimilar items with largest dotproduct)
    # First, we need to get a mask for every valid negative (they should have different labels)
    mask_anchor_negative = _get_anchor_negative_triplet_mask(input_label)
    mask_anchor_negative = tf.to_float(mask_anchor_negative)

    # We add the maximum value in each row to the invalid negatives (label(a) == label(n))
    anchor_negative_dotproduct = tf.multiply(mask_anchor_negative, dotproduct)

    # shape (batch_size,1)
    hardest_negative_dotproduct = tf.reduce_max(anchor_negative_dotproduct, axis=1, keepdims=True)
    tf.summary.scalar("hardest_negative_dotproduct", tf.reduce_mean(hardest_negative_dotproduct))

    # Combine biggest d(a, p) and smallest d(a, n) into final triplet loss
    triplet_dist = tf.maximum(hardest_negative_dotproduct - hardest_positive_dotproduct, 0.0)

    triplet_count = tf.to_float(tf.greater(triplet_dist,0.0))

    data_weight = tf.squeeze(triplet_count) + \
                  tf.reduce_sum(triplet_count * tf.to_float(tf.equal(dotproduct, hardest_positive_dotproduct)),0) + \
                  tf.reduce_sum(triplet_count * tf.to_float(tf.equal(dotproduct, hardest_negative_dotproduct)), 0)

    # Get final mean triplet loss
    triplet_loss = - tf.log_sigmoid(-triplet_dist) * triplet_count
    triplet_loss = tf.reduce_sum(triplet_loss) / (tf.reduce_sum(triplet_count) + 1e-16)

    return triplet_loss, data_weight, tf.reduce_sum(triplet_count) / tf.to_float(tf.shape(input_label)[0]), tf.reduce_sum(triplet_count)


def weighted_loss(sparse_input, input_data, decode, loss_func='cross_entropy', weight=None):
    _reduce_sum = tf.sparse.reduce_sum if sparse_input else tf.reduce_sum
    _to_dense = tf.sparse.to_dense if sparse_input else lambda x: x

    if weight is None: weight = tf.ones(tf.shape(input_data)[0])

    if loss_func == 'cross_entropy':
        autoencoder_loss = - tf.reduce_sum(_to_dense(input_data) * tf.log(decode+1e-16) + (1.-_to_dense(input_data)) * tf.log(1.-decode+1e-16), 1)
    elif loss_func == 'mean_squared':
        autoencoder_loss = tf.reduce_sum(tf.squared_difference(_to_dense(input_data),decode), 1)
    elif loss_func == 'cosine_proximity':
        autoencoder_loss = - tf.reduce_sum(tf.nn.l2_normalize(_to_dense(input_data),1) * tf.nn.l2_normalize(decode,1), 1)

    autoencoder_loss = tf.reduce_sum(autoencoder_loss * weight) / (tf.reduce_sum(weight) + 1e-16)

    return autoencoder_loss

In [7]:
# mnist = input_data.read_data_sets('MNIST_data/')
# x_train, y_train = mnist.train.images, mnist.train.labels

In [8]:
import pandas as pd
df = pd.read_csv('./newsCorpora.csv', header=None, sep='\t')

In [9]:
df = df.rename(columns={1: 'title', 4: 'category'})
df = df[['title', 'category']]
# TODO: Remove stopwords from titles.
def clean(x):
    for token in x.split():
        pass
df.title = df.title.map(clean)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.title)

In [29]:
from sklearn import preprocessing

In [31]:
y = df.category.values
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=0)

In [13]:
type(x_train)

scipy.sparse.csr.csr_matrix

In [8]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           b       0.85      0.87      0.86     38269
           e       0.90      0.93      0.92     50315
           m       0.86      0.81      0.84     15061
           t       0.88      0.85      0.87     35754

   micro avg       0.88      0.88      0.88    139399
   macro avg       0.87      0.87      0.87    139399
weighted avg       0.88      0.88      0.88    139399



In [32]:
y

array([0, 0, 0, ..., 2, 2, 2])

In [33]:
dae = DenoisingAutoencoder()
dae.fit(x_train, train_set_label=y_train)

ResourceExhaustedError: OOM when allocating tensor with shape[54637,5463] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node random_uniform_2/RandomUniform (defined at <ipython-input-4-5abf60a8e94c>:4)  = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/device:CPU:0"](random_uniform/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'random_uniform_2/RandomUniform', defined at:
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\asyncio\base_events.py", line 427, in run_forever
    self._run_once()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\asyncio\base_events.py", line 1440, in _run_once
    handle._run()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-8c168856a8ee>", line 2, in <module>
    dae.fit(x_train, train_set_label=y_train)
  File "<ipython-input-5-cb710f2e1073>", line 22, in fit
    self._build_model(n_features)
  File "<ipython-input-5-cb710f2e1073>", line 85, in _build_model
    self.W_, self.bh_, self.bv_ = self._create_variables(n_features)
  File "<ipython-input-5-cb710f2e1073>", line 101, in _create_variables
    W_ = tf.Variable(xavier_init(n_features, self.n_components, self.xavier_init), name='enc-W')
  File "<ipython-input-4-5abf60a8e94c>", line 4, in xavier_init
    return tf.random_uniform((n_features, n_components), minval=low, maxval=high)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\random_ops.py", line 243, in random_uniform
    rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_random_ops.py", line 771, in random_uniform
    name=name)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 3274, in create_op
    op_def=op_def)
  File "c:\users\young.a.lee\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[54637,5463] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node random_uniform_2/RandomUniform (defined at <ipython-input-4-5abf60a8e94c>:4)  = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/device:CPU:0"](random_uniform/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [9]:
embed_train = dae.transform(x_train)
embed_test = dae.transform(x_test)

INFO:tensorflow:Restoring parameters from ./mnist/
INFO:tensorflow:Restoring parameters from ./mnist/


In [10]:
# import matplotlib.pyplot as plt
# from matplotlib import offsetbox

In [11]:
# def plot_embedding(X, y, imgs=None, title=None):
#     # Adapted from http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html
#     x_min, x_max = np.min(X, 0), np.max(X, 0)
#     X = (X - x_min) / (x_max - x_min)

#     # Plot colors numbers
#     plt.figure(figsize=(10,10))
#     ax = plt.subplot(111)
#     for i in range(X.shape[0]):
#         # plot colored number
#         plt.text(X[i, 0], X[i, 1], str(y[i]),
#                  color=plt.cm.Set1(y[i] / 10.),
#                  fontdict={'weight': 'bold', 'size': 9})

#     # Add image overlays
#     if imgs is not None and hasattr(offsetbox, 'AnnotationBbox'):
#         # only print thumbnails with matplotlib > 1.0
#         shown_images = np.array([[1., 1.]])  # just something big
#         for i in range(X.shape[0]):
#             dist = np.sum((X[i] - shown_images) ** 2, 1)
#             if np.min(dist) < 4e-3:
#                 # don't show points that are too close
#                 continue
#             shown_images = np.r_[shown_images, [X[i]]]
#             imagebox = offsetbox.AnnotationBbox(
#                 offsetbox.OffsetImage(imgs[i], cmap=plt.cm.gray_r), X[i])
#             ax.add_artist(imagebox)

#     plt.xticks([]), plt.yticks([])
#     if title is not None:
#         plt.title(title)

In [3]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')

In [13]:
clf.fit(embed_train, y_train)
y_pred = clf.predict(embed_test)

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       980
           1       0.98      0.99      0.99      1135
           2       0.95      0.95      0.95      1032
           3       0.95      0.96      0.95      1010
           4       0.94      0.96      0.95       982
           5       0.95      0.95      0.95       892
           6       0.97      0.97      0.97       958
           7       0.96      0.95      0.96      1028
           8       0.95      0.94      0.95       974
           9       0.95      0.92      0.93      1009

   micro avg       0.96      0.96      0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000



In [4]:
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

NameError: name 'classification_report' is not defined