In [1]:
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import tensorflow as tf
import tensorflow_datasets as tfds
from keras.datasets import imdb

In [2]:

from sklearn.utils import resample
from keras.preprocessing.text import Tokenizer
from keras import preprocessing

# Vocabulary size: only the `max_words` most frequent words are kept.
max_words = 10000

# Load IMDB reviews; each review is a list of integer word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

# Draw a 15000-review subset to act as the "unlabeled" pool.
# Stratify on the labels so the pool keeps the class balance of the training
# set (the features themselves are not class labels).
x_unlabeled = resample(
    x_train, n_samples=15000, replace=False, stratify=y_train, random_state=0
)

# Labels are 0..K-1, so the number of classes is the largest label + 1.
num_classes = int(np.max(y_train)) + 1

# Multi-hot (bag-of-words) encoding: one row per review, one column per word.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
x_unlabeled_ = tokenizer.sequences_to_matrix(x_unlabeled, mode='binary')

# One-hot encode the labels for the softmax / categorical cross-entropy head.
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

# Quick sanity check of one encoded sample and its label.
print(x_train[0])
print(len(x_train[0]))

print(y_train[0])
print(len(y_train[0]))


# For embedding purposes, make sure all samples have the same length.
maxlen = 1000  # every input is cut/padded to 1000 positions

# NOTE(review): at this point x_train / x_test are the (n, max_words) multi-hot
# matrices built above, not integer word-index sequences. pad_sequences on them
# only cuts each row down to `maxlen` columns (with the default settings this
# keeps the trailing columns and casts to int32), so the model receives a
# stream of 0/1 values. If the Embedding layer is meant to see word indices,
# the raw imdb sequences should be padded instead -- which also requires the
# Embedding `input_dim` to equal the vocabulary size. Confirm intent before
# changing the inputs.
x_train_ = tf.keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_test_ = tf.keras.utils.pad_sequences(x_test, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[0. 1. 1. ... 0. 0. 0.]
10000
[0. 1.]
2


In [6]:
from keras.metrics import categorical_accuracy, categorical_crossentropy
from keras.utils import to_categorical

class CustomCallback(tf.keras.callbacks.Callback):
  """Ramp a loss weight ``alpha_w`` over epochs and provide the loss/metric
  functions that use it.

  The weight follows a piecewise-linear schedule, updated at the end of
  each epoch:

  * 0.0 while ``epoch < RAMP_START_EPOCH``
  * ``ALPHA_MAX`` once ``epoch >= RAMP_END_EPOCH``
  * linear interpolation in between

  ``alpha_w`` is a non-trainable ``tf.Variable``. When the training step is
  compiled into a graph (the Keras default), a plain Python float read inside
  the loss is baked in as a constant at trace time, so later updates from the
  callback would never reach the loss. A variable is read on every step.

  Attributes:
    alpha_w: current ramp weight (scalar ``tf.Variable``).
    n_classes: number of leading label columns that hold the one-hot target.
  """

  # Ramp schedule constants.
  RAMP_START_EPOCH = 10
  RAMP_END_EPOCH = 70
  ALPHA_MAX = 3.0

  def __init__(self):
    # Initialise the Keras base class before adding our own state.
    super().__init__()
    self.alpha_w = tf.Variable(0.0, trainable=False, dtype=tf.float32)
    self.n_classes = 2

  def _alpha_for_epoch(self, epoch):
    """Return the scheduled weight (a Python float) for a 0-based epoch index."""
    if epoch < self.RAMP_START_EPOCH:
      return 0.0
    if epoch >= self.RAMP_END_EPOCH:
      return self.ALPHA_MAX
    ramp_length = float(self.RAMP_END_EPOCH - self.RAMP_START_EPOCH)
    return (epoch - self.RAMP_START_EPOCH) / ramp_length * self.ALPHA_MAX

  def on_train_begin(self, logs=None):
    """Start every fit that attaches this callback from a zero weight."""
    self.alpha_w.assign(0.0)

  def on_train_end(self, logs=None):
    """Reset the weight so it does not carry over into fits that do not
    attach this callback (their loss would otherwise stay scaled)."""
    self.alpha_w.assign(0.0)

  def on_epoch_end(self, epoch, logs=None):
    """Update ``alpha_w`` from the schedule.

    Args:
      epoch: 0-based index of the epoch that just finished.
      logs: metric dict supplied by Keras; unused.
    """
    self.alpha_w.assign(self._alpha_for_epoch(epoch))

  def loss_function(self, y, y_pred):
    """Categorical cross-entropy scaled by ``1 + alpha_w``.

    ``y_pred`` is passed to ``categorical_crossentropy`` with its default
    settings, i.e. it is treated as probabilities rather than logits.

    Args:
      y: one-hot targets.
      y_pred: predicted class probabilities.

    Returns:
      Per-sample weighted cross-entropy.
    """
    entropies = categorical_crossentropy(y, y_pred)
    # Match dtypes so the multiplication is valid whatever precision is used.
    coefs = 1.0 + tf.cast(self.alpha_w, entropies.dtype)
    return coefs * entropies

  def accuracy(self, y, y_pred):
    """Categorical accuracy on the first ``n_classes`` label columns.

    Args:
      y: targets; only columns ``[:n_classes]`` are compared.
      y_pred: predicted class probabilities.

    Returns:
      Per-sample 0/1 accuracy as returned by ``categorical_accuracy``.
    """
    y_true_item = y[:, :self.n_classes]
    return categorical_accuracy(y_true_item, y_pred)
  

In [None]:
# Colab-only cell: google.colab is importable only inside a Google Colab runtime.
from google.colab import drive

# Mount Google Drive under /content/gdrive/, forcing a fresh mount.
drive.mount('/content/gdrive/', force_remount=True)
# Path of a weights file on the mounted drive.
# NOTE(review): `filepath` is not referenced by any other visible cell -- confirm it is still needed.
filepath = "/content/gdrive/MyDrive/den_yelp_notebooks/weights.hdf5"

In [7]:
def keras_model():
  """Build the uncompiled Embedding -> LSTM -> Dense classifier.

  Reads the notebook-level globals ``max_words`` (vocabulary size) and
  ``num_classes`` (width of the softmax output).

  Returns:
    A ``tf.keras.Sequential`` model; the caller is responsible for
    ``compile``.
  """
  model = tf.keras.Sequential()

  # `input_dim` of an Embedding is the vocabulary size (largest token id + 1),
  # not the sequence length; every id fed to the layer must be below it.
  model.add(tf.keras.layers.Embedding(max_words, 5))
  model.add(tf.keras.layers.LSTM(5))

  # No `input_shape` here: this is not the first layer, and its input is the
  # LSTM output rather than a `maxlen`-wide vector.
  model.add(tf.keras.layers.Dense(512, activation="relu"))
  # Drop units with probability 0.5.
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.Dense(num_classes, activation="softmax"))
  return model

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
# The callback instance supplies both the loss and the accuracy metric.
call_back = CustomCallback()

# Build the network, then wire the callback's functions into compile.
model = keras_model()
model.compile(
    optimizer="adam",
    loss=call_back.loss_function,
    metrics=[call_back.accuracy],
)

In [9]:
# Alternate supervised rounds (even i) with pseudo-label rounds (odd i).
for i in range(5):
  if i % 2 == 0:
    # Even rounds: fit on the labeled training set.
    model.fit(x_train_, y_train, epochs=100, verbose=2, batch_size=700)
  else:
    # Odd rounds: draw a batch from the unlabeled pool. The pool has no labels
    # to stratify on, so this is a plain sample without replacement; seeding
    # with the round index keeps runs reproducible while giving each round a
    # different draw.
    x_unlabeled_sam = resample(
        x_unlabeled_, n_samples=1000, replace=False, random_state=i
    )
    # Bring the batch to the same width as the labeled / validation inputs.
    x_unlabeled_sam = tf.keras.utils.pad_sequences(x_unlabeled_sam, maxlen=maxlen)

    # Label the batch with the current model's most likely class, one-hot encoded.
    pseudo_labels = model.predict(x_unlabeled_sam)
    pseudo_labels_ = np.argmax(pseudo_labels, axis=-1)
    pseudo_labels_ = tf.keras.utils.to_categorical(pseudo_labels_, num_classes)

    # Fit on the pseudo-labeled batch; the callback ramps the loss weight.
    history = model.fit(
        x_unlabeled_sam,
        pseudo_labels_,
        epochs=30,
        verbose=2,
        callbacks=[call_back],
        batch_size=500,
        validation_data=(x_test_[:700], y_test[:700]),
    )
  

Epoch 1/100
36/36 - 31s - loss: 0.6934 - accuracy: 0.4924 - 31s/epoch - 854ms/step
Epoch 2/100
36/36 - 29s - loss: 0.6932 - accuracy: 0.4959 - 29s/epoch - 797ms/step
Epoch 3/100
36/36 - 28s - loss: 0.6932 - accuracy: 0.4988 - 28s/epoch - 777ms/step
Epoch 4/100
36/36 - 29s - loss: 0.6932 - accuracy: 0.5011 - 29s/epoch - 795ms/step
Epoch 5/100
36/36 - 28s - loss: 0.6932 - accuracy: 0.5006 - 28s/epoch - 769ms/step
Epoch 6/100
36/36 - 28s - loss: 0.6933 - accuracy: 0.4977 - 28s/epoch - 781ms/step
Epoch 7/100
36/36 - 27s - loss: 0.6931 - accuracy: 0.5050 - 27s/epoch - 753ms/step
Epoch 8/100
36/36 - 27s - loss: 0.6932 - accuracy: 0.4964 - 27s/epoch - 743ms/step
Epoch 9/100
36/36 - 27s - loss: 0.6932 - accuracy: 0.4978 - 27s/epoch - 758ms/step
Epoch 10/100
36/36 - 27s - loss: 0.6932 - accuracy: 0.4957 - 27s/epoch - 764ms/step
Epoch 11/100
36/36 - 28s - loss: 0.6932 - accuracy: 0.4956 - 28s/epoch - 778ms/step
Epoch 12/100
36/36 - 28s - loss: 0.6932 - accuracy: 0.4975 - 28s/epoch - 775ms/step
E

KeyboardInterrupt: ignored

In [None]:
# Per-epoch metrics recorded by the most recent `model.fit` stored in `history`.
h_d = history.history
loss_values = h_d['loss']
val_loss_values = h_d["val_loss"]
# 1-based epoch axis, sized from the series that is actually plotted.
epochs = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label="Training Loss")
ax.plot(epochs, val_loss_values, 'b', label="Validation Loss")
ax.set_title("Training and Validation loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()