In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers, losses, metrics, callbacks, datasets
from tensorflow.keras.utils import plot_model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from joblib import dump, load
import matplotlib.pyplot as plt
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
seed=42
# Seed NumPy's global RNG alongside TensorFlow's so that any NumPy-driven
# randomness is repeatable as well.
np.random.seed(seed)
tf.random.set_seed(seed)



## Baseline: using a neural net on the already-labelled data

In [2]:
# Fashion-MNIST loader; load_data() returns the (train, test) splits as (images, labels) pairs.
fashion_mnist = datasets.fashion_mnist
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()

In [4]:
# Hold out the first 5000 training images for validation and divide the pixel
# values by 255 to rescale them.
X_valid, X_train = X_train_full[:5000] / 255., X_train_full[5000:] / 255.
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
# X_test is rebound to its rescaled version, so only rescale while it still has
# its raw integer dtype; otherwise re-running this cell would divide by 255 twice.
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test / 255.

In [5]:
# Peek at the first image of the raw training array (X_train_full itself is not rescaled)
X_train_full[0]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
          0,   0,  13,  73,   0,   0,   1,   4,   0,   0,   0,   0,   1,
          1,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,
          0,  36, 136, 127,  62,  54,   0,   0,   0,   1,   3,   4,   0,
          0,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   6,
          0, 102, 204, 176, 134, 144, 123,  23,   0,   0,   0,   0,  12,
         10,   0],
       [  

In [6]:
# Shape of a single training sample
X_train[0].shape

(28, 28)

In [5]:
# Display names for the ten classes; presumably indexed by the integer
# label id -- confirm against the dataset documentation.
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

In [6]:
# Baseline classifier: flatten each 28x28 input, two ReLU hidden layers
# (300 and 100 units), then a 10-way softmax output.
model = models.Sequential()
model.add(layers.Flatten(input_shape=[28, 28]))
model.add(layers.Dense(300, activation="relu"))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

In [7]:
# Plain SGD with sparse categorical cross-entropy; accuracy is tracked as the metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [8]:
# Train for 30 epochs, passing the validation split so it is evaluated during training.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
# Loss and accuracy of the baseline model on the test split
model.evaluate(x=X_test, y=y_test)



[0.33530178666114807, 0.882099986076355]

In [10]:
# Softmax outputs of the baseline model for every test image
y_proba = model.predict(x=X_test)



In [11]:
# Predicted class = index of the largest probability in each row
y_proba.argmax(axis=-1)

array([9, 2, 1, ..., 8, 1, 5], dtype=int64)

In [12]:
# Test labels, for comparison with the predicted classes above
y_test

array([9, 2, 1, ..., 8, 1, 5], dtype=uint8)

#### Next we use semi-supervised learning: we treat the data as unlabelled, label a subset of it using clustering, and then train a supervised model on those cluster labels to obtain labels for the rest of the data. We validate the accuracy of the neural network by checking it on already-labelled data that was not used for training.

## Let's use clustering to create the labels (K-means clustering)

In [13]:
## Use only the first 40000 images of X_train as the subset to be labelled by clustering
X_train_subset = X_train[:40000]
## one flattened row of pixels per image
X_train_subset_flatten = X_train_subset.reshape(len(X_train_subset), -1)
## cluster the subset into 50 groups
kmeans = KMeans(
    n_clusters=50,
    init='k-means++',
    random_state=0,
).fit(X_train_subset_flatten)



In [14]:
label=kmeans.labels_  ## cluster id assigned by KMeans to each sample of the subset (50 clusters)

In [15]:
## Train/test split for validation: the first 35000 clustered samples are for
## training, the remainder (and their cluster ids) are held out as the test set.
X_train_full_new = X_train_subset[:35000]
X_test_new = X_train_subset[35000:]
Y_train_full_new = label[:35000]
Y_test_new = label[35000:]

In [16]:
# First 5000 samples become the validation split, the rest is used for training.
X_valid_new = X_train_full_new[:5000]
X_train_new = X_train_full_new[5000:]
Y_valid_new = Y_train_full_new[:5000]
Y_train_new = Y_train_full_new[5000:]

In [17]:
# Same architecture as the baseline, but with a 50-way softmax output
# (one unit per KMeans cluster).
model = models.Sequential()
model.add(layers.Flatten(input_shape=[28, 28]))
model.add(layers.Dense(300, activation="relu"))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(50, activation="softmax"))

In [18]:
# Plain SGD with sparse categorical cross-entropy; accuracy is tracked as the metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [19]:
# Train on the cluster ids for 30 epochs, passing the validation split.
history = model.fit(
    x=X_train_new,
    y=Y_train_new,
    validation_data=(X_valid_new, Y_valid_new),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [20]:
# Loss and accuracy on the held-out samples, measured against their cluster ids
model.evaluate(x=X_test_new, y=Y_test_new)



[0.1952732354402542, 0.9193999767303467]

### Achieved an accuracy of about 92% (measured against the held-out cluster labels) by labelling the unlabelled data with 50 clusters

In [21]:
## achieved an accuracy of about 92% with 50 clusters

In [22]:
# Softmax outputs (one probability per cluster) for the held-out samples
y_proba_new = model.predict(x=X_test_new)



In [23]:
# Predicted cluster = index of the largest probability in each row
y_proba_new.argmax(axis=-1)

array([47, 10, 25, ..., 39,  5, 39], dtype=int64)

In [24]:
# Cluster ids of the held-out samples, for comparison with the predictions above
Y_test_new

array([47, 10, 25, ..., 39,  5, 28])

In [25]:
## Getting labels for the data which was not in our subset.
## model.predict returns one row of softmax probabilities per sample, so take
## the argmax of each row to turn it into a cluster label.
label_rest = np.argmax(model.predict(X_train[40000:]), axis=-1)



## Experimenting with different numbers of clusters
### Using 30 clusters

In [26]:

## Same 40000-image subset as before, this time clustered into 30 groups
X_train_subset_30 = X_train[:40000]
## one flattened row of pixels per image
X_train_subset_flatten_30 = X_train_subset_30.reshape(len(X_train_subset_30), -1)
kmeans = KMeans(
    n_clusters=30,
    init='k-means++',
    random_state=0,
).fit(X_train_subset_flatten_30)



In [27]:
label_30=kmeans.labels_  ## cluster id assigned by KMeans to each sample of the subset (30 clusters)

In [28]:
## Train/test split for validation: the first 35000 clustered samples are for
## training, the remainder (and their cluster ids) are held out as the test set.
X_train_full_new_30 = X_train_subset_30[:35000]
X_test_new_30 = X_train_subset_30[35000:]
Y_train_full_new_30 = label_30[:35000]
Y_test_new_30 = label_30[35000:]

In [29]:
# First 5000 samples become the validation split, the rest is used for training.
X_valid_new_30 = X_train_full_new_30[:5000]
X_train_new_30 = X_train_full_new_30[5000:]
Y_valid_new_30 = Y_train_full_new_30[:5000]
Y_train_new_30 = Y_train_full_new_30[5000:]

In [30]:
# Same architecture as before, with a 30-way softmax output
# (one unit per KMeans cluster).
model = models.Sequential()
model.add(layers.Flatten(input_shape=[28, 28]))
model.add(layers.Dense(300, activation="relu"))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(30, activation="softmax"))

In [31]:
# Plain SGD with sparse categorical cross-entropy; accuracy is tracked as the metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [32]:
# Train on the 30-cluster ids for 30 epochs, passing the validation split.
history = model.fit(
    x=X_train_new_30,
    y=Y_train_new_30,
    validation_data=(X_valid_new_30, Y_valid_new_30),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [33]:
# Loss and accuracy on the held-out samples, measured against their cluster ids
model.evaluate(x=X_test_new_30, y=Y_test_new_30)



[0.15311087667942047, 0.9362000226974487]

In [34]:
## Getting labels for the data which was not in our subset.
## model.predict returns one row of softmax probabilities per sample, so take
## the argmax of each row to turn it into a cluster label.
label_rest = np.argmax(model.predict(X_train[40000:]), axis=-1)



## With K=30 we achieve an accuracy of about 94%

## Taking K=10

In [35]:

## Same 40000-image subset as before, this time clustered into 10 groups
X_train_subset_10 = X_train[:40000]
## one flattened row of pixels per image
X_train_subset_flatten_10 = X_train_subset_10.reshape(len(X_train_subset_10), -1)
kmeans = KMeans(
    n_clusters=10,
    init='k-means++',
    random_state=0,
).fit(X_train_subset_flatten_10)



In [36]:
label_10=kmeans.labels_  ## label of the subset data with 10 clusters

In [37]:
## Train/test split for validation: the first 35000 clustered samples are for
## training, the remainder (and their cluster ids) are held out as the test set.
X_train_full_new_10 = X_train_subset_10[:35000]
X_test_new_10 = X_train_subset_10[35000:]
Y_train_full_new_10 = label_10[:35000]
Y_test_new_10 = label_10[35000:]

In [38]:
# First 5000 samples become the validation split, the rest is used for training.
X_valid_new_10 = X_train_full_new_10[:5000]
X_train_new_10 = X_train_full_new_10[5000:]
Y_valid_new_10 = Y_train_full_new_10[:5000]
Y_train_new_10 = Y_train_full_new_10[5000:]

In [39]:
# Same architecture as before, with a 10-way softmax output
# (one unit per KMeans cluster).
model = models.Sequential()
model.add(layers.Flatten(input_shape=[28, 28]))
model.add(layers.Dense(300, activation="relu"))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))

In [40]:
# Plain SGD with sparse categorical cross-entropy; accuracy is tracked as the metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [41]:
# Train on the 10-cluster ids for 30 epochs, passing the validation split.
history = model.fit(
    x=X_train_new_10,
    y=Y_train_new_10,
    validation_data=(X_valid_new_10, Y_valid_new_10),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [42]:
# Loss and accuracy on the held-out samples, measured against their cluster ids
model.evaluate(x=X_test_new_10, y=Y_test_new_10)



[0.0942617803812027, 0.9581999778747559]

In [43]:
## Getting labels for the data which was not in our subset.
## model.predict returns one row of softmax probabilities per sample, so take
## the argmax of each row to turn it into a cluster label.
label_rest = np.argmax(model.predict(X_train[40000:]), axis=-1)



## Taking K=100

In [44]:

## Same 40000-image subset as before, this time clustered into 100 groups
X_train_subset_100 = X_train[:40000]
## one flattened row of pixels per image
X_train_subset_flatten_100 = X_train_subset_100.reshape(len(X_train_subset_100), -1)
kmeans = KMeans(
    n_clusters=100,
    init='k-means++',
    random_state=0,
).fit(X_train_subset_flatten_100)



In [45]:
label_100=kmeans.labels_  ## cluster id assigned by KMeans to each sample of the subset (100 clusters)

In [46]:
## Train/test split for validation: the first 35000 clustered samples are for
## training, the remainder (and their cluster ids) are held out as the test set.
X_train_full_new_100 = X_train_subset_100[:35000]
X_test_new_100 = X_train_subset_100[35000:]
Y_train_full_new_100 = label_100[:35000]
Y_test_new_100 = label_100[35000:]

In [47]:
# First 5000 samples become the validation split, the rest is used for training.
X_valid_new_100 = X_train_full_new_100[:5000]
X_train_new_100 = X_train_full_new_100[5000:]
Y_valid_new_100 = Y_train_full_new_100[:5000]
Y_train_new_100 = Y_train_full_new_100[5000:]

In [48]:
# 100-way softmax output (one unit per KMeans cluster).
# NOTE(review): the second hidden layer has 200 units here, whereas the other
# models in this notebook use 100 -- confirm this difference is intended.
model = models.Sequential()
model.add(layers.Flatten(input_shape=[28, 28]))
model.add(layers.Dense(300, activation="relu"))
model.add(layers.Dense(200, activation="relu"))
model.add(layers.Dense(100, activation="softmax"))

In [49]:
# Plain SGD with sparse categorical cross-entropy; accuracy is tracked as the metric.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [50]:
# Train on the 100-cluster ids for 30 epochs, passing the validation split.
history = model.fit(
    x=X_train_new_100,
    y=Y_train_new_100,
    validation_data=(X_valid_new_100, Y_valid_new_100),
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [51]:
# Loss and accuracy on the held-out samples, measured against their cluster ids
model.evaluate(x=X_test_new_100, y=Y_test_new_100)



[0.27845636010169983, 0.8920000195503235]

### With 100 clusters we get an accuracy of about 89.2%

In [52]:
## Getting labels for the data which was not in our subset.
## model.predict returns one row of softmax probabilities per sample, so take
## the argmax of each row to turn it into a cluster label.
label_rest = np.argmax(model.predict(X_train[40000:]), axis=-1)



## conclusion

#### If we decrease the number of clusters, the accuracy increases, but the number of labels decreases, which does not give a good classification because two different objects can end up with the same label. If we increase the number of labels (i.e. the number of clusters), the accuracy of classifying into the different labels decreases.