<a href="https://colab.research.google.com/github/rajvirvyas/Senior-Project/blob/main/Image_Processing_PRETRAINED_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNN approach using Kaggle Dataset**

First, I will need to use data generators to load images from the directories for training, testing and validation.

In [1]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGLE_USR"] = userdata.get('KAGGLE_USR')

In [2]:
!kaggle datasets download -d marcozuppelli/stegoimagesdataset

! unzip -qq "stegoimagesdataset.zip"

Dataset URL: https://www.kaggle.com/datasets/marcozuppelli/stegoimagesdataset
License(s): DbCL-1.0
Downloading stegoimagesdataset.zip to /content
100% 1.51G/1.51G [00:13<00:00, 210MB/s]
100% 1.51G/1.51G [00:13<00:00, 118MB/s]


In [6]:
import numpy as np
import cv2
from PIL import Image
import random

def logistic_map(x,r):
    return r * x * (1-x)

def generate_time_series(size,r=5.2, seed=0.5):
    x = seed # common seed
    time_series = [x]

    for _ in range(size - 1):
        x = logistic_map(x,r) # r can be chosen to be any num
        x = (x- np.floor(x)) #limit between 0 and 1
        time_series.append(x)

    return np.array(time_series)

def apply_noise(binary_secret, noisy_time_series):
    blist = list(binary_secret)
    binary_length = len(blist)

    for i in range (binary_length):
        bit = int(blist[i])
        magic_num = noisy_time_series[i % len(noisy_time_series)]
        bit = bit ^ int(magic_num * 2) #xor
        blist[i] = str(bit)

    return "".join(blist)

def extract_noise(imagepath, size):
    image = Image.open(imagepath)
    width, height = image.size
    noisy_secret = ''
    n = 0

    for i in range(0, width):
        for j in range(0, height):
            pixel = list(image.getpixel((i, j)))
            for val in range(0, 3):
                if n < size:
                    noisy_secret += str(pixel[val] & 1)
                    n += 1
                else:
                    break
            if n >= size:
                break
        if n >= size:
            break

    return noisy_secret.zfill(size)


In [7]:
def add_chaotic_noise_to_dataset(stego_image):
  lsb_data = ''.join(str(pixel & 1) for pixel in stego_image.flat)
  noisy_time_series = generate_time_series(len(lsb_data))
  noisy_lsb_data = apply_noise(lsb_data, noisy_time_series)

  noisy_stego_image = stego_image.copy() #put noisy data into image again
  for i in range(len(noisy_lsb_data)):
    noisy_stego_image.flat[i] = (noisy_stego_image.flat[i] & ~1) | int(noisy_lsb_data[i])

  return noisy_stego_image

Ok, now that I have functions for doing all this fuss, lets run the noisy embedding on the kaggle dataset

In [8]:
import os
from tqdm import tqdm

def convert_to_noisy_embeddings(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    for filename in tqdm(os.listdir(input_dir), desc="Processing images"):
        if filename.endswith(".png"):
            image_path = os.path.join(input_dir, filename)
            stego_image = cv2.imread(image_path)

            noisy_stego_image = add_chaotic_noise_to_dataset(stego_image)
            output_path = os.path.join(output_dir, filename)
            cv2.imwrite(output_path, noisy_stego_image)


In [9]:
convert_to_noisy_embeddings('/content/train/train/stego', '/content/train/train/noisy_stego')

Processing images:   0%|          | 20/12000 [02:15<22:34:38,  6.78s/it]


KeyboardInterrupt: 

In [20]:
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import numpy as np


def make_data_generators(train_dir, val_dir, batch_size=32,epochs=10):
  train_datagen = ImageDataGenerator(rescale= 1./255)  # applied normalization of pixel values
  train_generator = train_datagen.flow_from_directory(
      '/content/train/train', #training directory
      target_size= (128, 128),
      batch_size= batch_size,
      class_mode= 'binary'
  )

  validation_datagen = ImageDataGenerator(rescale= 1./255)  # applied normalization of pixel values
  validation_generator = validation_datagen.flow_from_directory(
      '/content/val', #validation directory
      target_size= (128, 128),
      batch_size= batch_size,
      class_mode= 'binary'
  )

  # testing_datagen = ImageDataGenerator(rescale= 1./255)  # applied normalization of pixel values
  # testing_generator = testing_datagen.flow_from_directory(
  #     '', #training directory
  #     target_size= (128, 128),
  #     batch_size= 32,
  #     class_mode= 'binary'
  # )
  return train_generator, validation_generator

def get_class_weights(train_generator):
  classes = np.array([0] * train_generator.classes.shape[0])
  for i in range(len(train_generator.classes)):
    classes[i] = train_generator.classes[i]
  class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(classes), y=classes)
  class_weights = dict(enumerate(class_weights))
  return class_weights

def training_steganography_model(model, train_dir, val_dir, epochs=10):
  early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
  lr_reducer = ReduceLROnPlateau(factor=0.2, patience=3, min_lr=0.00001)

  train_generator, validation_generator = make_data_generators(train_dir, val_dir, epochs)
  class_weights = get_class_weights(train_generator)

  history =model.fit(
      train_generator,
      epochs=epochs,
      validation_data = validation_generator,
      callbacks=[early_stop, lr_reducer],
      class_weight=class_weights
  )
  return history

def fine_tuning_model(model, train_generator, validation_generator, epochs=10):
  for layer in model.layers[-15:]:
    layer.trainable = True
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])

  history = model.fit(
      train_generator,
      epochs=epochs,
      validation_data = validation_generator,
      callbacks=[EarlyStopping(patience=3, restore_best_weights=True)]
  )
  return history



ok lets try using resnet50

In [17]:
from tensorflow.keras.applications import ResNet50, VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils import class_weight
from tensorflow.keras.optimizers import Adam

def build_classificaton_model(model_type,shape):
    if model_type == 'resnet50':
      base_model = tf.keras.applications.ResNet50(weights='imagenet', include_top=False, input_shape=shape)
    else:
      base_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=shape)

    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    predictions = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [18]:
model_type = 'resnet50'
model = build_classificaton_model(model_type,(128,128,3))
#model.summary()
train_generator, validation_generator = make_data_generators('/content/train/train', '/content/val', batch_size=16, epochs=10)
history = training_steganography_model(model, train_generator, validation_generator, epochs=10)



Found 16000 images belonging to 2 classes.
Found 8000 images belonging to 1 classes.
Found 16000 images belonging to 2 classes.
Found 8000 images belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


AttributeError: 'Functional' object has no attribute 'layer'

In [21]:
finetuned = fine_tuning_model(model, train_generator, validation_generator, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [22]:
model_type = 'vgg16'
model = build_classificaton_model(model_type,(128,128,3))
#model.summary()
train_generator, validation_generator = make_data_generators('/content/train/train', '/content/val', batch_size=16, epochs=10)
history = training_steganography_model(model, train_generator, validation_generator, epochs=10)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 16000 images belonging to 2 classes.
Found 8000 images belonging to 1 classes.
Found 16000 images belonging to 2 classes.
Found 8000 images belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
finetuned = fine_tuning_model(model, train_generator, validation_generator, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


**DO NOT USE THE FOLLOWING CODE**

In [12]:
from keras import layers,models

def CNN_algorithm(shape):
  model = models.Sequential()
  model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=shape))
  model.add(layers.MaxPooling2D((2, 2)))
  model.add(layers.Conv2D(64, (3, 3), activation='relu'))
  model.add(layers.MaxPooling2D((2, 2)))
  model.add(layers.Conv2D(128, (3, 3), activation='relu'))
  model.add(layers.MaxPooling2D((2, 2)))
  model.add(layers.Flatten())
  model.add(layers.Dense(128, activation='relu'))
  model.add(layers.Dense(1, activation='sigmoid'))
  #model.summary()
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

model=CNN_algorithm((128,128,3))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 126, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 30, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 28, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 14, 14, 128)       0

In [None]:
history =model.fit(
    train_generator,
    epochs=10,
    validation_data = validation_generator
)

Epoch 1/10
Epoch 2/10
Epoch 3/10


I observed that accuracy doesn't really change and my val_accuracy stays 0. I think it might be worth looking into a different number of layers, or different optimizer or learning rate as well. Also, maybe I need to shuffle the dataset?

In [None]:


def count_images(directory):
    count = 0
    for subdir, _, files in os.walk(directory):
        count += len(files)
    return count

print("Training cover images:", count_images(os.path.join('/content/train/train', 'clean')))
print("Training stego images:", count_images(os.path.join('/content/train/train', 'stego')))
#print("Validation cover images:", count_images(os.path.join(val_dir, 'cover')))
#print("Validation stego images:", count_images(os.path.join(val_dir, 'stego')))


Training cover images: 4000
Training stego images: 12000


There is an issue of data imbalance: 4000 cover images for 12000 stego images;
This can cause my model to work poorly due to biased training.

Before addressing this issue, I need to address that the benchmark paper also utilized a deep learning approach using neural networks in order to detect stego, but their model has some finetuning I must be missing since they got about 99.2% accuracy.