<a href="https://colab.research.google.com/github/tanmay-ps/DL101-Project1-C-A-/blob/main/Final_AI_Image_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#                                AI IMAGE CLASSIFIER

##### This project involved building a machine learning model that can effectively distinguish real images from those generated by AI. The first step was to create a dataset comprising AI-generated and real images. To accomplish this, the pygoogle_image library is used to download images from Google. Apart from Google Images, two more datasets are used. Then, a Convolutional Neural Network based classifier model was constructed and trained on the dataset containing around 100,000 images.

#### First, let's import all the necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import os
import cv2 as cv
from pygoogle_image import image as pi
import random
import PIL
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, Sequential
from tensorflow.keras.layers import BatchNormalization, Activation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report


# ====================================================================
# PART 1: DATA GATHERING & PREPROCESSING (BASELINE)
# ====================================================================

### Generating a Dataset

##### Here, we used the pygoogle_image library to download Google images
# Note: This cell is commented out as it only needs to be run once.
# pi.download(keywords="ai generated images", limit=100, directory='./ai_generated/')
# pi.download(keywords="ai generated art", limit=100, directory='./ai_generated/')
# pi.download(keywords="ai generated characters", limit=100, directory='./ai_generated/')
# pi.download(keywords='stable diffusion', limit=100, directory='./ai_generated/')
# pi.download(keywords='dalle2 generated images', limit=100, directory='./ai_generated/')
# pi.download(keywords='midjourney', limit=100, directory='./ai_generated/')
# pi.download(keywords='landscapes', limit=100, directory='./real/')
# pi.download(keywords='cityscapes', limit=100, directory='./real/')
# pi.download(keywords='animals', limit=100, directory='./real/')
# pi.download(keywords='vehicles', limit=50, directory='./real/')
# pi.download(keywords='traffic', limit=50, directory='./real/')
# pi.download(keywords='offices', limit=50, directory='./real/')
# pi.download(keywords='real food images', limit=50, directory='./real/')

##### Since the data collected from Google Images is not enough for training a model, we have used two more datasets
#
# 1. CIFAKE: Real and AI-Generated Synthetic Images
# 2. Ai Generated Images | Images Created using Ai from Kaggle

#### Now, we will be pre-processing the data
# This cell loads the 100k images from the external datasets and saves them to pickle files.

# data = "./dataset_train/"
# categories = ['Real', 'AIGenerated']
# img_size = 48
# training_data = []

# i = 0
# for category in categories:
#     path = os.path.join(data,category)
#     classes = categories.index(category)
#     for img in os.listdir(path):
#         i = i + 1
#         img_array = cv.imread(os.path.join(path,img))
#         if img_array is None:
#             continue
#         new_array = cv.resize(img_array, (48,48))
#         new_array = new_array/255
#         training_data.append([new_array, classes])

# random.shuffle(training_data)

# X_train = []
# y_train = []

# for features, label in training_data:
#     X_train.append(features)
#     y_train.append(label)

# X_train = np.array(X_train).reshape(-1, img_size, img_size, 3)
# y_train = np.array(y_train)

#### Storing X_train and y_train
# pickle_out = open("X_train.pickle", "wb")
# pickle.dump(X_train, pickle_out, protocol=4)
# pickle_out.close()

# pickle_out = open("y_train.pickle", "wb")
# pickle.dump(y_train, pickle_out, protocol=4)
# pickle_out.close()

# ====================================================================
# PART 2: BASELINE MODEL (TRAINED & SAVED)
# ====================================================================

# Load the preprocessed data from pickle files
# (Assuming this data is already generated and present)
#
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only load
# pickles that were produced locally by the preprocessing step above.
#
# `with` guarantees the file handle is closed even if unpickling fails.
with open("X_train.pickle", "rb") as pickle_in:
    X_train = pickle.load(pickle_in)

with open("y_train.pickle", "rb") as pickle_in:
    y_train = pickle.load(pickle_in)

print(f"Loaded X_train shape: {X_train.shape}")
print(f"Loaded y_train shape: {y_train.shape}")

## Creating the Baseline Model
# Four identical conv stages (Conv2D -> MaxPool -> Dropout) with a growing
# number of filters, followed by a small dense head ending in one sigmoid unit.
baseline_layers = []
for stage_index, n_filters in enumerate((32, 64, 128, 256)):
    conv_kwargs = {'activation': 'relu'}
    if stage_index == 0:
        # Only the first layer declares the input shape.
        conv_kwargs['input_shape'] = (48, 48, 3)
    baseline_layers += [
        keras.layers.Conv2D(n_filters, (3, 3), **conv_kwargs),
        keras.layers.MaxPool2D((2, 2)),
        keras.layers.Dropout(0.2),
    ]

baseline_layers.append(keras.layers.Flatten())
for _ in range(2):
    baseline_layers.append(keras.layers.Dense(64, activation='relu'))
baseline_layers.append(keras.layers.Dense(1, activation='sigmoid'))

model_baseline = keras.Sequential(baseline_layers)

# Binary classification: sigmoid output paired with binary cross-entropy.
model_baseline.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_baseline.summary()

## Training the Baseline Model
# (Skipping this as the notebook shows it's already done and saved)
# model_baseline.fit(X_train, y_train, epochs=15)
# model_baseline.save("AIGeneratedModel.h5")


## Evaluating the Baseline Model
print("Loading saved baseline model for testing...")
model_new = keras.models.load_model("AIGeneratedModel.h5")

# Load and process test data
data_test = "./dataset_test/"
categories_test = ['Real', 'AIGenerated']  # list index doubles as the class label
img_size = 48
testing_data = []

# Check every class directory up front. Evaluating with only some of the
# classes loaded would produce a misleading report, so if any directory is
# missing nothing is loaded and the evaluation below is skipped entirely.
missing_dirs = [
    os.path.join(data_test, category)
    for category in categories_test
    if not os.path.isdir(os.path.join(data_test, category))
]

if missing_dirs:
    for path in missing_dirs:
        print(f"Test directory not found: {path}. Skipping evaluation.")
else:
    for classes, category in enumerate(categories_test):
        path = os.path.join(data_test, category)
        for img in os.listdir(path):
            img_array = cv.imread(os.path.join(path, img))
            if img_array is None:
                # cv.imread returns None for files it cannot decode; skip them.
                continue
            new_array = cv.resize(img_array, (img_size, img_size))
            # Scale pixel values to [0, 1], same as the training preprocessing.
            new_array = new_array / 255
            testing_data.append([new_array, classes])

if testing_data:
    random.shuffle(testing_data)

    X_test = np.array(
        [features for features, _ in testing_data]
    ).reshape(-1, img_size, img_size, 3)
    y_test = np.array([label for _, label in testing_data])

    print("Evaluating baseline model on test data...")
    model_new.evaluate(X_test, y_test)

    # The model emits one sigmoid probability per image; > 0.5 means class 1
    # ('AIGenerated'), otherwise class 0 ('Real').
    y_pred = model_new.predict(X_test)
    y_predicted = [1 if arr[0] > 0.5 else 0 for arr in y_pred]

    print("\nClassification Report (Baseline Model):")
    print(classification_report(y_test, y_predicted))


## Testing (Demo Function)
def find_out(path_img, model_to_use):
    """Classify a single image file as real or AI generated and display it.

    The image is read with OpenCV, shown with matplotlib, resized to 48x48,
    scaled to [0, 1] and passed to ``model_to_use.predict`` as a batch of one.
    The verdict is printed; nothing is returned.

    Args:
        path_img: Path of the image file to classify.
        model_to_use: Model object exposing ``predict``; its output is indexed
            as ``y[0][0]`` and compared against 0.5.
    """
    # Kept local so the function does not depend on a module-level `img_size`
    # being defined (and equal to the size used in the resize below).
    input_size = 48

    img_arr = cv.imread(path_img)
    if img_arr is None:
        print(f"Error: Could not read image at {path_img}")
        return

    plt.imshow(cv.cvtColor(img_arr, cv.COLOR_BGR2RGB)) # Convert BGR to RGB for plt

    # The model input stays in OpenCV's BGR order, i.e. exactly what
    # cv.imread returns; only the displayed copy is converted to RGB.
    resized = cv.resize(img_arr, (input_size, input_size))
    scaled = resized / 255
    # Add a leading batch axis: (48, 48, 3) -> (1, 48, 48, 3).
    test = np.expand_dims(scaled, axis=0)

    y = model_to_use.predict(test)

    if y[0][0] <= 0.5:
        print("Prediction: The given image is Real.")
    else:
        print("Prediction: The given image is AI Generated.")
    plt.show()

# Demo with baseline model
# Runs the classifier on one known-real and one known-AI sample image,
# reporting any sample that is not present on disk.
print("\n--- Baseline Model Demo ---")
path_img_real = './Testing/Real.jpeg'
path_img_ai = './Testing/AIGenerated.png'

for demo_path in (path_img_real, path_img_ai):
    if not os.path.exists(demo_path):
        print(f"Demo image not found: {demo_path}")
        continue
    find_out(demo_path, model_new)


# ====================================================================
# PART 3: UPDATED MODEL (DATA AUGMENTATION & BATCH NORM)
# ====================================================================

## 1. Introduce Data Augmentation
# This loads X_train and y_train again, as they are needed for the generator.
# (This step was missing in the original notebook's final cells)
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only load
# pickles that were produced locally by the preprocessing step.
try:
    # Only the file loads live inside the try, so a FileNotFoundError raised
    # anywhere else is not mistaken for "training data missing".
    with open("X_train.pickle", "rb") as pickle_in:
        X_train_aug = pickle.load(pickle_in)

    with open("y_train.pickle", "rb") as pickle_in:
        y_train_aug = pickle.load(pickle_in)

except FileNotFoundError:
    print("\nWarning: X_train.pickle or y_train.pickle not found.")
    print("Skipping augmentation and training of the new model.")
    # Sentinel checked before training the updated model.
    train_datagen = None

else:
    print("\nData loaded for augmentation.")

    # Instantiate ImageDataGenerator
    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    # Fit the ImageDataGenerator
    # NOTE(review): per the Keras docs, fit() is only required when
    # featurewise_center, featurewise_std_normalization or zca_whitening is
    # enabled; none is set here, so this call can likely be dropped - confirm.
    datagen.fit(X_train_aug)

    # Create an augmented data generator
    train_datagen = datagen.flow(X_train_aug, y_train_aug, batch_size=32)
    print("ImageDataGenerator instantiated and fitted.")


## 2. Refine Model Architecture (with Batch Normalization)
print("\nDefining new model with Batch Normalization...")

# Each conv stage is Conv2D -> BatchNorm -> ReLU -> MaxPool -> Dropout; the
# activation is a separate layer so normalization is applied before it.
updated_layers = []
for n_filters in (32, 64, 128, 256):
    if not updated_layers:
        # Only the first layer declares the input shape.
        conv_layer = layers.Conv2D(n_filters, (3, 3), input_shape=(48, 48, 3))
    else:
        conv_layer = layers.Conv2D(n_filters, (3, 3))
    updated_layers.extend([
        conv_layer,
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPool2D((2, 2)),
        layers.Dropout(0.2),
    ])

updated_layers.append(layers.Flatten())

# Two dense stages, each Dense -> BatchNorm -> ReLU.
for _ in range(2):
    updated_layers.extend([
        layers.Dense(64),
        layers.BatchNormalization(),
        layers.Activation('relu'),
    ])

updated_layers.append(layers.Dense(1, activation='sigmoid'))

model_updated = keras.Sequential(updated_layers)

# Binary classification: sigmoid output paired with binary cross-entropy.
model_updated.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_updated.summary()


## 3. Train the Updated Model
# This step trains the new model using the augmented data generator.
# `train_datagen` is None when the training pickles could not be loaded.
# Compare against None explicitly: the iterator defines __len__, so a plain
# truthiness test would depend on how many batches it holds.
if train_datagen is not None:
    print("\nTraining updated model with augmented data...")
    # len() of the iterator is the number of batches per pass over the data
    # (including the final partial batch), so the batch size is not repeated
    # here and a dataset smaller than one batch still yields at least 1 step.
    steps_per_epoch = len(train_datagen)

    history_updated = model_updated.fit(
        train_datagen,
        steps_per_epoch=steps_per_epoch,
        epochs=15  # You can increase epochs as augmentation reduces overfitting
    )

    print("Training of updated model complete.")

    # Save the new model
    model_updated.save("AIGeneratedModel_Updated.h5")
    print("Updated model saved as AIGeneratedModel_Updated.h5")

    # (Optional) Evaluate the new model on the test set.
    # X_test / y_test only exist if the test images were found and loaded
    # during the baseline evaluation.
    if 'X_test' in globals() and 'y_test' in globals():
        print("\nEvaluating updated model on test data...")
        model_updated.evaluate(X_test, y_test)

        # One sigmoid probability per image; > 0.5 means class 1.
        y_pred_updated = model_updated.predict(X_test)
        y_predicted_updated = [1 if arr[0] > 0.5 else 0 for arr in y_pred_updated]

        print("\nClassification Report (Updated Model):")
        print(classification_report(y_test, y_predicted_updated))
else:
    print("\nSkipping training of updated model as data was not loaded.")

ModuleNotFoundError: No module named 'pygoogle_image'