In [1]:
import numpy as np
import pandas as pd 
import cv2 as cv
import os
import random
import PIL
import pickle 
import matplotlib.pyplot as plt

In [3]:
data = r'D:\Vista 25\Dataset\vista-25\dataset'
sets = ['train', 'test']
categories = ['real', 'fake']

In [5]:

img_size = 64
all_train_images = []
test_images = []
random_seed = 42

# Set random seed for reproducibility
np.random.seed(random_seed)
random.seed(random_seed)

# Collect all image paths with their category labels
for dataset in sets:  # Renamed loop variable to avoid conflict
    path = os.path.join(data, dataset)  # Define the correct dataset path
    
    if dataset == 'test':  # Corrected string comparison
        for img in os.listdir(path):
            img_path = os.path.join(path, img)
            test_images.append(img_path)
        continue  # Skip further execution for test set
    
    for category in categories:
        category_path = os.path.join(path, category)  # Correct training path
        class_idx = categories.index(category)
        for img in os.listdir(category_path):
            img_path = os.path.join(category_path, img)
            all_train_images.append((img_path, class_idx))  # Use the correct list

# Print some sample outputs for verification
print(f"Total training images: {len(all_train_images)}")
print(f"Total test images: {len(test_images)}")
print("Sample train image:", all_train_images[:5])
print("Sample test image:", test_images[:5])


Total training images: 48000
Total test images: 12000
Sample train image: [('D:\\Vista 25\\Dataset\\vista-25\\dataset\\train\\real\\0001.jpg', 0), ('D:\\Vista 25\\Dataset\\vista-25\\dataset\\train\\real\\0002.jpg', 0), ('D:\\Vista 25\\Dataset\\vista-25\\dataset\\train\\real\\0003.jpg', 0), ('D:\\Vista 25\\Dataset\\vista-25\\dataset\\train\\real\\0004.jpg', 0), ('D:\\Vista 25\\Dataset\\vista-25\\dataset\\train\\real\\0005.jpg', 0)]
Sample test image: ['D:\\Vista 25\\Dataset\\vista-25\\dataset\\test\\0.jpg', 'D:\\Vista 25\\Dataset\\vista-25\\dataset\\test\\1.jpg', 'D:\\Vista 25\\Dataset\\vista-25\\dataset\\test\\10.jpg', 'D:\\Vista 25\\Dataset\\vista-25\\dataset\\test\\100.jpg', 'D:\\Vista 25\\Dataset\\vista-25\\dataset\\test\\1000.jpg']


In [9]:
random.shuffle(all_train_images)

In [11]:
from tqdm import tqdm

# Initialize storage
training_data = []
test_data = []
corrupted_train_indices = []
corrupted_test_indices = []

# Function to process images
def process_images(image_list, dataset_name):
    processed_data = []
    corrupted_indices = []

    print(f"\nProcessing {dataset_name} images...\n")
    with tqdm(total=len(image_list), desc=f"Processing {dataset_name}") as progress_bar:
        for idx, item in enumerate(image_list):
            try:
                if dataset_name == "train":
                    img_path, class_idx = item  # Training data has labels
                else:
                    img_path = item  # Test data has no labels
                
                img_array = cv.imread(img_path)  
                
                # Check for corrupted images
                if img_array is None:
                    print(f"Skipping corrupted image: {img_path}")
                    corrupted_indices.append(idx)
                    continue
                
                # Ensure the image has 3 channels (RGB)
                if img_array.shape[-1] != 3:
                    print(f"Skipping non-RGB image: {img_path}")
                    corrupted_indices.append(idx)
                    continue
                
                # Resize and normalize
                new_array = cv.resize(img_array, (img_size, img_size)) / 255.0  

                # Append processed data
                if dataset_name == "train":
                    processed_data.append([new_array, class_idx])
                else:
                    processed_data.append(new_array)

            except Exception as e:
                print(f"Error processing {img_path}: {str(e)}")
                corrupted_indices.append(idx)
                continue

            # Update progress bar
            progress_bar.update(1)

    print(f"\nFinished processing {dataset_name}. Valid images: {len(processed_data)}, Corrupted: {len(corrupted_indices)}\n")
    return processed_data, corrupted_indices

# Process Training Data
training_data, corrupted_train_indices = process_images(all_train_images, "train")

# Process Test Data
test_data, corrupted_test_indices = process_images(test_images, "test")

# Print results
print(f"Final training dataset size: {len(training_data)} images")
print(f"Final test dataset size: {len(test_data)} images")
print(f"Corrupted training images: {corrupted_train_indices}")
print(f"Corrupted test images: {corrupted_test_indices}")


Processing train images...



Processing train:  33%|███████████████████▏                                      | 15903/48000 [11:42<20:48, 25.72it/s]

Skipping corrupted image: D:\Vista 25\Dataset\vista-25\dataset\train\real\11957.jpg


Processing train: 100%|█████████████████████████████████████████████████████████▉| 47999/48000 [36:05<00:00, 22.17it/s]



Finished processing train. Valid images: 47999, Corrupted: 1


Processing test images...



Processing test:  79%|███████████████████████████████████████████████▎            | 9466/12000 [07:22<01:07, 37.50it/s]

Skipping corrupted image: D:\Vista 25\Dataset\vista-25\dataset\test\7711.jpg


Processing test: 100%|██████████████████████████████████████████████████████████▉| 11999/12000 [09:22<00:00, 21.32it/s]


Finished processing test. Valid images: 11999, Corrupted: 1

Final training dataset size: 47999 images
Final test dataset size: 11999 images
Corrupted training images: [15901]
Corrupted test images: [9459]





In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

In [14]:
print("Shape of first image:", training_data[0][0].shape)

Shape of first image: (64, 64, 3)


In [17]:
from sklearn.model_selection import train_test_split
import numpy as np

# Extract features and labels
X = np.array([item[0] for item in training_data], dtype=np.float32)
y = np.array([item[1] for item in training_data], dtype=np.int32)

# Split into training and validation sets (80% train, 20% validation)
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_val shape: {x_val.shape}")
print(f"y_val shape: {y_val.shape}")


x_train shape: (38399, 64, 64, 3)
y_train shape: (38399,)
x_val shape: (9600, 64, 64, 3)
y_val shape: (9600,)


In [352]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [418]:
tf.keras.backend.clear_session()

In [420]:
base_model = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(64,64,3)
    )

In [422]:
    base_model.trainable = False
    
    # Then unfreeze only the last 5 layers
    for layer in base_model.layers[-4:]:
        layer.trainable = True
    
    # Print layer names and their trainable status
    for layer in base_model.layers:
        print(f"{layer.name}: {layer.trainable}")

input_layer: False
block1_conv1: False
block1_conv2: False
block1_pool: False
block2_conv1: False
block2_conv2: False
block2_pool: False
block3_conv1: False
block3_conv2: False
block3_conv3: False
block3_pool: False
block4_conv1: False
block4_conv2: False
block4_conv3: False
block4_pool: False
block5_conv1: True
block5_conv2: True
block5_conv3: True
block5_pool: True


In [426]:
model = tf.keras.Sequential([
    base_model,
    MaxPooling2D(pool_size=(2, 2)),  # Replace GlobalAveragePooling2D with MaxPooling2D
    tf.keras.layers.Flatten(),       # Flatten the output for Dense layers
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

In [428]:
model.summary()

In [430]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)

In [432]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.1,
    brightness_range=[0.8, 1.2],
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    preprocessing_function=lambda x: tf.image.random_contrast(x, lower=0.8, upper=1.2),
    fill_mode='nearest'
)

# Fit the generator on your training data
datagen.fit(x_train)

history = model.fit(
    x_train, y_train,  # Replace with your training data
    batch_size=32,
    epochs=5,
    validation_data=(x_val, y_val),  # Replace with your validation data
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

Epoch 1/5
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 227ms/step - accuracy: 0.7057 - loss: 0.5750 - val_accuracy: 0.7867 - val_loss: 0.4545 - learning_rate: 0.0010
Epoch 2/5
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 257ms/step - accuracy: 0.7846 - loss: 0.4673 - val_accuracy: 0.8081 - val_loss: 0.4241 - learning_rate: 0.0010
Epoch 3/5
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 248ms/step - accuracy: 0.8196 - loss: 0.4135 - val_accuracy: 0.8083 - val_loss: 0.4134 - learning_rate: 0.0010
Epoch 4/5
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 257ms/step - accuracy: 0.8352 - loss: 0.3803 - val_accuracy: 0.8173 - val_loss: 0.4352 - learning_rate: 0.0010
Epoch 5/5
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 253ms/step - accuracy: 0.8505 - loss: 0.3458 - val_accuracy: 0.8165 - val_loss: 0.4337 - learning_rate: 0.0010


In [None]:
model.save("model5.keras")

In [434]:
import numpy as np
import pandas as pd

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict_and_save(model, test_data, corrupted_test_indices, total_images=12000):
    # Get raw predictions 
    predictions = model.predict(np.array(test_data), verbose=1)
    
    # Apply sigmoid
    predicted_classes = sigmoid(predictions)
    
    all_predictions = []
    valid_idx = 0 
    
    # Loop through all possible indices
    for i in range(total_images):
        if i in corrupted_test_indices:
            prediction = 0.5
        else:
            # For valid images, use the prediction from our model
            prediction = predicted_classes[valid_idx]
            valid_idx += 1
        
        all_predictions.append(prediction)
    
    # Create DataFrame with predictions
    df = pd.DataFrame({
        'image_id': [i for i in range(total_images)],
        'label': all_predictions
    })
    
    # Save to CSV
    df.to_csv('test1.csv', index=False)
    print("\nPredictions saved to test1.csv")
    print(f"Total images: {total_images}")
    print(f"Valid predictions: {len(test_data)}")
    print(f"Corrupted images: {len(corrupted_test_indices)}")
    
    # Print summary of corrupted images
    if corrupted_test_indices:
        print("\nCorrupted image indices:", corrupted_test_indices)
        print("These images were marked with prediction value: 0.5")
    
    return df

# Use the function
predictions_df = predict_and_save(model, test_data, corrupted_test_indices)

# Print first few predictions to verify
print("\nFirst few predictions:")
print(predictions_df.head())

[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 142ms/step

Predictions saved to test1.csv
Total images: 12000
Valid predictions: 11999
Corrupted images: 1

Corrupted image indices: [9459]
These images were marked with prediction value: 0.5

First few predictions:
   image_id        label
0         0  [0.6472758]
1         1  [0.5080768]
2         2  [0.5670067]
3         3  [0.6922663]
4         4  [0.7273329]
