In [None]:
# Install dependencies with the %pip magic so packages land in the environment
# of the running kernel (a bare `!pip` uses whichever pip is first on the shell PATH).
%pip install transformers==4.30.2
%pip install tensorflow==2.12.0
%pip install datasets==2.10.1
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install nltk
%pip install matplotlib
%pip install seaborn
%pip install tqdm
%pip install pillow

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the
# generators below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Standard library, numerics and plotting.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Deep-learning stack and evaluation helpers.
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# NLTK: fetch the named resources before importing the corpus helpers.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm
# Image loading and the Keras building blocks used for the CNN below.
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing import image
from PIL import Image

Set Image Size and Batch Size for the Data Generators:

In [None]:
# Number of images per batch produced by each generator.
BATCH_SIZE = 32
# Target size every image is resized to when loaded; adjust as needed.
IMAGE_SIZE = (128, 128)

Create Data Generators for Training, Testing, and Verification Sets:

In [None]:
# Training-time preprocessing: pixel values are multiplied by 1/255 and each
# image gets random rotation, shifts, shear, zoom and horizontal flips.
augmentation_options = dict(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Evaluation images are not augmented; they only get the same 1/255 rescaling.
test_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
)

In [None]:
# Training batches must come from the training split only.
# The dataset root also contains the 'Test' and 'Validation' folders, so pointing
# flow_from_directory at the root would treat the split folders as classes and
# feed the held-out images into training.
# NOTE(review): assumes the training images live in a 'Train' subfolder next to
# 'Test' and 'Validation' -- confirm the folder name on Drive.
train_generator = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/AGI House Hackathon Collaborate/Smaller Dataset 100 img per class/Train',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary'
)

In [None]:
# Batches for the 'Test' split: rescaled only, served in a fixed order.
test_dir = '/content/drive/MyDrive/AGI House Hackathon Collaborate/Smaller Dataset 100 img per class/Test'
test_generator = test_datagen.flow_from_directory(
    test_dir,
    class_mode='binary',
    batch_size=BATCH_SIZE,
    target_size=IMAGE_SIZE,
    shuffle=False,  # fixed order so predictions can be matched to .classes
)

In [None]:
# Batches for the 'Validation' folder (the "verify" set): rescaled only, fixed order.
verify_dir = '/content/drive/MyDrive/AGI House Hackathon Collaborate/Smaller Dataset 100 img per class/Validation'
verify_generator = test_datagen.flow_from_directory(
    verify_dir,
    class_mode='binary',
    batch_size=BATCH_SIZE,
    target_size=IMAGE_SIZE,
    shuffle=False,  # fixed order so results are comparable between runs
)

Explanation:

train_datagen: Includes data augmentation to prevent overfitting.
test_datagen: Only rescales the images.
flow_from_directory: Automatically assigns labels based on the subdirectory names (Fake and Real), indexed in alphanumeric order.


Check Class Indices:

In [None]:
# Print the folder-name -> label mapping the generator actually inferred,
# instead of hard-coding a "Fake, Real" order in the message.
print(f"Class indices: {train_generator.class_indices}")

Expected Output:
Class indices: {'Fake': 0, 'Real': 1}

4. Building the Image Detection Model

Define the CNN Model:

In [None]:
# Small CNN: three Conv2D + MaxPooling2D stages (32, 64, 128 filters), then a
# dense head with dropout that ends in a single sigmoid unit.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=IMAGE_SIZE + (3,)))
model.add(MaxPooling2D((2, 2)))
for n_filters in (64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

Explanation:

Convolutional Layers: Extract features from images.
Pooling Layers: Reduce spatial dimensions.
Flatten Layer: Flattens the output for the dense layers.
Dense Layers: Perform classification.
Dropout Layer: Prevents overfitting.

Compile the Model:

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked
# for the training curves and the evaluation cells.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Explanation:

Optimizer: Adam optimizer.
Loss Function: Binary crossentropy for binary classification.
Metrics: Accuracy.

5. Training the Model

Set Up Callbacks:

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once val_loss has gone 3 epochs without improving.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')
# Write 'best_model.h5' only on epochs where val_loss improves.
model_checkpoint = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)


Train the Model:

In [None]:
EPOCHS = 10  # Upper bound; early stopping may end training sooner

# steps_per_epoch / validation_steps are deliberately not passed: the directory
# generators know their own length, so Keras runs every batch including the
# final partial one. The previous `samples // BATCH_SIZE` silently dropped that
# last batch (and yields 0 steps when a split has fewer than BATCH_SIZE images);
# for the unshuffled validation data that meant the same trailing images were
# never validated.
# NOTE(review): the Test split drives early stopping and checkpoint selection
# here and is evaluated again later, so its reported accuracy is optimistic;
# the verify set is the untouched hold-out.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=EPOCHS,
    callbacks=[early_stopping, model_checkpoint]
)

Explanation:

Steps per Epoch: Number of batches per epoch.
Validation Steps: Number of batches for validation.
Callbacks: Early stopping and model checkpointing to save the best model.

Plot Training History:

In [None]:
# One figure per metric, each showing the training curve and the validation curve.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(history.history[metric_key])
    ax.plot(history.history[f'val_{metric_key}'])
    ax.set_title(f'Model {metric_label}')
    ax.set_ylabel(metric_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

6. Evaluating the Model

Load the Best Model:

In [None]:
# Restore the weights from the checkpoint file saved by the ModelCheckpoint callback.
best_checkpoint_path = 'best_model.h5'
model.load_weights(best_checkpoint_path)

Evaluate on Test Data:

In [None]:
# Evaluate on every test image. No `steps` argument is passed, so Keras runs the
# generator's full length; `samples // BATCH_SIZE` skipped the final partial
# batch, which for this unshuffled generator is always the tail of the last class.
test_loss, test_accuracy = model.evaluate(test_generator)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Evaluate on Verify Data:

In [None]:
# Evaluate on every verify image. No `steps` argument is passed, so Keras runs
# the generator's full length; `samples // BATCH_SIZE` skipped the final partial
# batch, which for this unshuffled generator is always the tail of the last class.
verify_loss, verify_accuracy = model.evaluate(verify_generator)
print(f'Verify Accuracy: {verify_accuracy * 100:.2f}%')

Generate Classification Report and Confusion Matrix:

In [None]:
# Get true labels and predictions for test data.
# Predict each test image exactly once: without `steps`, Keras runs the
# generator's own length. The previous `samples // BATCH_SIZE + 1` requested one
# batch too many whenever the sample count was an exact multiple of BATCH_SIZE,
# leaving more predictions than labels.
test_generator.reset()
Y_pred = model.predict(test_generator)
# Flatten the (n, 1) sigmoid outputs to 1-D and threshold at 0.5 for 0/1 labels.
y_pred = np.where(Y_pred.ravel() > 0.5, 1, 0)

# shuffle=False on the generator keeps predictions in the same order as .classes.
y_true = test_generator.classes

# Classification Report
print('Classification Report')
target_names = ['Fake', 'Real']
print(classification_report(y_true, y_pred, target_names=target_names))

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - Test Data')
plt.show()

7. Testing the Model

Make Predictions on New Images:

In [None]:
# Function to predict and display result
def predict_image(image_path):
    """Classify one image file with the trained model, print the result and show the image.

    The image is loaded at IMAGE_SIZE, scaled by 1/255 and passed to the model
    as a batch of one. A score above 0.5 is reported as 'Real', otherwise
    'Fake'; the printed confidence is the score for 'Real' and one minus the
    score for 'Fake'.

    Args:
        image_path: Path of the image file to classify.
    """
    loaded_img = image.load_img(image_path, target_size=IMAGE_SIZE)
    scaled_pixels = image.img_to_array(loaded_img) / 255.0
    single_batch = np.expand_dims(scaled_pixels, axis=0)
    score = model.predict(single_batch)[0][0]

    if score > 0.5:
        label = 'Real'
        confidence = score
    else:
        label = 'Fake'
        confidence = 1 - score

    print(f"Predicted Label: {label} with confidence {confidence * 100:.2f}%")
    plt.imshow(loaded_img)
    plt.axis('off')
    plt.show()

Test with an Image from the Verify Set:

In [None]:
# Get a random image from the verify set
import random

verify_root = '/content/drive/MyDrive/AGI House Hackathon Collaborate/Smaller Dataset 100 img per class/Validation'

# Collect every file from the per-class subfolders. The loop iterates over
# class folder names, so os.path.join really builds <root>/<class>; previously
# it was handed absolute paths and only worked because join discards the first
# argument when the second one is absolute.
verify_image_files = []
for class_name in ['Fake', 'Real']:
    class_dir = os.path.join(verify_root, class_name)
    verify_image_files.extend(os.path.join(class_dir, fname) for fname in os.listdir(class_dir))

# Select a random image; the name of its parent folder is the true label.
random_image = random.choice(verify_image_files)
actual_label = os.path.basename(os.path.dirname(random_image))
print(f"Actual Label: {actual_label.capitalize()}")

# Predict and display the image
predict_image(random_image)

Test with Your Own Image:

Upload an image to Colab:

In [None]:
# Open Colab's file picker; `uploaded` is keyed by the uploaded file names
# (the next cell reads the first key).
from google.colab import files
uploaded = files.upload()

The uploaded file will be in the current directory.

Predict the Uploaded Image:

In [None]:
# Fail with a clear message when nothing was uploaded (e.g. the picker was
# cancelled) instead of an opaque IndexError on an empty dict.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose an image.")

# Use the first uploaded file name; the file sits in the current directory.
uploaded_image = next(iter(uploaded))
predict_image(uploaded_image)