In [1]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import img_to_array

In [2]:
def extract_frames(video_path, frame_limit=30, resize=(224, 224)):
    video = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while video.isOpened() and count < frame_limit:
        ret, frame = video.read()
        if not ret:
            break
        frame = cv2.resize(frame, resize)
        frame = img_to_array(frame)
        frame = frame / 255.0
        frames.append(frame)
        count += 1
    video.release()
    return np.array(frames)

In [3]:
def build_model(input_shape):
    cnn_base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    
    cnn_model = Sequential()
    cnn_model.add(cnn_base)
    cnn_model.add(Flatten())
    
    model = Sequential()
    model.add(TimeDistributed(cnn_model, input_shape=input_shape))
    model.add(LSTM(256, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [4]:
# build_model.summary()

In [4]:
video_path = r'C:\Users\Suyash Tambe\Desktop\Deepfakevideo\train dataset\eqjscdagiv.mp4'
frames = extract_frames(video_path, frame_limit=100)
X_input = np.expand_dims(frames, axis=0)

In [5]:
input_shape = (30, 224, 224, 3)

model = build_model(input_shape)

prediction = model.predict(X_input)

print(prediction[0][0])

  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33s/step
0.54749846


In [6]:
threshold = 0.7

if prediction[0][0] > threshold:
    print("The video is a DEEP FAKE.")                               
if prediction[0][0] < threshold:
    print("The video is RIYAL.")

The image is RIYAL.


**Cogvlm**

In [1]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Flatten, Dropout, Input, Concatenate
from tensorflow.keras.models import Model, Sequential  # Import Sequential
from tensorflow.keras.preprocessing.image import img_to_array
from transformers import TFBertModel, BertTokenizer  # For language part

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# Load tokenizer and BERT model for language processing (COG-VLM component)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [6]:
# Function to extract frames from the video and preprocess them
def extract_frames(video_path, frame_limit=30, resize=(224, 224)):
    video = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while video.isOpened() and count < frame_limit:
        ret, frame = video.read()
        if not ret:
            break
        frame = cv2.resize(frame, resize)
        frame = img_to_array(frame)
        frame = frame / 255.0
        frames.append(frame)
        count += 1
    video.release()
    return np.array(frames)

In [7]:
# Function to preprocess the language input (e.g., description or prompt)
def preprocess_language_input(text, max_length=30):
    inputs = tokenizer(text, return_tensors="tf", max_length=max_length, truncation=True, padding='max_length')
    return inputs['input_ids'], inputs['attention_mask']


In [8]:
# Build the vision model using VGG16 and LSTM (for temporal sequence)
def build_cogvlm_model(input_shape, text_input_shape, bert_model):
    # CNN Model for visual feature extraction
    cnn_base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    cnn_model = Sequential()  # Use Sequential for the CNN model
    cnn_model.add(cnn_base)
    cnn_model.add(Flatten())

    # Video input (image sequences)
    video_input = Input(shape=input_shape, name="video_input")
    video_features = TimeDistributed(cnn_model)(video_input)
    video_features = LSTM(256, return_sequences=False)(video_features)
    video_features = Dropout(0.5)(video_features)

    # Language input (text, using BERT)
    text_input_ids = Input(shape=text_input_shape, name="text_input_ids", dtype='int32')
    text_attention_mask = Input(shape=text_input_shape, name="text_attention_mask", dtype='int32')
    
    # Convert Keras tensors to TensorFlow tensors
    text_input_ids_tensor = tf.convert_to_tensor(text_input_ids)
    text_attention_mask_tensor = tf.convert_to_tensor(text_attention_mask)

    # BERT model for processing text input
    text_features = bert_model(input_ids=text_input_ids_tensor, attention_mask=text_attention_mask_tensor)[1]  # Get the pooled output (CLS token)

    # Combine video and text features
    combined_features = Concatenate()([video_features, text_features])

    # Final output layer (binary classification)
    output = Dense(1, activation='sigmoid')(combined_features)

    # Define the full model
    model = Model(inputs=[video_input, text_input_ids, text_attention_mask], outputs=output)
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [9]:
# Example usage with a video and a text prompt
video_path = r'C:\Users\Suyash Tambe\Desktop\Deepfakevideo\train dataset\eqjscdagiv.mp4'
frames = extract_frames(video_path, frame_limit=100)

# Language description (replace with the actual text context for your video)
text_description = "This is a deepfake video example."

In [10]:
# Preprocess language input
input_ids, attention_mask = preprocess_language_input(text_description, max_length=30)

# Expand dimensions for batch size
X_input_video = np.expand_dims(frames, axis=0)
input_ids = np.expand_dims(input_ids, axis=0)
attention_mask = np.expand_dims(attention_mask, axis=0)

# Build the COG-VLM model
input_shape = (100, 224, 224, 3)  # For 100 frames, 224x224 image size, 3 channels
text_input_shape = (30,)  # Based on BERT input max length

model = build_cogvlm_model(input_shape, text_input_shape, bert_model)

# Make predictions on video + text
prediction = model.predict([X_input_video, input_ids, attention_mask])

# Print the prediction result
print(f"Prediction: {prediction[0][0]}")

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
