# In [3]:
import cv2
import torch
import torch.nn as nn
import clip
from torchvision import transforms
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CLIP ViT-B/32 backbone (weights as shipped by `clip.load`) together
# with its preprocessing callable, on GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)
# NOTE(review): this tokenizer is not referenced anywhere else in the visible
# code (text is tokenized with `clip.tokenize`), and it is the patch16
# tokenizer while the model above is ViT-B/32 -- confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch16")

# Define the classification heads
num_image_classes = 2  # Replace with the number of image classes in your dataset
num_text_classes = 2   # Replace with the number of text classes in your dataset

# Each head is a single linear layer over a 512-dimensional feature vector.
# `map_location=device` lets checkpoints that were saved on a GPU be loaded on
# a CPU-only machine (and vice versa); without it torch.load tries to restore
# tensors onto the device they were saved from.
image_classifier_head = nn.Linear(512, num_image_classes).to(device)
image_classifier_head.load_state_dict(
    torch.load('nature_model.pt', map_location=device)
)
image_classifier_head.eval()

text_classifier_head = nn.Linear(512, num_text_classes).to(device)
text_classifier_head.load_state_dict(
    torch.load('text_model.pt', map_location=device)
)
text_classifier_head.eval()

# Lookup tables from predicted class index to a human-readable label.
# The position of each label in the tuple is its class index.
image_class_mapping = dict(enumerate((
    "Islamophobic Image",
    "Non Islamophobic Image",
)))

text_class_mapping = dict(enumerate((
    "Islamophobic Text",
    "Non Islamophobic Text",
)))

# Image pipeline: convert to a tensor, then normalize each of the three
# channels with mean 0.5 and std 0.5.
# NOTE(review): the `preprocess` callable returned by `clip.load` is not used;
# confirm this hand-written normalization matches how the heads were trained.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

# Function to predict the class of an image
def predict_image_class(image_path):
    """Classify the image stored at ``image_path`` and return its label.

    The image is read with OpenCV, resized to 224x224, passed through the
    module-level ``transform``, encoded with ``model.encode_image`` and scored
    by ``image_classifier_head``.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        The entry of ``image_class_mapping`` for the highest-scoring class.

    Raises:
        OSError: If OpenCV cannot read an image from ``image_path`` (missing
            file, unreadable file, or unsupported/corrupt image data).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cv2.resize.
        raise OSError(f"Could not read an image from {image_path!r}")

    # NOTE(review): cv2.imread yields channels in BGR order and no BGR->RGB
    # conversion is applied before encoding -- confirm this matches the
    # pipeline the classifier head was trained with.
    image = cv2.resize(image, (224, 224))  # Resize image to fit CLIP model input shape
    image = transform(image).unsqueeze(0).to(device)  # Convert image to tensor and add batch dimension

    # Pure inference: keep the head inside no_grad as well so that no autograd
    # graph is built for its parameters.
    with torch.no_grad():
        image_features = model.encode_image(image)
        # Match the head's weight dtype so the linear layer accepts the features.
        image_features = image_features.to(image_classifier_head.weight.dtype)
        logits = image_classifier_head(image_features)
        probabilities = nn.functional.softmax(logits, dim=1)

    predicted_class_index = torch.argmax(probabilities, dim=1).item()
    return image_class_mapping[predicted_class_index]

# Function to predict the class of a text
def predict_text_class(text):
    """Classify ``text`` and return its label.

    The text is tokenized with ``clip.tokenize``, encoded with
    ``model.encode_text`` and scored by ``text_classifier_head``.

    Args:
        text: The string to classify.

    Returns:
        The entry of ``text_class_mapping`` for the highest-scoring class.
    """
    text_input = clip.tokenize([text]).to(device)

    # Pure inference: keep the head inside no_grad as well so that no autograd
    # graph is built for its parameters.
    with torch.no_grad():
        text_features = model.encode_text(text_input)
        # Cast to the head's weight dtype, mirroring predict_image_class.
        # Without this, half-precision features from the CLIP model hit a
        # float32 linear layer and the matmul raises a dtype-mismatch error.
        text_features = text_features.to(text_classifier_head.weight.dtype)
        logits = text_classifier_head(text_features)
        probabilities = nn.functional.softmax(logits, dim=1)

    predicted_class_index = torch.argmax(probabilities, dim=1).item()
    return text_class_mapping[predicted_class_index]

# Smoke-test both predictors on one sample image and one sample sentence.
image_path = 'image.jpg'  # Replace with the path to your test image
input_text = "This is a test text for evaluation."  # Replace with your test text

# Image first, then text, matching the order of the printed report below.
predicted_image_class = predict_image_class(image_path)
predicted_text_class = predict_text_class(input_text)

print(
    f"Predicted Image Class: {predicted_image_class}",
    f"Predicted Text Class: {predicted_text_class}",
    sep="\n",
)


# Output:
# Predicted Image Class: Non Islamophobic Image
# Predicted Text Class: Non Islamophobic Text
