In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import random

In [None]:
# Directory that holds the meme image files; the cells below look for
# files named <image_id>.<ext> directly inside it.
images_path = '/home/umera_p/Dataset/training/memes'

def display_images_grid(image_paths, title, nrows=10, ncols=5):
    """Show images in an ``nrows`` x ``ncols`` grid under a common title.

    Args:
        image_paths: Iterable of image file paths. Only the first
            ``nrows * ncols`` entries are drawn.
        title: Figure title.
        nrows: Number of grid rows (default 10).
        ncols: Number of grid columns (default 5).

    Images that cannot be opened are reported and their cell is left blank.
    """
    # 3 inches per cell reproduces the original 15x30 figure for the default grid.
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(3 * ncols, 3 * nrows), squeeze=False
    )
    fig.suptitle(title, fontsize=20)

    cells = axes.flatten()
    # Hide every axis up front so cells without an image do not show empty
    # frames and ticks when fewer than nrows * ncols paths are supplied.
    for ax in cells:
        ax.axis('off')

    for ax, image_path in zip(cells, image_paths):
        try:
            # The context manager closes the underlying file once the pixel
            # data has been handed to matplotlib.
            with Image.open(image_path) as image:
                ax.imshow(image)
        except OSError as exc:
            # Covers missing files as well as unreadable/corrupt images.
            print(f"Skipping {image_path}: {exc}")

    plt.tight_layout()
    plt.subplots_adjust(top=0.95)  # leave room for the suptitle
    plt.show()

PREVIEW_EXTENSIONS = ['jpeg', 'jpg', 'png']
PREVIEW_COUNT = 50


def find_existing_images(image_ids, directory, extensions=PREVIEW_EXTENSIONS):
    """Return, for each ID, the first ``<directory>/<id>.<ext>`` that exists.

    Extensions are tried in the given order; IDs with no matching file are
    skipped, so the result may be shorter than ``image_ids``.
    """
    found = []
    for image_id in image_ids:
        for ext in extensions:
            candidate = os.path.join(directory, f"{image_id}.{ext}")
            if os.path.exists(candidate):
                found.append(candidate)
                break
    return found


# ID ranges used for each language: 110001-112034 and 210001-212010.
spanish_ids = [str(i) for i in range(110001, 112035)]
english_ids = [str(i) for i in range(210001, 212011)]

# Only the first PREVIEW_COUNT IDs of each language are looked up.
spanish_images = find_existing_images(spanish_ids[:PREVIEW_COUNT], images_path)
english_images = find_existing_images(english_ids[:PREVIEW_COUNT], images_path)

display_images_grid(spanish_images, 'First 50 Spanish Images')

display_images_grid(english_images, 'First 50 English Images')

In [None]:
import json  # needed by json.load below; no other cell imports it

annotations_path = '/home/umera_p/Dataset/training/training.json'
# Read the file as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(annotations_path, 'r', encoding='utf-8') as file:
    annotations = json.load(file)

# One row per top-level key of the annotations mapping.
df = pd.DataFrame.from_dict(annotations, orient='index')
display(df.head())

In [None]:
def _majority_yes(entry):
    """Return True when 'YES' votes strictly outnumber 'NO' votes in labels_task4."""
    votes = entry['labels_task4']
    return votes.count('YES') > votes.count('NO')


def _count_by_lang(lang, sexist):
    """Count annotations of language `lang` whose majority outcome equals `sexist`."""
    return sum(
        1
        for entry in annotations.values()
        if entry['lang'] == lang and _majority_yes(entry) is sexist
    )


# Overall totals: ties (YES == NO) fall on the non-sexist side.
sexist_count = sum(1 for entry in annotations.values() if _majority_yes(entry))
non_sexist_count = len(annotations) - sexist_count

# Per-language breakdown.
sexist_count_es = _count_by_lang('es', True)
non_sexist_count_es = _count_by_lang('es', False)
sexist_count_en = _count_by_lang('en', True)
non_sexist_count_en = _count_by_lang('en', False)

print(f"Total number of images: {len(annotations)}")
print(f"Number of sexist images: {sexist_count}")
print(f"Number of sexist images (Spanish): {sexist_count_es}")
print(f"Number of sexist images (English): {sexist_count_en}")
print(f"Number of non-sexist images: {non_sexist_count}")
print(f"Number of non-sexist images (Spanish): {non_sexist_count_es}")
print(f"Number of non-sexist images (English): {non_sexist_count_en}")

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms 

In [None]:
# Image preprocessing pipeline: resize, centre-crop to 224, convert to a
# tensor, then normalise each channel. The mean/std values match the channel
# statistics commonly used with ImageNet-pretrained models.
preprocess_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

extensions = ['.jpeg', '.jpg', '.png']
preprocessed_images = []
labels_list = []

# NOTE(review): later cells read `dataset_path` and `image_labels`, but no cell
# in this notebook defines them (presumably lost with a deleted cell), so a
# fresh "Restart & Run All" fails with NameError. They are reconstructed here:
#   * dataset_path: the loading loop looks for <image_id><ext> directly inside
#     it, which is how the files under `images_path` are laid out.
#   * image_labels: 1 when 'YES' votes outnumber 'NO' votes in labels_task4,
#     else 0 -- the same majority rule used elsewhere in this notebook.
# Assumes annotation keys are the image IDs used in the file names -- TODO confirm.
dataset_path = images_path
image_labels = {
    image_id: int(entry['labels_task4'].count('YES') > entry['labels_task4'].count('NO'))
    for image_id, entry in annotations.items()
}

In [None]:
import torch  # used below; otherwise only imported by a later cell

# Expects `image_labels` (image id -> numeric label), `dataset_path`,
# `extensions` and `preprocess_transform` to be defined by an earlier cell.

# Start from empty lists so re-running this cell does not append duplicates.
preprocessed_images = []
labels_list = []

for image_id, label in image_labels.items():
    image_loaded = False
    for ext in extensions:
        filename = f"{image_id}{ext}"
        image_path = os.path.join(dataset_path, filename)
        if not os.path.exists(image_path):
            continue
        try:
            # Close the file handle as soon as the tensor has been produced.
            with Image.open(image_path) as raw_image:
                preprocessed_image = preprocess_transform(raw_image.convert("RGB"))
        except Exception as e:
            # Best effort: report the failure and try the next extension.
            print(f"Failed to process image {filename}: {e}")
            continue
        preprocessed_images.append(preprocessed_image)
        labels_list.append(label)
        image_loaded = True
        break

    if not image_loaded:
        print(f"Image {image_id} could not be loaded with any known extension.")

# Stack into single tensors; None signals that nothing was loaded.
preprocessed_images_tensor = torch.stack(preprocessed_images) if preprocessed_images else None
encoded_labels_tensor = torch.tensor(labels_list, dtype=torch.long) if labels_list else None

if preprocessed_images_tensor is not None and encoded_labels_tensor is not None:
    dataset = TensorDataset(preprocessed_images_tensor, encoded_labels_tensor)

    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    print("Shape of preprocessed images tensor:", preprocessed_images_tensor.shape)
    print("Shape of encoded labels tensor:", encoded_labels_tensor.shape)
else:
    print("No images were loaded successfully.")

In [None]:
# !pip list
# import sys
# print(sys.executable)
# !pip install clip
# pip install gitpython
# pip install torch torchvision torchaudio
# ! pip install ftfy regex tqdm
# pip install --upgrade clip
# ls "C:\Users\Umera Pasha\Desktop\MASTERS THESIS PREP\CLIP"

In [None]:
import clip
import torch

In [None]:
# Display the model names reported by the installed `clip` package.
clip.available_models()

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with its image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device)

In [None]:
def extract_image_features(image_paths, batch_size=32):
    """Encode images with the global CLIP ``model`` and return the features.

    Args:
        image_paths: Sequence of image file paths.
        batch_size: Number of images encoded per forward pass.

    Returns:
        A NumPy array with one row per entry of ``image_paths``, in the same
        order as ``image_paths``.

    Raises:
        ValueError: If ``image_paths`` is empty.

    Relies on the module-level ``model``, ``preprocess`` and ``device``.
    """
    if len(image_paths) == 0:
        # np.concatenate would fail on an empty list with a less helpful message.
        raise ValueError("image_paths is empty; there are no images to encode")

    image_features = []
    for start in range(0, len(image_paths), batch_size):
        batch_tensors = []
        for path in image_paths[start:start + batch_size]:
            # Close each file as soon as it has been converted to a tensor.
            with Image.open(path) as raw_image:
                batch_tensors.append(preprocess(raw_image.convert('RGB')))
        # Move the whole batch to the device in one transfer.
        batch_images = torch.stack(batch_tensors).to(device)
        with torch.no_grad():  # inference only; no gradients needed
            batch_features = model.encode_image(batch_images)
        image_features.append(batch_features.cpu().numpy())
    return np.concatenate(image_features, axis=0)

def get_all_image_paths(dataset_path, extensions=['.jpeg', '.jpg', '.png']):
    """Recursively collect image file paths below ``dataset_path``.

    Args:
        dataset_path: Root directory to walk.
        extensions: File-name suffixes to accept; matching is
            case-insensitive. The default list is never mutated.

    Returns:
        A sorted list of matching paths. Sorting makes the order, and
        therefore the row order of any features derived from it, deterministic
        rather than dependent on the order in which os.walk lists entries.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    image_paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dataset_path)
        for name in files
        if name.lower().endswith(suffixes)
    ]
    image_paths.sort()
    return image_paths

# Gather every image file below dataset_path (dataset_path must already be
# defined by an earlier cell).
image_paths = get_all_image_paths(dataset_path)

# Encode all collected images; rows follow the order of image_paths.
image_features = extract_image_features(image_paths)

print("Shape of extracted features array:", image_features.shape)

In [None]:
# Display the extracted feature array as the cell output.
image_features

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
def stratified_split(dataset, split_ratios=(0.8, 0.1, 0.1), seed=None):
    """Split annotation keys into train/val/test, stratified by language and label.

    Entries are grouped by ``lang`` ('es' or 'en') and by whether 'YES' votes
    strictly outnumber 'NO' votes in ``labels_task4``. Each group is shuffled
    and cut with the same ratios, and the pieces are merged and shuffled again.
    Entries with any other ``lang`` value are not included in any split.

    Args:
        dataset: Mapping of key -> dict with 'lang' and 'labels_task4'.
        split_ratios: (train, val, test) fractions that sum to 1. Train and
            val sizes are truncated to integers; the remainder of each group
            goes to test.
        seed: Optional seed for a private random generator. When None, the
            global ``random`` module state is used, as before.

    Returns:
        Tuple ``(train_keys, val_keys, test_keys)`` of key lists.
    """
    import math

    # Tolerant comparison: exact float equality can reject valid ratios such
    # as (0.7, 0.2, 0.1), whose plain float sum may not be exactly 1.0.
    assert math.isclose(sum(split_ratios), 1.0), "Split ratios must sum to 1.0"

    rng = random if seed is None else random.Random(seed)

    def is_sexist(entry):
        votes = entry['labels_task4']
        return votes.count('YES') > votes.count('NO')

    def split_list(data_list, split_ratios):
        train_size = int(split_ratios[0] * len(data_list))
        val_size = int(split_ratios[1] * len(data_list))
        train_split = data_list[:train_size]
        val_split = data_list[train_size:train_size + val_size]
        test_split = data_list[train_size + val_size:]
        return train_split, val_split, test_split

    # Stratum order is kept stable so a given RNG state yields a stable split.
    strata = [('es', False), ('en', False), ('es', True), ('en', True)]

    train_keys, val_keys, test_keys = [], [], []
    for lang, sexist in strata:
        members = [
            key for key, value in dataset.items()
            if value['lang'] == lang and is_sexist(value) is sexist
        ]
        rng.shuffle(members)
        train_part, val_part, test_part = split_list(members, split_ratios)
        train_keys += train_part
        val_keys += val_part
        test_keys += test_part

    # Mix the strata so each split is not ordered by group.
    rng.shuffle(train_keys)
    rng.shuffle(val_keys)
    rng.shuffle(test_keys)

    return train_keys, val_keys, test_keys

# Seed Python's RNG so the shuffles inside stratified_split produce the same
# split on every run; the models below already use a fixed random_state.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

train_keys, val_keys, test_keys = stratified_split(annotations)

# Sanity check: no ID may appear in more than one split.
assert len(set(train_keys) | set(val_keys) | set(test_keys)) == (
    len(train_keys) + len(val_keys) + len(test_keys)
), "train/val/test splits overlap"

print(f"Training set size: {len(train_keys)}")
print(f"Validation set size: {len(val_keys)}")
print(f"Known test set size: {len(test_keys)}")

print("\nTraining IDs:", train_keys)
print("\nValidation IDs:", val_keys)
print("\nKnown Test IDs:", test_keys)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random

def get_features_and_labels(keys, feature_array, annotations, id_to_index):
    """Build aligned feature and label arrays for the given keys.

    For every key, the feature row is ``feature_array[id_to_index[key]]`` and
    the label is 1 when 'YES' votes strictly outnumber 'NO' votes in
    ``annotations[key]['labels_task4']``, otherwise 0.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays in the order of ``keys``.
    """
    def majority_label(votes):
        return int(votes.count('YES') > votes.count('NO'))

    # Resolve row and label together, key by key.
    pairs = [
        (feature_array[id_to_index[key]], majority_label(annotations[key]['labels_task4']))
        for key in keys
    ]
    rows = [row for row, _ in pairs]
    targets = [target for _, target in pairs]
    return np.array(rows), np.array(targets)

# Row i of image_features belongs to image_paths[i], so the index must come
# from the file names. Enumerating the annotation keys instead would pair
# features with the wrong labels whenever the directory listing order differs
# from the JSON order.
# Assumes annotation keys equal the image file names without extension
# (e.g. '110001' for 110001.jpeg) -- TODO confirm.
id_to_index = {
    os.path.splitext(os.path.basename(path))[0]: row
    for row, path in enumerate(image_paths)
}

# Fail early with a clear message instead of deep inside the helper.
missing_ids = [key for key in train_keys + val_keys + test_keys if key not in id_to_index]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} annotated IDs have no extracted image features, "
        f"e.g. {missing_ids[:5]}"
    )

train_features, train_labels = get_features_and_labels(train_keys, image_features, annotations, id_to_index)
val_features, val_labels = get_features_and_labels(val_keys, image_features, annotations, id_to_index)
test_features, test_labels = get_features_and_labels(test_keys, image_features, annotations, id_to_index)

# Logistic Regression

In [None]:
# Fit logistic regression on the training features.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    logistic_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with a fixed seed.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    random_forest_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# SVM Model

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    svm_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# Decision Tree Model 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree with a fixed seed.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    decision_tree_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# XGBoost Model

In [None]:
from xgboost import XGBClassifier

# Fit a gradient-boosted tree classifier, evaluated with log loss.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    xgb_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# AdaBoost Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit AdaBoost with 50 estimators and a fixed seed.
adaboost_model = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    adaboost_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# SGD Model

In [None]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained by SGD with hinge loss.
sgd_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)
sgd_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    sgd_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# MLP Model

In [None]:
from sklearn.neural_network import MLPClassifier

# Fit a multi-layer perceptron with one hidden layer of 100 ReLU units.
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', random_state=42)
mlp_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    mlp_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")

# CatBoost Model

In [None]:
from catboost import CatBoostClassifier

# Fit CatBoost: 500 iterations, depth-6 trees, training output silenced.
catboost_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
catboost_model.fit(train_features, train_labels)

# Predict both held-out splits (validation first, then test).
val_predictions, test_predictions = (
    catboost_model.predict(split) for split in (val_features, test_features)
)

val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Report: accuracy and per-class report for each split, then confusion matrices.
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:", classification_report(val_labels, val_predictions), sep="\n")

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:", classification_report(test_labels, test_predictions), sep="\n")

print("Validation Confusion Matrix:", confusion_matrix(val_labels, val_predictions), sep="\n")

print("Test Confusion Matrix:", confusion_matrix(test_labels, test_predictions), sep="\n")