In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else \
         ("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

cuda


In [3]:
df_train = pd.read_csv('text_dataset_train.tsv', sep='\t')
df_test = pd.read_csv('text_dataset_test.tsv', sep='\t')
df_val = pd.read_csv('text_dataset_val.tsv', sep='\t')

print("Training set size:", len(df_train))
print("Test set size:", len(df_test))
print("Validation set size:", len(df_val))

Training set size: 43793
Test set size: 5475
Validation set size: 5474


In [4]:
multi_traindata = df_train
multi_validata = df_test
multi_testdata = df_val

In [5]:
train_data = multi_traindata[multi_traindata['clean_title'].notna()]

valid_data = multi_validata[multi_validata['clean_title'].notna()]

test_data = multi_testdata[multi_testdata['clean_title'].notna()]

In [6]:
train_labels, train_images, train_id = train_data['6_way_label'].tolist(), train_data['id'].tolist(), train_data['id'].tolist()
valid_labels, valid_images, valid_id = valid_data['6_way_label'].tolist(), valid_data['id'].tolist(), valid_data['id'].tolist()
test_labels, test_images, test_id = test_data['6_way_label'].tolist(), test_data['id'].tolist(), test_data['id'].tolist()

In [7]:
def add_suffix(images, suffix='.jpg'):
    return ['image_dataset/'+ image + suffix for image in images]

train_images_final = add_suffix(train_images)
valid_images_final = add_suffix(valid_images)
test_images_final = add_suffix(test_images)

In [8]:
from transformers import CLIPTokenizerFast, CLIPProcessor, CLIPModel
import torch

# if you have CUDA or MPS, set it to the active device like this

model_id = "openai/clip-vit-base-patch32"

# we initialize a tokenizer and image processor, and the model itself
tokenizer = CLIPTokenizerFast.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id).to(device)

  from .autonotebook import tqdm as notebook_tqdm


# preparing embeddings

In [9]:
import pandas as pd
from PIL import Image

output_file = "image_embeddings_train.csv"
batch_size = 1

embeddings_df = pd.DataFrame()

for i in range(0, len(train_images_final), batch_size):
    batch_ids = train_id[i:i + batch_size]
    batch_images = train_images_final[i:i + batch_size]
    batch_labels = train_labels[i:i + batch_size]

    temp_df = pd.DataFrame({
        'id': batch_ids,
        'label': batch_labels,
    })

    for image_path in batch_images:
        image = processor(
            text=None,
            images=Image.open(image_path),
            return_tensors='pt'
        )['pixel_values'].to(device)
        img_emb = model.get_image_features(image)
        img_emb = img_emb.detach().cpu().numpy().reshape(1, -1)

        temp_df['image_embedding'] = ['\t'.join(map(str, emb)) for emb in img_emb]

    embeddings_df = pd.concat([embeddings_df, temp_df], ignore_index=True)

# save csv
embeddings_df[['id', 'label', 'image_embedding']].to_csv(output_file, index=False)




In [10]:
import pandas as pd
from PIL import Image

output_file = "image_embeddings_valid.csv"
batch_size = 1

embeddings_df = pd.DataFrame()

for i in range(0, len(valid_images_final), batch_size):
    batch_ids = valid_id[i:i + batch_size]
    batch_images = valid_images_final[i:i + batch_size]
    batch_labels = valid_labels[i:i + batch_size]

    temp_df = pd.DataFrame({
        'id': batch_ids,
        'label': batch_labels,
    })

    for image_path in batch_images:
        image = processor(
            text=None,
            images=Image.open(image_path),
            return_tensors='pt'
        )['pixel_values'].to(device)
        img_emb = model.get_image_features(image)
        img_emb = img_emb.detach().cpu().numpy().reshape(1, -1)

        temp_df['image_embedding'] = ['\t'.join(map(str, emb)) for emb in img_emb]

    embeddings_df = pd.concat([embeddings_df, temp_df], ignore_index=True)

# save csv
embeddings_df[['id', 'label', 'image_embedding']].to_csv(output_file, index=False)




In [11]:
import pandas as pd
from PIL import Image

output_file = "image_embeddings_test.csv"
batch_size = 1

embeddings_df = pd.DataFrame()

for i in range(0, len(test_images_final), batch_size):
    batch_ids = test_id[i:i + batch_size]
    batch_images = test_images_final[i:i + batch_size]
    batch_labels = test_labels[i:i + batch_size]

    temp_df = pd.DataFrame({
        'id': batch_ids,
        'label': batch_labels,
    })

    for image_path in batch_images:
        image = processor(
            text=None,
            images=Image.open(image_path),
            return_tensors='pt'
        )['pixel_values'].to(device)
        img_emb = model.get_image_features(image)
        img_emb = img_emb.detach().cpu().numpy().reshape(1, -1)

        temp_df['image_embedding'] = ['\t'.join(map(str, emb)) for emb in img_emb]

    embeddings_df = pd.concat([embeddings_df, temp_df], ignore_index=True)

# save csv
embeddings_df[['id', 'label', 'image_embedding']].to_csv(output_file, index=False)




In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [13]:
class MyDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
        self.text_embeddings = self.data['image_embedding'].apply(lambda x: list(map(float, x.split('\t')))).tolist()
        self.labels = self.data['label'].tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text_emb = torch.tensor(self.text_embeddings[index])
        label = torch.tensor(self.labels[index])
        return text_emb, label

In [14]:
train_file = 'image_embeddings_train.csv'
val_file = 'image_embeddings_valid.csv'  
test_file = 'image_embeddings_test.csv'  

train_data = MyDataset(train_file)
val_data = MyDataset(val_file) 
test_data = MyDataset(test_file)

# defining the model

In [15]:
class ClassificationModel(nn.Module):
    def __init__(self, input_size):
        super(ClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 6)  
    
    def forward(self, text_emb):
        x = self.fc1(text_emb)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        x = nn.ReLU()(x)
        x = self.fc3(x)
        return x

In [16]:
model = ClassificationModel(input_size=512)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
model.to(device)
criterion.to(device)

CrossEntropyLoss()

# training

In [19]:
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = 0.0
    train_correct = 0.0
    model.train()

    for text_emb, labels in train_loader:
        text_emb = text_emb.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(text_emb)
        _, predicted = torch.max(outputs.data, 1)

        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_correct += (predicted == labels).sum().item()

    train_accuracy = train_correct / len(train_data)
    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    val_correct = 0.0

    with torch.no_grad():
        for text_emb, labels in val_loader:
            text_emb = text_emb.to(device)
            labels = labels.to(device)

            outputs = model(text_emb)
            _, predicted = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels.long())

            val_loss += loss.item()
            val_correct += (predicted == labels).sum().item()

    val_accuracy = val_correct / len(val_data)
    val_loss /= len(val_loader)

    print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

Epoch: 1/10, Train Loss: 0.8271, Train Accuracy: 0.6824, Val Loss: 0.7542, Val Accuracy: 0.7131
Epoch: 2/10, Train Loss: 0.6838, Train Accuracy: 0.7387, Val Loss: 0.7428, Val Accuracy: 0.7202
Epoch: 3/10, Train Loss: 0.6040, Train Accuracy: 0.7687, Val Loss: 0.7551, Val Accuracy: 0.7180
Epoch: 4/10, Train Loss: 0.5203, Train Accuracy: 0.8021, Val Loss: 0.8040, Val Accuracy: 0.7149
Epoch: 5/10, Train Loss: 0.4265, Train Accuracy: 0.8382, Val Loss: 0.9130, Val Accuracy: 0.7098
Epoch: 6/10, Train Loss: 0.3365, Train Accuracy: 0.8719, Val Loss: 1.0477, Val Accuracy: 0.6950
Epoch: 7/10, Train Loss: 0.2590, Train Accuracy: 0.9023, Val Loss: 1.2327, Val Accuracy: 0.6924
Epoch: 8/10, Train Loss: 0.2037, Train Accuracy: 0.9252, Val Loss: 1.4269, Val Accuracy: 0.6924
Epoch: 9/10, Train Loss: 0.1686, Train Accuracy: 0.9390, Val Loss: 1.6439, Val Accuracy: 0.6886
Epoch: 10/10, Train Loss: 0.1407, Train Accuracy: 0.9485, Val Loss: 1.8074, Val Accuracy: 0.6889


# testing

In [20]:
model.eval()
test_correct = 0.0

with torch.no_grad():
    for text_emb, labels in test_loader:
        text_emb = text_emb.to(device)
        labels = labels.to(device)

        outputs = model(text_emb)
        _, predicted = torch.max(outputs.data, 1)

        test_correct += (predicted == labels).sum().item()

test_accuracy = test_correct / len(test_data)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.6790
