In [1]:
import numpy as np
from PIL import Image
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertConfig, BertModel
from transformers import CLIPProcessor, CLIPModel
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

In [11]:
import requests
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

In [10]:
image = Image.open('./data/data/' + str(1) + '.jpg')
image = image.resize((224,224),Image.ANTIALIAS)
image = np.asarray(image, dtype = 'float32')
inputs = processor(text=["That is positive", "That is negative", "That is neutral"], images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
print(probs)

tensor([[0.2994, 0.6470, 0.0536]], grad_fn=<SoftmaxBackward>)


In [14]:
images = []
descriptions = []
emotions = []
emotion_dic = {"positive":0, "negative":1, "neutral":2}
e_dataframe = pd.read_csv("./data/train.txt")
pre_trained = 'bert-base-uncased'
token = BertTokenizer.from_pretrained(pre_trained)

for i in range(e_dataframe.shape[0]):
    try:
        idx = e_dataframe.iloc[i]['guid']
        emotion = e_dataframe.iloc[i]['tag']
        image = Image.open('./data/data/' + str(idx) + '.jpg')
        with open('./data/data/' + str(idx) + '.txt', encoding='gbk') as fp:
            description = fp.read()
        images.append(image)
        descriptions.append(description)
        emotions.append(emotion_dic[emotion])
    except:
        continue

In [37]:
correct_num = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
for i in range(len(emotions)):
    img = images[i]
    inputs = processor(text=["That is positive", "That is negative", "That is neutral"], images=img, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image 
    probs = logits_per_image.softmax(dim=1) 
    correct_num += int(np.argmax(probs.tolist())==emotions[i])

In [38]:
print(correct_num/len(emotions))

0.4216054013503376


In [61]:
correct_num = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
for i in range(len(emotions)):
    img = images[i]
    txt = descriptions[i][:35]
    inputs = processor(text=["it's nice", "it's sad", "it's ok"], images=img, return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image 
    probs = logits_per_image.softmax(dim=1) 
    correct_num += int(np.argmax(probs.tolist())==emotions[i])

In [62]:
print(correct_num/len(emotions))

0.5731432858214554
