In [1]:
import os
import random

import cv2
import pandas as pd
from datasets import load_dataset
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Images packaged in IRFL_images.zip; load_dataset exposes them under the 'train' split.
IRFL_images = load_dataset(
    path="lampent/IRFL",
    data_files='IRFL_images.zip',
)['train']

In [4]:
# Load the 'metaphors-dataset' configuration and preview its first rows as a table.
IRFL_metaphors_dataset = load_dataset(
    path="lampent/IRFL",
    name='metaphors-dataset',
)['dataset']
pd.DataFrame(data=IRFL_metaphors_dataset).head(n=5)

Unnamed: 0,phrase,figurative_type,source,uuid,category,theme
0,a lion on the battlefield,metaphor,https://www.israelhayom.com/2022/03/01/ukraini...,1009925977381951166573538219201192200312184397...,Figurative,
1,a lion on the battlefield,metaphor,https://www.army.mil/article/260659/soldier_fo...,1110158887118147260462183682336548897929003447...,Figurative,
2,a lion on the battlefield,metaphor,https://www.spokesman.com/stories/2012/aug/03/...,1150022408148171978092024654350648179128902573...,Figurative,
3,a lion on the battlefield,metaphor,https://www.theguardian.com/books/2016/feb/26/...,4334616426942719694991676075800911426738633635...,Figurative,
4,a lion on the battlefield,metaphor,https://www.polygon.com/22396791/battlefield-6...,6641158141547602068103775945956449252322278422...,Figurative,


In [5]:
# functions for local run borrowed from https://colab.research.google.com/drive/1RfcUhBTHvREx5X7TMY5UAgMYX8NMKy7u?usp=sharing#scrollTo=EGj78A5X8PUp 

def get_image_path_from_folder(image_name, image_folder_path=r'C:\devel\IRLM\assets\D_images'):
    '''
    Build the path of the '.jpeg' file for an image inside a local image folder.

    Arguments:
        image_name: image identifier, with or without a file extension; everything
            from the first '.' onwards is replaced by '.jpeg'. Non-string
            identifiers are converted with str().
        image_folder_path: folder that holds the images. The default is the folder
            that used to be hard-coded here; pass your own folder on another machine.

    Returns:
        The joined path as a string. The file is not checked for existence.
    '''
    # str() mirrors get_image_path_from_hugginface_cache, so numeric ids work too.
    image_stem = str(image_name).split('.')[0]
    # os.path.join uses the separator of the running platform ('\\' on Windows,
    # which is what the hard-coded version produced).
    return os.path.join(image_folder_path, image_stem + '.jpeg')

def get_image_path_from_hugginface_cache(image_name):
    '''
    Build the path of the '.jpeg' file for an image next to the first cached IRFL image.

    The directory is taken from the `filename` of the first entry of the global
    `IRFL_images`; this assumes every image was extracted into that same
    directory -- TODO confirm for other cache layouts.

    Arguments:
        image_name: image identifier, with or without a file extension; everything
            from the first '.' onwards is replaced by '.jpeg'.

    Returns:
        The path as a string. The file is not checked for existence.
    '''
    cached_image_path = IRFL_images[0]['image'].filename
    # Take the directory with os.path instead of split('/') + str.replace:
    # splitting on '/' finds no file name in a Windows path, and replace() would
    # also rewrite a directory component that happens to contain the file name.
    image_directory = os.path.dirname(cached_image_path)
    return os.path.join(image_directory, str(image_name).split('.')[0] + '.jpeg')

def get_image(image_name):
    '''
    Open the cached file of `image_name` with PIL and return the resulting image.

    The path comes from get_image_path_from_hugginface_cache.
    '''
    return Image.open(get_image_path_from_hugginface_cache(image_name))

# One dataset per figurative-language task; each configuration keeps its rows
# under the 'dataset' split.
IRFL_idioms_dataset = load_dataset(
    path="lampent/IRFL", name='idioms-dataset'
)['dataset']
IRFL_metaphors_dataset = load_dataset(
    path="lampent/IRFL", name='metaphors-dataset'
)['dataset']
IRFL_similes_dataset = load_dataset(
    path="lampent/IRFL", name='similes-dataset'
)['dataset']

In [None]:
def get_data(task_name, idioms=False):
    '''
    Function that creates dictionaries of the phrase, the image, and their corresponding label in order to use them for
    creating the csv dataset files suitable for training VisualBERT.

    Rows whose category is None are skipped. For the idiom dataset, rows labelled
    'No Category' are relabelled 'Literal' when their literal candidate flag is True
    and 'Figurative' otherwise. For every other dataset the category is copied as is.

    Arguments:
        task_name: the task dataset itself (not a name); it is indexed by column and
            must provide 'uuid', 'category' and 'phrase', plus 'literal_candidate'
            when idioms is True
        idioms: boolean argument for creating the idiom dataset

    Returns:
        A list of {'phrase', 'image', 'category'} dictionaries, one per kept row,
        in the original row order.
    '''
    images = task_name['uuid']
    categories = task_name['category']
    phrases = task_name['phrase']
    # Only the idiom dataset is asked for literal candidates; they are needed
    # for the 'No Category' category.
    literal_candidates = task_name['literal_candidate'] if idioms else None

    dictionary_list = []
    for idx, (phrase, image, label) in enumerate(zip(phrases, images, categories)):
        if label is None:
            continue
        # Guarding on `idioms` keeps non-idiom data from touching the literal
        # candidates, which previously raised UnboundLocalError on such rows.
        if idioms and label == 'No Category':
            # `== True` (not truthiness) is kept on purpose so that non-boolean
            # flags such as the string 'False' are still treated as not literal.
            label = 'Literal' if literal_candidates[idx] == True else 'Figurative'
        dictionary_list.append({'phrase': phrase, 'image': image, 'category': label})

    return dictionary_list

# Flatten each task dataset into phrase/image/category records; only the idiom
# dataset needs the literal-candidate relabelling.
metaphor_data = get_data(task_name=IRFL_metaphors_dataset)
idiom_data = get_data(task_name=IRFL_idioms_dataset, idioms=True)
simile_data = get_data(task_name=IRFL_similes_dataset)

In [None]:
def create_data_table(dictionary_list, data_kind):
    '''
    Write the records to 'data/<data_kind>.csv' with the columns phrase, image, category.

    Arguments:
        dictionary_list: list of dictionaries holding 'phrase', 'image' and 'category'
        data_kind: file name (without extension) of the csv file to create
    '''
    column_names = ["phrase", "image", "category"]

    df = pd.DataFrame(dictionary_list, columns=column_names)
    # to_csv does not create missing folders, so make sure 'data' exists first.
    os.makedirs('data', exist_ok=True)
    filepath = 'data/' + data_kind + '.csv'
    df.to_csv(filepath, index=False)

# One csv file per task, written to data/<data_kind>.csv.
create_data_table(dictionary_list=metaphor_data, data_kind="metaphor_data")
create_data_table(dictionary_list=idiom_data, data_kind="idiom_data")
create_data_table(dictionary_list=simile_data, data_kind="simile_data")


In [9]:
def add_random_images(csv_file_target, csv_file_source, path_to_file, n_images=5, seed=None):
    '''
    Append 'Random' rows to a task table and write the result to 'data/<path_to_file>.csv'.

    For every distinct phrase of the target table, images are sampled from the
    'image' column of the source table and added as rows with category 'Random'.

    Arguments:
        csv_file_target: csv file whose rows are kept and whose phrases get random images
        csv_file_source: csv file whose 'image' column is sampled from
        path_to_file: file name (without extension) of the csv file to create
        n_images: number of random images per phrase (non-negative); capped at the
            number of available source images
        seed: optional seed for a private random generator, which makes the output
            reproducible; with None the global `random` module state is used
    '''
    target = pd.read_csv(csv_file_target)
    source = pd.read_csv(csv_file_source)

    images = source['image'].to_list()
    # dict.fromkeys removes duplicates but keeps first-seen order; a set would
    # give a different phrase order from run to run.
    unique_phrases = list(dict.fromkeys(target['phrase'].to_list()))

    rng = random if seed is None else random.Random(seed)
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(n_images, len(images))

    dictionary_list = [
        {'phrase': phrase, 'image': image, 'category': 'Random'}
        for phrase in unique_phrases
        for image in rng.sample(images, sample_size)
    ]

    if dictionary_list:
        new_df = pd.concat([target, pd.DataFrame(dictionary_list)], ignore_index=True)
    else:
        # Nothing to add: write the target rows unchanged.
        new_df = target

    # to_csv does not create missing folders, so make sure 'data' exists first.
    os.makedirs('data', exist_ok=True)
    filepath = 'data/' + path_to_file + '.csv'
    new_df.to_csv(filepath, index=False)

# Each task table receives random images drawn from the table of a different task.
add_random_images(
    csv_file_target='data/idiom_data.csv',
    csv_file_source='data/simile_data.csv',
    path_to_file='random_idiom_data',
)
add_random_images(
    csv_file_target='data/simile_data.csv',
    csv_file_source='data/metaphor_data.csv',
    path_to_file='random_simile_data',
)
# NOTE(review): none of the cells visible here writes 'data/idiom_data_filtered.csv'
# (only 'data/idiom_data.csv' and 'data/random_idiom_data_filtered.csv' are written)
# -- confirm the file exists, otherwise this call fails on a fresh run.
add_random_images(
    csv_file_target='data/metaphor_data.csv',
    csv_file_source='data/idiom_data_filtered.csv',
    path_to_file='random_metaphor_data',
)

In [12]:
# Remove the idiom rows whose image the dataloader could not open, and save the
# remaining rows to 'data/random_idiom_data_filtered.csv'.

df = pd.read_csv('data/random_idiom_data.csv')
images = df['image']

# Collect the row labels first and drop them in one go, instead of dropping
# rows from the frame while iterating over it.
unreadable_rows = []
for index, image_name in images.items():
    img_path = get_image_path_from_hugginface_cache(image_name)
    try:
        # The check is on None because that is how an unopenable file shows up here.
        readable = cv2.imread(img_path) is not None
    except Exception as e:
        # An image that makes the reader raise cannot be used either, so the row
        # is dropped as well (it used to be reported but kept).
        print(f"Error: {e}")
        readable = False
    else:
        if not readable:
            print(f"Error: Unable to open image: {img_path}")
    if not readable:
        unreadable_rows.append(index)

df = df.drop(index=unreadable_rows)
df.to_csv('data/random_idiom_data_filtered.csv', index=False)

Error: Unable to open image: /home/gusfilvi@GU.GU.SE/.cache/huggingface/datasets/downloads/extracted/4cab8a0ee45e0254f30c474e03c078b67fa082cc3b8d33b54999068424dc5db9/images/59184347145905525702013901366754898821749091427836426068064546156198032288188.jpeg


In [14]:
# Preview the first rows of the exported metaphor table.
df = pd.read_csv(filepath_or_buffer='data/metaphor_data.csv')
df.head(n=5)

Unnamed: 0,phrase,image,category
0,a lion on the battlefield,1009925977381951166573538219201192200312184397...,Figurative
1,a lion on the battlefield,1110158887118147260462183682336548897929003447...,Figurative
2,a lion on the battlefield,1150022408148171978092024654350648179128902573...,Figurative
3,a lion on the battlefield,4334616426942719694991676075800911426738633635...,Figurative
4,a lion on the battlefield,6641158141547602068103775945956449252322278422...,Figurative
