In [None]:
import os
import pandas as pd

# To download the images, go to https://web.mit.edu/phillipi/Public/states_and_transformations/index.html
# os.listdir returns entries in arbitrary order; sorting keeps every row
# order derived from this listing stable from one run to the next.
file_list = sorted(os.listdir("../MIT_states/images/"))
file_list

In [None]:
# Only entries containing a space can be cut into two parts.
filtered_list = list(filter(lambda name: ' ' in name, file_list))

# Cut each entry at its first space; everything after that space stays together.
split_items = []
for name in filtered_list:
    head, _, tail = name.partition(' ')
    split_items.append([head, tail])

# Two-column table: 'First' is the text before the first space, 'Second' the rest.
df = pd.DataFrame(split_items, columns=['First', 'Second'])

df.head()

In [None]:
# State words grouped into three hand-picked categories.
# NOTE(review): the rest of the notebook reads the keys of the *_similar
# dictionaries rather than these lists - confirm whether they are still needed.
shape = [
    "small", "large", "thick", "thin", "coiled",
    "cracked", "folded", "sliced", "dented", "chipped", "shattered",
    "diced", "straight", "empty", "cut", "pressed", "torn", "broken",
    "tiny", "huge", "melted", "draped", "scratched",
]

color = [
    "caramelized", "painted", "bright", "dark", "unpainted", "engraved", "wet",
    "burnt", "weathered", "rusty", "old", "new", "dry", "verdant",
    "clean", "frozen", "thawed", "ancient", "moldy", "muddy", "fresh",
]

texture = [
    "crushed", "crumpled", "splintered", "wrinkled",
    "ruffled", "mossy", "molten", "windblown", "eroded", "pureed",
    "crinkled", "frayed", "brushed",
    "barren",
]

In [None]:
# Shape states mapped to the words treated as too close in meaning to be
# offered as the "wrong" option for that state.
# 'little', 'massive' and 'slim' are not keys of any dictionary in this
# notebook, so they only ever act as exclusions.
# NOTE(review): the relation is not symmetric - e.g. 'scratched' lists
# 'cracked' but 'cracked' does not list 'scratched'. Confirm this is intended.
shape_similar = {
    "small": ["tiny", "little"],
    "large": ["huge", "massive", "thick"],
    "thick": ["large", "huge", "massive"],
    "thin": ["slim"],
    "coiled": [],
    "cracked": ["broken", "chipped", "shattered", "splintered"],
    "folded": [],
    "sliced": ["diced"],
    "dented": [],
    "chipped": ["broken", "cracked", "shattered", "splintered"],
    "shattered": ["broken", "cracked", "chipped", "splintered"],
    "diced": ["sliced"],
    "straight": [],
    "empty": [],
    "cut": ["torn"],
    "pressed": [],
    "torn": ["cut"],
    "broken": ["cracked", "chipped", "shattered", "splintered"],
    "tiny": ["small", "little"],
    "huge": ["large", "massive", "thick"],
    "melted": [],
    "draped": [],
    "scratched": ["cracked", "chipped", "shattered", "broken", "splintered"],
}

# Color/appearance states mapped to the words treated as too close in meaning
# to be offered as the "wrong" option for that state.
# 'moist' is not a key of any dictionary in this notebook, so it only ever
# acts as an exclusion.
color_similar = {
    "caramelized": [],
    "painted": [],
    "bright": [],
    "dark": [],
    "unpainted": [],
    "engraved": [],
    "wet": ["moist"],
    "burnt": [],
    "weathered": ["rusty", "ancient", "old"],
    "rusty": ["ancient", "weathered", "old"],
    "old": ["ancient", "weathered", "rusty"],
    "new": ["fresh", "clean"],
    "dry": [],
    "verdant": [],
    "clean": ["new", "fresh"],
    "frozen": [],
    "thawed": [],
    "ancient": ["old", "weathered", "rusty"],
    "moldy": [],
    "muddy": [],
    "fresh": ["clean", "new"],
}

# Texture states mapped to the words treated as too close in meaning to be
# offered as the "wrong" option for that state.
# NOTE(review): 'crushed' lists the shape states 'cracked', 'chipped',
# 'shattered' and 'broken', but none of those shape entries lists 'crushed'
# back. Confirm the one-directional exclusion is intended.
texture_similar = {
    "crushed": ["cracked", "chipped", "shattered", "broken", "splintered"],
    "crumpled": ["wrinkled", "crinkled"],
    "splintered": ["cracked", "chipped", "shattered", "broken", "crushed"],
    "wrinkled": ["crumpled", "crinkled"],
    "ruffled": ["frayed"],
    "mossy": [],
    "molten": [],
    "windblown": ["barren"],
    "eroded": [],
    "pureed": [],
    "crinkled": ["crumpled", "wrinkled"],
    "frayed": ["ruffled"],
    "brushed": [],
    "barren": ["windblown"],
}


In [None]:
# Every state word that is a key of one of the three similarity dictionaries
# (color first, then shape, then texture), followed by a de-duplicated copy.
combined_states = [*color_similar, *shape_similar, *texture_similar]
unique_states_list = list(set(combined_states))
unique_states_list

In [None]:
# Keep only the folders whose leading word is one of the known state words.
# .copy() gives an independent frame: a later cell renames this frame's columns
# in place, which pandas can flag with SettingWithCopyWarning on a filtered slice.
df = df[df["First"].isin(unique_states_list)].copy()

In [None]:
# Inspect the rows that survived the state-word filter.
df

In [None]:
import glob

# Switch to the column names used by the rest of the notebook. Assigning the
# result instead of using inplace=True avoids mutating a filtered slice in
# place and keeps this cell safe to re-run.
df = df.rename(columns={"First": "correct_answer", "Second": "noun"})

# Base path
base_path = "../MIT_states/images/"

# Expand the DataFrame: one output row per file found inside each
# "<correct_answer> <noun>" folder.
expanded_rows = []

for correct_answer, noun in zip(df["correct_answer"], df["noun"]):
    folder_path = f"{base_path}{correct_answer} {noun}"
    # glob.escape: the folder name is data, not a pattern, so characters such
    # as '[' or '*' in it must be matched literally.
    # sorted: glob returns matches in arbitrary order; sorting keeps the row
    # order stable between runs.
    image_paths = sorted(glob.glob(f"{glob.escape(folder_path)}/*"))
    for image_path in image_paths:
        expanded_rows.append({
            "correct_answer": correct_answer,
            "noun": noun,
            "image_path": image_path
        })

# Create a new DataFrame from the expanded rows. Naming the columns explicitly
# keeps them present even when no image was found at all.
expanded_df = pd.DataFrame(
    expanded_rows, columns=["correct_answer", "noun", "image_path"]
)

In [None]:
# Inspect the expanded table (one row per file found by glob).
expanded_df

In [None]:
import random
from collections import defaultdict

# Seed the global RNG so the random draws made by later cells are repeatable
# for the same rows in the same order.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

# Single lookup table: state word -> words considered too similar to it.
similar_dict = {**shape_similar, **color_similar, **texture_similar}

# sorted(): iterating a set of strings can yield a different order in every
# interpreter run (string hash randomization), which would change what
# random.choice picks even with a fixed seed.
all_words = sorted(set(shape_similar) | set(color_similar) | set(texture_similar))

# For each word, the words that may never be used as its "incorrect" option:
# the word itself plus everything listed as similar to it.
excluded_by_word = {
    word: {word, *similar_dict[word]} for word in all_words
}

# precomputed_incorrect_answers[noun][correct_answer] -> list of state words
# that (a) are not excluded for correct_answer and (b) have at least one row
# for this noun in expanded_df.
precomputed_incorrect_answers = defaultdict(dict)

for noun, noun_states in expanded_df.groupby('noun', sort=False)['correct_answer']:
    # Collect the noun's states once instead of re-filtering the frame for
    # every (correct_answer, word) pair.
    available_states = set(noun_states)
    for correct_answer in all_words:
        excluded = excluded_by_word[correct_answer]
        precomputed_incorrect_answers[noun][correct_answer] = [
            word for word in all_words
            if word in available_states and word not in excluded
        ]


In [None]:
def generate_prompt(row):
    """Build an "Is this <noun> A or B?" question for one image row.

    Args:
        row: mapping-like object providing 'correct_answer' and 'noun'.

    Returns:
        (prompt, incorrect_answer). The incorrect answer is drawn at random
        from precomputed_incorrect_answers[noun][correct_answer]; when that
        list is missing or empty the literal placeholder "default" is used.
        Which of the two adjectives comes first is also random.
    """
    correct_answer, noun = row['correct_answer'], row['noun']

    # Candidate distractors prepared ahead of time for this noun/state pair.
    candidates = precomputed_incorrect_answers[noun].get(correct_answer, [])
    incorrect_answer = random.choice(candidates) if candidates else "default"

    # Coin flip decides whether the correct adjective is named first.
    if random.random() > 0.5:
        first, second = correct_answer, incorrect_answer
    else:
        first, second = incorrect_answer, correct_answer

    return f"Is this {noun} {first} or {second}?", incorrect_answer

# Run generate_prompt on every row; result_type='expand' spreads the returned
# (prompt, incorrect_answer) pair over two columns.
expanded_df[['clean_prompt', 'incorrect_answer']] = expanded_df.apply(
    generate_prompt, axis=1, result_type='expand'
)

expanded_df

In [None]:
def determine_state(correct_answer):
    """Return the category of a state word.

    The similarity dictionaries are checked in the order shape, color,
    texture, and the first one that has the word as a key wins. A word found
    in none of them is reported as 'unknown'.
    """
    categories = (
        ('shape', shape_similar),
        ('color', color_similar),
        ('texture', texture_similar),
    )
    for label, vocabulary in categories:
        if correct_answer in vocabulary:
            return label
    return 'unknown'


# Tag every row with the category of its correct answer.
expanded_df['state'] = expanded_df['correct_answer'].map(determine_state)


In [None]:
# Peek at the last 10 rows of the table.
expanded_df.tail(10)

In [None]:
# The original image of each row is its "clean" image. Assigning the renamed
# frame (rather than inplace=True) avoids mutating the existing object in place.
expanded_df = expanded_df.rename(columns={"image_path": "clean_image_path"})

In [None]:
def get_incorrect_image_path(row, df):
    """Pick an image of the same noun shown in the row's incorrect state.

    Args:
        row: mapping-like object providing 'incorrect_answer' and 'noun'.
        df: table with 'correct_answer', 'noun' and 'clean_image_path' columns.

    Returns:
        A randomly chosen 'clean_image_path' among the rows of ``df`` whose
        'correct_answer' equals the row's incorrect answer and whose noun
        matches, or None when there is no such row.
    """
    wanted_state = row['incorrect_answer']
    wanted_noun = row['noun']

    same_state = df['correct_answer'] == wanted_state
    same_noun = df['noun'] == wanted_noun
    candidates = df[same_state & same_noun]

    # Nothing to choose from: signal it with None.
    if candidates.empty:
        return None
    return random.choice(candidates['clean_image_path'].tolist())

# Look up a corrupt image for every row; the table itself is handed to the
# helper as its second positional argument.
expanded_df['corrupt_image_path'] = expanded_df.apply(
    get_incorrect_image_path, axis=1, args=(expanded_df,)
)

In [None]:
# Inspect the table after the corrupt image path column has been added.
expanded_df

In [None]:
# Rows whose incorrect answer is the "default" placeholder have no corrupt
# image (None). drop_duplicates treats missing values as equal and would keep
# the first such row, so remove them explicitly before de-duplicating.
# Afterwards every corrupt image and every clean image appears at most once.
expanded_df = (
    expanded_df
    .dropna(subset=["corrupt_image_path"])
    .drop_duplicates(["corrupt_image_path"])
    .drop_duplicates(["clean_image_path"])
)

In [None]:
def generate_corrupt_prompt(prompt, vocabulary=None, similar=None):
    """Rewrite a prompt so that both of its adjectives are replaced.

    The prompt must look like "Is this <noun> <adjective> or <adjective>?".
    Each adjective is swapped for a random vocabulary word that is neither the
    adjective itself nor listed as similar to it. The two replacements are
    kept different from each other whenever the vocabulary allows it.

    Args:
        prompt: the question to rewrite.
        vocabulary: candidate replacement words; defaults to the notebook-level
            ``all_words``.
        similar: mapping word -> list of similar words; defaults to the
            notebook-level ``similar_dict``. Words missing from the mapping
            are treated as having no similar words.

    Returns:
        The rewritten prompt, with the noun unchanged.

    Raises:
        ValueError: if ``prompt`` does not follow the expected format.
        IndexError: if no replacement word is available (raised by
            ``random.choice`` on an empty pool).
    """
    if vocabulary is None:
        vocabulary = all_words
    if similar is None:
        similar = similar_dict

    prefix, suffix = "Is this ", "?"
    if not (prompt.startswith(prefix) and prompt.endswith(suffix)):
        raise ValueError(f"Unrecognized prompt format: {prompt!r}")

    # Parse from the right: the adjectives are the last word on either side of
    # " or ", so whatever precedes them is the noun even when the noun itself
    # contains spaces.
    head, separator, second_adjective = prompt[len(prefix):-len(suffix)].rpartition(" or ")
    noun, space, first_adjective = head.rpartition(" ")
    if not (separator and space and noun and first_adjective and second_adjective):
        raise ValueError(f"Unrecognized prompt format: {prompt!r}")

    def _replacements_for(adjective, also_exclude=()):
        # Vocabulary words that are not the adjective, not similar to it, and
        # not explicitly excluded.
        banned = {adjective, *similar.get(adjective, []), *also_exclude}
        return [word for word in vocabulary if word not in banned]

    # Choose replacements randomly
    new_first_adjective = random.choice(_replacements_for(first_adjective))

    # Prefer a second replacement that differs from the first, so the question
    # never offers the same word twice; fall back to the full pool only if
    # that leaves nothing to choose from.
    second_pool = (
        _replacements_for(second_adjective, also_exclude=(new_first_adjective,))
        or _replacements_for(second_adjective)
    )
    new_second_adjective = random.choice(second_pool)

    return f"Is this {noun} {new_first_adjective} or {new_second_adjective}?"

# Derive one corrupt prompt from every clean prompt.
expanded_df['corrupt_prompt'] = expanded_df['clean_prompt'].map(generate_corrupt_prompt)


In [None]:
# Write the final table to a CSV file (path is relative to the working
# directory); index=False leaves the row index out of the file.
expanded_df.to_csv("mit_states_cleaned.csv", index=False)