
# The goal of this notebook is to anonymize the file names and crop the real data so that it is harder to tell which source each photo is associated with.

In [89]:
import numpy as np
import math
import json
import pandas as pd

def crop_image(image, coordinates, expansion_percent=0):
    """
    Crop a region from an image based on normalized coordinates and optionally expand the bounding box.

    Args:
        image (numpy.ndarray): The input image as a NumPy array whose first two
            axes are (height, width). Any trailing axes (e.g. colour channels)
            are kept untouched, so both 2-D and 3-D arrays are accepted.
        coordinates (list): The normalized coordinates [xmin, ymin, width, height] of the region to crop.
        expansion_percent (float): Fraction of the box's pixel width (e.g. 0.1
            for 10%) added as a margin on every side of the box (default: 0).

    Returns:
        numpy.ndarray: The cropped region, sliced out of ``image``.

    Raises:
        ValueError: If any of the four normalized coordinates is outside [0, 1].

    """

    # Only the two leading axes are needed; this keeps grayscale (2-D) inputs working.
    image_height, image_width = image.shape[:2]

    # Check that the normalized coordinates are valid
    if not (0 <= coordinates[0] <= 1 and 0 <= coordinates[1] <= 1 and 0 <= coordinates[2] <= 1 and 0 <= coordinates[3] <= 1):
        raise ValueError("Invalid normalized coordinates. Expected values in the range [0, 1].")

    # Convert normalized coordinates to pixel coordinates (truncating towards zero)
    xmin = int(coordinates[0] * image_width)
    ymin = int(coordinates[1] * image_height)
    width = int(coordinates[2] * image_width)
    height = int(coordinates[3] * image_height)
    xmax = xmin + width
    ymax = ymin + height

    # Expand the bounding box.
    # NOTE: the same pixel margin, derived from the box *width*, is applied to
    # all four sides (vertical expansion is not scaled by the box height).
    expansion_amount = int(width * expansion_percent)
    xmin -= expansion_amount
    ymin -= expansion_amount
    xmax += expansion_amount
    ymax += expansion_amount

    # Clamp the box to the image boundaries
    xmin = max(0, xmin)
    ymin = max(0, ymin)
    xmax = min(image_width, xmax)
    ymax = min(image_height, ymax)

    # Slice only the two spatial axes so trailing axes (if any) pass through
    return image[ymin:ymax, xmin:xmax]

def crop_bottom(image, crop_height=0.05, crop_width=0.05):
    """
    Trim a strip off the bottom and the right edge of an image.

    The top-left corner is kept; the result is the leading
    ``floor(height - height * crop_height)`` rows and
    ``floor(width - width * crop_width)`` columns of ``image``.

    Args:
        image (numpy.ndarray): Image whose first two axes are (height, width).
        crop_height (float): Fraction of the height to remove from the bottom (default: 0.05).
        crop_width (float): Fraction of the width to remove from the right (default: 0.05).

    Returns:
        numpy.ndarray: The cropped image, sliced out of ``image``.

    Raises:
        ValueError: If the resulting size falls outside the original image
            (i.e. the crop fractions are out of range).
    """
    image_height, image_width = image.shape[:2]
    new_image_height = math.floor(image_height - image_height * crop_height)
    new_image_width = math.floor(image_width - image_width * crop_width)

    if not (0 <= new_image_height <= image_height and 0 <= new_image_width <= image_width):
        raise ValueError("Invalid crop fractions. Expected values in the range [0, 1].")

    # Slice with the integer sizes directly. Converting them to normalized
    # floats and back (k / n * n, then int()) can land just below k and
    # silently drop one pixel row/column.
    return image[:new_image_height, :new_image_width]
    

In [109]:
# Root folder holding every image set and prompt file used below.
base_dir = '/home/azureuser/cloudfiles/code/projects/clir/hackathon51/real_vs_fake/image_data/'

# Real photos and the JSON file with their loss descriptions.
real_file = 'real_image_loss_descriptions_full.json'
real_images, real_json = base_dir + '/curated/', base_dir + real_file

# Fully generated images and the prompts used to create them.
generated_images, generated_json = (
    base_dir + '/test_fake_generated/',
    base_dir + '/generation_prompts.json',
)

# Inpainted images and the prompts used to create them.
inpainted_images, inpainted_json = (
    base_dir + '/test_fake_inpainting/',
    base_dir + '/inpainting_prompts.json',
)

# Destination folder for the anonymized copies.
output_dir = base_dir + '/anonymized_test/'

In [110]:
# Load the prompt metadata for the generated and inpainted images.
# Files are opened with `with` so the handles are closed right after parsing.
with open(generated_json, 'r') as generated_fh:
    generated_df = pd.DataFrame(json.load(generated_fh))
# Identifiers are compared as strings when merging with the file listings.
generated_df['identifier'] = generated_df['identifier'].astype(str)

with open(inpainted_json, 'r') as inpainted_fh:
    inpainted_df = pd.DataFrame(json.load(inpainted_fh))
inpainted_df['identifier'] = inpainted_df['identifier'].astype(str)


In [114]:
# Load the descriptions of the real images; the file is closed as soon as it is parsed.
with open(real_json, 'r') as real_fh:
    real_descriptions = json.load(real_fh)

# The top-level JSON keys become the index and the values the 'prompt' column;
# the index is then exposed as a regular 'identifier' column.
real_df = pd.DataFrame.from_dict(real_descriptions, columns=['prompt'], orient='index')
real_df = real_df.reset_index()
real_df.columns = ['identifier', 'prompt']
# Record which file the description came from, mirroring the 'source' column
# selected later for the combined table.
real_df['source'] = real_file

In [123]:
import glob
from pathlib import Path
from PIL import Image
import tqdm

def generated_name_to_df(generated_files, int_first=False):
    """
    Build a table of image files keyed by the identifier embedded in each file name.

    The identifier is the part of the file name before the first underscore.

    Args:
        generated_files (iterable): File paths to index.
        int_first (bool): If True, round-trip the identifier through ``int`` so
            that leading zeros are dropped (e.g. '002' -> '2') (default: False).

    Returns:
        pandas.DataFrame: One row per file with the columns 'file_path',
        'file_name' and 'identifier' (identifier always stored as ``str``).
        The columns are present even when ``generated_files`` is empty, so a
        later merge on 'identifier' does not fail with a KeyError.

    Raises:
        ValueError: If ``int_first`` is True and an identifier is not an integer.
    """
    columns = ['file_path', 'file_name', 'identifier']
    records = []
    for file_path in generated_files:
        file_name = Path(file_path).name
        identifier = file_name.split('_')[0]
        if int_first:
            identifier = int(identifier)
        records.append({
            'file_path': file_path,
            'file_name': str(file_name),
            'identifier': str(identifier),
        })
    return pd.DataFrame(records, columns=columns)

def crop_resave(final_df, output_dir):
    """
    Re-save every image listed in ``final_df`` under an anonymized name.

    Each image is loaded from its 'file_path', optionally cropped, resized to
    512x512 and written to ``output_dir`` as ``'<row position>_img.png'``.
    Rows whose 'file_type' is 'real' or 'inpainted' have the bottom 5% removed
    first (presumably to drop a footer/watermark strip - confirm).

    Args:
        final_df (pandas.DataFrame): Table with 'file_path' and 'file_type'
            columns. It is modified in place: an 'anon_name' column is added.
        output_dir (str or pathlib.Path): Destination folder; created if missing.

    Returns:
        pandas.DataFrame: The same ``final_df`` object, with 'anon_name' filled in.
    """
    # pathlib is used instead of `os`, which is never imported in this notebook.
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    print(final_df.shape)

    anon_names = []
    # Iterate positionally over the columns so the function does not depend on
    # the frame having a 0..n-1 RangeIndex.
    rows = zip(final_df['file_path'], final_df['file_type'])
    for position, (file_path, file_type) in enumerate(tqdm.tqdm(rows, total=final_df.shape[0])):
        # np.array copies the pixel data, so the file can be closed immediately.
        with Image.open(file_path) as source_image:
            img = np.array(source_image)

        if file_type in ['real', 'inpainted']:
            img = crop_bottom(img, crop_height=0.05, crop_width=0)

        anon_name = f'{position}_img.png'
        im = Image.fromarray(img).resize((512, 512))
        im.save(destination / anon_name)
        anon_names.append(anon_name)

    # Column assignment mutates the caller's frame, as callers rely on.
    final_df['anon_name'] = anon_names
    return final_df
    
# Discover the image files of each kind.
generated_files = glob.glob(generated_images + '/*.png')
inpainted_files = glob.glob(inpainted_images + '/*.png')
real_files = glob.glob(real_images + '/*.jpg')

# Index the files by identifier and tag each table with its image kind.
# Generated / inpainted identifiers are normalised through int (drops leading zeros).
gen_img_to_df = generated_name_to_df(generated_files, int_first=True).assign(file_type='generated')
inpainted_img_to_df = generated_name_to_df(inpainted_files, int_first=True).assign(file_type='inpainted')
real_img_to_df = generated_name_to_df(real_files).assign(file_type='real')

# Attach the prompt metadata to the files (inner join on the identifier).
gen_df2 = pd.merge(generated_df, gen_img_to_df, on='identifier')
inpainted_df2 = pd.merge(inpainted_df, inpainted_img_to_df, on='identifier')
real_df2 = pd.merge(real_df, real_img_to_df, on='identifier')

# Stack the three sources into one table with a fresh 0..n-1 index.
final_df = pd.concat([gen_df2, inpainted_df2, real_df2], axis=0, ignore_index=True)

In [124]:
# Sanity check: (rows, columns) of the generated, inpainted and real tables after merging.
print(*[frame.shape for frame in (gen_df2, inpainted_df2, real_df2)])

(25, 6) (25, 8) (18, 6)


In [125]:
# Keep only the columns shared by all three sources, in a fixed order.
cols = [
    'prompt',
    'source',
    'identifier',
    'file_path',
    'file_name',
    'file_type',
]
final_df = final_df[cols]

In [126]:
# Shuffle the rows so the anonymized numbering does not reveal the source.
# A fixed seed keeps the row -> anon_name mapping identical across re-runs,
# so saved images and the exported tables cannot drift apart.
SHUFFLE_SEED = 0
final_df = final_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
final_df

Unnamed: 0,prompt,source,identifier,file_path,file_name,file_type
0,Collision-front-impact: The front end of the c...,prompts/summary_inpainting1.csv,105,/home/azureuser/cloudfiles/code/projects/clir/...,105_01.png,generated
1,"The car is completely damaged, with a crushed ...",prompts/damages.txt,2,/home/azureuser/cloudfiles/code/projects/clir/...,002_02.png,generated
2,intensely damaged bumper,,49,/home/azureuser/cloudfiles/code/projects/clir/...,049_intensely_damaged_bumper.png,inpainted
3,The car is severely damaged with a smashed fro...,prompts/damages.txt,4,/home/azureuser/cloudfiles/code/projects/clir/...,004_00.png,generated
4,The driver of the vehicle ran a red light and ...,real_image_loss_descriptions_full.json,0354892760101035,/home/azureuser/cloudfiles/code/projects/clir/...,0354892760101035_175780808_frontdrivercorner_C...,real
...,...,...,...,...,...,...
63,terrifically smashed vehicle,,582,/home/azureuser/cloudfiles/code/projects/clir/...,582_terrifically_smashed_vehicle.png,inpainted
64,The vehicle was invloved in a multi-car collis...,real_image_loss_descriptions_full.json,8668873630000001,/home/azureuser/cloudfiles/code/projects/clir/...,8668873630000001___905252218353.jpg,real
65,The vehicle was rear-ended at a red light. The...,real_image_loss_descriptions_full.json,8675484330000001,/home/azureuser/cloudfiles/code/projects/clir/...,8675484330000001___905263690132.jpg,real
66,Tag: Flood damage. Car engine stopped working ...,prompts/summary_inpainting1.csv,116,/home/azureuser/cloudfiles/code/projects/clir/...,116_00.png,generated


In [127]:
# Write the anonymized images and keep the returned table (which carries the
# new 'anon_name' column) explicitly rather than relying on in-place mutation.
final_df = crop_resave(final_df, output_dir)
final_df

(68, 6)


100%|██████████| 68/68 [00:14<00:00,  4.80it/s]


Unnamed: 0,prompt,source,identifier,file_path,file_name,file_type,anon_name
0,Collision-front-impact: The front end of the c...,prompts/summary_inpainting1.csv,105,/home/azureuser/cloudfiles/code/projects/clir/...,105_01.png,generated,0_img.png
1,"The car is completely damaged, with a crushed ...",prompts/damages.txt,2,/home/azureuser/cloudfiles/code/projects/clir/...,002_02.png,generated,1_img.png
2,intensely damaged bumper,,49,/home/azureuser/cloudfiles/code/projects/clir/...,049_intensely_damaged_bumper.png,inpainted,2_img.png
3,The car is severely damaged with a smashed fro...,prompts/damages.txt,4,/home/azureuser/cloudfiles/code/projects/clir/...,004_00.png,generated,3_img.png
4,The driver of the vehicle ran a red light and ...,real_image_loss_descriptions_full.json,0354892760101035,/home/azureuser/cloudfiles/code/projects/clir/...,0354892760101035_175780808_frontdrivercorner_C...,real,4_img.png
...,...,...,...,...,...,...,...
63,terrifically smashed vehicle,,582,/home/azureuser/cloudfiles/code/projects/clir/...,582_terrifically_smashed_vehicle.png,inpainted,63_img.png
64,The vehicle was invloved in a multi-car collis...,real_image_loss_descriptions_full.json,8668873630000001,/home/azureuser/cloudfiles/code/projects/clir/...,8668873630000001___905252218353.jpg,real,64_img.png
65,The vehicle was rear-ended at a red light. The...,real_image_loss_descriptions_full.json,8675484330000001,/home/azureuser/cloudfiles/code/projects/clir/...,8675484330000001___905263690132.jpg,real,65_img.png
66,Tag: Flood damage. Car engine stopped working ...,prompts/summary_inpainting1.csv,116,/home/azureuser/cloudfiles/code/projects/clir/...,116_00.png,generated,66_img.png


In [133]:
# Both tables are written next to (one level above) the anonymized image folder.
full_table_path = output_dir + '/../anonymized_table_full.csv'
partial_table_path = output_dir + '/../anonymized_table_partial.csv'

# Full table: every column, including the original file paths and file types.
final_df.to_csv(full_table_path)
# Partial table: only the prompt and the anonymized file name.
final_df[['prompt', 'anon_name']].to_csv(partial_table_path)
