## Import House Styles and Setup Paths

In [3]:
import pandas as pd
import io
csv_data = '/Users/tawate/Documents/Architecture_Image_Modeling/data/house_style_list.csv'
df = pd.read_csv(csv_data, header=None)

# Select the first column and rows 2 to end (ignoring the header)
house_styles = df[0][1:].tolist()

DATA_PATH = '/Users/tawate/.cache/kagglehub/datasets/wwymak/architecture-dataset/versions/1/arcDataset/'
TRAINING_DATA_PATH = '/Users/tawate/.cache/kagglehub/datasets/wwymak/architecture-dataset/versions/1/arcDataset/image_class_dset/'

## Total Number of files per house style

In [4]:
import os

def count_files_in_folder(folder_path):
    """
    Counts the number of files in a specific folder.
    
    Args:
        folder_path (str): The path to the folder.
        
    Returns:
        int: The number of files in the folder, or a message if the folder
             is not found.
    """
    try:
        # Get a list of all entries (files and directories) in the folder
        entries = os.listdir(folder_path)
        
        # Count only the files using a list comprehension and os.path.isfile()
        file_count = sum(1 for entry in entries if os.path.isfile(os.path.join(folder_path, entry)))
        
        return file_count
        
    except FileNotFoundError:
        return "Folder not found"
    except Exception as e:
        return f"An error occurred: {e}"

In [7]:
# Count files in each house style folder and print the results
total_count = 0
for style in house_styles:
    file_count = count_files_in_folder(TRAINING_DATA_PATH + 'train/' + style)
    print(f"The number of files in training for '{style}' is: {file_count}")
    total_count += file_count if isinstance(file_count, int) else 0
    
print(f"Total number of files for training: {total_count}")

# Count files in each house style folder and print the results
total_count = 0
for style in house_styles:
    file_count = count_files_in_folder(TRAINING_DATA_PATH + 'test/' + style)
    print(f"The number of files in testing for '{style}' is: {file_count}")
    total_count += file_count if isinstance(file_count, int) else 0
    
print(f"Total number of files for testing: {total_count}")

The number of files in training for 'Cape Cod House Style' is: 50
The number of files in training for 'Romanesque architecture' is: 85
The number of files in training for 'Tudor Revival architecture' is: 123
The number of files in training for 'Edwardian architecture' is: 64
The number of files in training for 'Georgian architecture' is: 119
The number of files in training for 'Gothic House architecture' is: 52
The number of files in training for 'Greek Revival architecture' is: 236
The number of files in training for 'Colonial architecture' is: 133
The number of files in training for 'American craftsman style' is: 146
The number of files in training for 'American Foursquare architecture' is: 52
Total number of files for training: 1060
The number of files in testing for 'Cape Cod House Style' is: 24
The number of files in testing for 'Romanesque architecture' is: 39
The number of files in testing for 'Tudor Revival architecture' is: 55
The number of files in testing for 'Edwardian arch

## Check for Similarity between images within the same class

In [40]:
import os
from PIL import Image
import imagehash
from itertools import combinations

def find_similar_images(folder_path, hash_size=8, threshold=5):
    """
    Finds similar images in a folder using perceptual hashing.

    Args:
        folder_path (str): The path to the folder containing images.
        hash_size (int): The size of the hash. A larger size can be more
                         precise but slower.
        threshold (int): The maximum allowed difference between two hashes
                         to be considered similar. A lower value is stricter.

    Returns:
        A list of tuples, where each tuple contains the paths of two similar images.
    """
    image_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]

    # Dictionary to store hashes: {hash_value: [file_path1, file_path2, ...]}
    hashes = {}

    for image_path in image_files:
        try:
            with Image.open(image_path) as img:
                # Use a perceptual hash algorithm (e.g., dhash, phash)
                # dhash is often a good balance of speed and accuracy
                image_hash = imagehash.dhash(img, hash_size=hash_size)
                
                # Check for exact duplicates first
                if image_hash in hashes:
                    print(f"Exact duplicate found: '{image_path}' and '{hashes[image_hash][0]}'")
                    hashes[image_hash].append(image_path)
                else:
                    hashes[image_hash] = [image_path]

        except Exception as e:
            print(f"Could not process image '{image_path}': {e}")
    
    similar_pairs = []
    # Compare hashes of all unique pairs of images
    hash_list = list(hashes.keys())
    for hash1, hash2 in combinations(hash_list, 2):
        # Calculate the Hamming distance (difference) between the hashes
        hash_difference = hash1 - hash2
        print(hash_difference)
        if hash_difference <= threshold:
            for file1 in hashes[hash1]:
                for file2 in hashes[hash2]:
                    # Make sure not to compare the same file
                    if file1 != file2:
                        similar_pairs.append((file1, file2, hash_difference))

    return similar_pairs

In [43]:
import os
# Assuming the find_similar_images function is defined above this block.
# (from a previous response)

if __name__ == "__main__":
    # Replace with the path to your image folder
    DATA_PATH = DATA_PATH

    # Define the list of subfolders you want to process
    house_styles = house_styles

    if not os.path.isdir(DATA_PATH):
        print(f"Error: The folder '{DATA_PATH}' does not exist.")
        exit()

    # Define the output text file
    output_file = "/Users/tawate/Documents/Architecture_Image_Modeling/data/similar_images.txt"

    # Get a list of all subfolders within DATA_PATH
    all_subfolders = [os.path.join(DATA_PATH, entry) for entry in os.listdir(DATA_PATH)
                      if os.path.isdir(os.path.join(DATA_PATH, entry))]

    # Filter the list to only include folders in house_styles
    filtered_subfolders = [folder for folder in all_subfolders
                           if os.path.basename(folder) in house_styles]

    if not filtered_subfolders:
        print(f"No matching subfolders found in '{DATA_PATH}' based on the 'house_styles' list.")
        exit()

    # Open the text file in write mode
    with open(output_file, 'w') as f:
        print(f"Writing results to '{output_file}'...")
        f.write("--- Image Similarity Report ---\n\n")

        # Iterate through each filtered subfolder and run the similarity check
        for folder in filtered_subfolders:
            folder_name = os.path.basename(folder)
            print(f"\nProcessing folder: {folder_name}")
            f.write(f"--- Folder: {folder_name} ---\n")

            # The find_similar_images function is called on the subfolder
            similar_images = find_similar_images(folder)

            if similar_images:
                f.write("Found Similar Image Pairs:\n")
                for file1, file2, diff in similar_images:
                    # Write the pair and their difference to the file
                    f.write(f"  '{os.path.basename(file1)}' is similar to '{os.path.basename(file2)}' (Difference: {diff})\n")
            else:
                f.write("No similar images found in this folder.\n")
            
            f.write("\n") # Add a blank line between folders for readability

    print(f"\nProcessing complete. All results have been saved to '{output_file}'.")

Writing results to '/Users/tawate/Documents/Architecture_Image_Modeling/data/similar_images.txt'...

Processing folder: Tudor Revival architecture
29
26
33
35
25
37
32
34
26
24
36
24
29
40
33
27
30
32
20
36
34
32
38
32
36
33
29
27
26
30
31
29
40
26
32
32
30
38
37
32
27
35
32
39
27
35
25
34
31
29
41
26
30
20
30
18
25
35
26
29
28
30
28
36
30
26
21
32
30
29
26
20
30
23
31
33
27
29
32
30
33
32
25
25
32
37
41
33
25
36
33
38
29
27
31
33
28
28
30
29
44
32
29
33
30
30
30
40
32
32
20
26
29
28
19
25
36
35
24
33
32
25
26
32
18
41
29
32
33
37
32
30
32
38
34
25
32
41
34
29
28
31
27
31
24
31
34
26
33
31
30
25
22
36
31
26
34
28
29
25
25
23
32
28
30
36
37
33
35
35
29
31
26
29
30
38
33
29
19
37
27
37
23
25
33
30
40
26
29
31
26
32
31
27
37
25
31
35
34
27
30
38
25
32
28
26
38
33
28
34
28
29
33
31
27
31
28
28
39
28
27
33
39
27
31
29
32
29
29
38
29
31
29
28
28
28
24
30
27
33
26
31
28
40
33
28
38
26
32
31
32
41
28
36
28
26
23
31
37
30
33
31
34
34
33
27
33
31
29
29
31
25
36
33
32
30
41
28
27
34
37
36
31
29
3