investigate the nature of text prompts of different text-to-image datasets

- diffusiondb
- pixart
- laion
- pd12m

check if the prompts 
- are None
- contain special characters

from that, we will devise proper preprocessing steps for each dataset.

In [2]:
from datasets import load_dataset
import pandas as pd

## 1. diffusiondb

In [3]:
# diffusiondb
df = pd.read_csv('../../../data/processed/text_prompts/shorten_csvs/diffusiondb.csv')
print(df.shape)
pd.set_option('display.max_colwidth', None)
df.head()

(1819808, 2)


Unnamed: 0,image_name,prompt
0,3ccdc650-871a-4ad9-9bf2-dc475b83ed32.webp,"beautiful porcelain ivory fair face woman biomechanical cyborg, close - up, sharp focus, studio light, iris van herpen haute couture headdress made of rhizomorphs, daisies, brackets, colorful corals, fractal mushrooms, puffballs, octane render, ultra sharp, 8 k"
1,1f1fcb70-63a4-40b1-ada9-2c15fb2ca10a.webp,"complex 3 d render hyper detailed ultra sharp futuristic beautiful biomechanical humanoid woman with porcelain ivory face, medium shot portrait, close - up, filigree lace, iris van herpen cyberpunk daisies corals haute couture headdress with rhizomorph finials spires, brackets, fractal embroidered puffballs, octane render, 8 k"
2,a3154865-5d4c-4ba7-bcb2-7c1b4ed205e5.webp,"complex 3 d render hyper detailed ultra sharp scifi futuristic beautiful biomechanical humanoid woman with porcelain ivory face, medium shot rim light portrait, close - up, iris van herpen cyberpunk daisies corals haute couture headdress with rhizomorph finials spires, brackets, fractal embroidered puffballs, filigree lace, octane render, 8 k"
3,d6e3e37e-d426-41cf-b765-9c9872c1cdc5.webp,"complex 3 d render hyper detailed ultra sharp beautiful futuristic stunning biomechanical humanoid woman with porcelain ivory face, medium shot portrait, close - up, filigree lace, iris van herpen cyberpunk daisies corals haute couture headdress with rhizomorph finials spires, brackets, fractal embroidered puffballs, octane render, 8 k"
4,8763be47-d192-4542-a04a-f8e085273290.webp,"complex 3 d render hyper detailed ultra sharp beautiful futuristic stunning biomechanical humanoid woman with porcelain ivory face, medium shot portrait, close - up, filigree lace, iris van herpen daisies colras cyberpunk haute couture headdress with rhizomorph finials spires, brackets, fractal embroidered puffballs, octane render, 8 k"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1819808 entries, 0 to 1819807
Data columns (total 2 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   image_name  object
 1   prompt      object
dtypes: object(2)
memory usage: 27.8+ MB


In [5]:
df.describe()

Unnamed: 0,image_name,prompt
count,1819808,1819804
unique,1819808,1819804
top,3ccdc650-871a-4ad9-9bf2-dc475b83ed32.webp,"beautiful porcelain ivory fair face woman biomechanical cyborg, close - up, sharp focus, studio light, iris van herpen haute couture headdress made of rhizomorphs, daisies, brackets, colorful corals, fractal mushrooms, puffballs, octane render, ultra sharp, 8 k"
freq,1,1


In [6]:
# Check for NaN values in prompts
nan_count = df['prompt'].isna().sum()
print(f"Number of NaN prompts: {nan_count}")
print(f"Percentage of NaN prompts: {(nan_count / len(df)) * 100:.2f}%")

if nan_count > 0:
    print("\nRows with NaN prompts:")
    print(df[df['prompt'].isna()])


Number of NaN prompts: 4
Percentage of NaN prompts: 0.00%

Rows with NaN prompts:
                                        image_name prompt
1621     25d6b4fc-abaf-4601-bebf-0319313ec6d7.webp    NaN
143763   59a6c0cd-025b-4bf6-8e02-986748f22464.webp    NaN
339365   9aa57667-a7af-403f-b4df-91093c1204d4.webp    NaN
1802371  a899419d-0e76-4bf1-928c-063defedd20f.webp    NaN


In [7]:
# Create a subset where all prompts are not NaN
df_not_nan = df[df['prompt'].notna()]

# Save the new DataFrame to a CSV file
df_not_nan.to_csv('../../../data/processed/text_prompts/shorten_csvs/diffusiondb_not_nan.csv', index=False)

In [16]:
df_not_nan.shape

(1819804, 2)

In [14]:
# Check for special characters in prompts
special_chars = '[@_!#$%^&*()<>?/\|}{~:]'
has_special = df_not_nan['prompt'].str.contains(f'[{special_chars}]', regex=True)

print(f"Number of prompts with special characters: {has_special.sum()}")
print(f"Percentage of prompts with special characters: {(has_special.sum() / len(df_not_nan)) * 100:.2f}%")

# Display some examples of prompts with special characters
print("\nExample prompts with special characters:")
print(df_not_nan[has_special]['prompt'].head())


Number of prompts with special characters: 20
Percentage of prompts with special characters: 0.00%

Example prompts with special characters:
398615     beautiful full-body portrait commission of a [male furry!!! anthro!!! albino mountain lion fursona!!!] [wearing a yellow button-down shirt, olive green slacks] [in a Old-timey saloon]. Atmospheric. Renowned character illustration by greg rutkowski, thomas kindkade, alphonse mucha, loish, norman rockwell. detailed, inked, western comic book art
457431                                                                                                                                                                                                                                                                                                                                                      love :^]
457432                                                                                                                                           

In [17]:
# Remove entries with special characters
df_cleaned = df_not_nan[~has_special]

# Save the cleaned DataFrame to a CSV file
df_cleaned.to_csv('../../../data/processed/text_prompts/shorten_csvs/diffusiondb_no_special.csv', index=False)


## 2. pixart

In [3]:
import pandas as pd
pixart_df = pd.read_csv('../../../data/processed/text_prompts/shorten_csvs/pixart.csv')
print(pixart_df.shape)
pixart_df.head()

(11529794, 2)


Unnamed: 0,key,prompt
0,sa_3631,"The image features a man in military fatigues,..."
1,sa_3196,"The image features a large body of water, like..."
2,sa_2071,"The image features a large jetliner, specifica..."
3,sa_886,"The image features a large, modern, and colorf..."
4,sa_7088,The image features a group of men dressed in c...


In [2]:
pixart_hf[-1]

{'__key__': 'sa_431185',
 '__url__': '/home/thivt1/.cache/huggingface/datasets/downloads/ac28231c978dd5b982e1fa9be23f87fa3923d4b83e0f17c277700c7240824e35',
 'txt': 'The image features a large, colorful boat docked at a pier with a red and green roof. The scene is set in a harbor or marina, where several boats are docked. The style of the image is a black and white photo, which adds a timeless and classic feel to the scene. The presence of a person on the dock suggests that this is a busy harbor, and the boats are likely used for recreational or commercial purposes. The overall atmosphere of the image is lively and bustling, with the boats and people interacting in a dynamic environment.'}

In [28]:
# Convert to dataframe with __key__, __url__ and txt columns
# pixart = pd.DataFrame({
#     '__key__': [item['__key__'] for item in pixart_hf],
#     '__url__': [item['__url__'] for item in pixart_hf],
#     'prompt': [item['txt'] for item in pixart_hf]
# })

## 3. laion

In [6]:
laion = load_dataset("dclure/laion-aesthetics-12m-umap")
laion = laion['train']

In [11]:
print(len(laion))
laion[10]

12096809


{'URL': 'https://render.fineartamerica.com/images/rendered/search/print/images-medium-5/1-lake-in-winter-crater-lake-crater-panoramic-images.jpg',
 'TEXT': 'Lake In Winter, Crater Lake, Crater Art Print',
 'WIDTH': 400.0,
 'HEIGHT': 266.0,
 'similarity': 0.318795382976532,
 'punsafe': 1.3206558833189774e-06,
 'pwatermark': 1.0,
 'AESTHETIC_SCORE': 6.085381984710693,
 'hash': -8877775580192596443,
 'x_nn60': -20.535648345947266,
 'y_nn60': 2.3713226318359375,
 'x_nn10': 23.585365295410156,
 'y_nn10': -6.865099906921387,
 'x_nn30': 20.44169044494629,
 'y_nn30': -14.880393981933594,
 '__index_level_0__': -8877775580192596443}

In [11]:
# Convert to dataframe with URL, TEXT, AESTHETIC_SCORE columns
laion_df = pd.DataFrame({
    'URL': [item['URL'] for item in laion],
    'TEXT': [item['TEXT'] for item in laion],
    'AESTHETIC_SCORE': [item['AESTHETIC_SCORE'] for item in laion]
})


KeyboardInterrupt: 

## 4. pd12m

In [3]:
from datasets import load_dataset
pd12m = load_dataset("Spawning/PD12M")

Resolving data files:   0%|          | 0/126 [00:01<?, ?it/s]

In [4]:
print(pd12m.shape)
pd12m['id'].nunique(), pd12m['url'].nunique()

{'train': (12400094, 9)}


KeyError: 'id'

In [33]:
pd12m['train'][0]

{'id': '05e34aaf1f36189543b12a447dbe8f81',
 'url': 'https://pd12m.s3.us-west-2.amazonaws.com/images/02fe431e-10d5-5416-afdd-c6811343222e.jpeg',
 'caption': 'The image shows a miniature portrait of a man in a blue coat, framed in a photo frame against a white background.',
 'width': 2718,
 'height': 3088,
 'mime_type': 'image/jpeg',
 'hash': 'cd7d10443fe252bb1041e8fd7c6417f1',
 'license': 'https://creativecommons.org/publicdomain/zero/1.0/',
 'source': 'National Museum in Warsaw'}

: 

## 5. journeydb

In [None]:
from datasets import load_dataset
ds = load_dataset("JourneyDB/JourneyDB", token="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")