In [1]:
import psycopg2, os, subprocess
import pandas as pd

## Notebook Purpose
<b> This is notebook number 1 </b>

This notebook queries the Postgres tables `public."Image"`, `public."Tag"`, and a few others (e.g. `public."TagsOnImage"`) to gather training data for all models in the mixture, including the NLP transformers, the ViT transformers, and the traditional CV models.

### Notebook Order
1. getData
2. downloadData
3. trainResNetModel | trainPromptTransformerClassifier | trainViTClassifier
4. localMixtureEval

In [2]:
# Path to the shell script
script_path = './creds/load_env.sh'

# Run the script and capture the output
proc = subprocess.Popen(['/bin/bash', script_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()

if proc.returncode != 0:
    print(f"Error sourcing .zshrc: {stderr.decode('utf-8')}")
else:
    # Parse the output and set the environment variables
    for line in stdout.decode('utf-8').splitlines():
        key, _, value = line.partition("=")
        # Remove the surrounding quotes from the value
        if value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        os.environ[key] = value

# Verify the environment variable is loaded
URL = os.getenv('REMOTE_POSTGRES_URL')  # Replace 'MY_VARIABLE' with your variable name to check

# print(URL)

In [3]:
# Open the database connection using the URL read from REMOTE_POSTGRES_URL
conn = psycopg2.connect(URL)
# Cursor on the same connection; the query cells visible here pass `conn`
# to pandas directly and do not reference `cur`.
cur = conn.cursor()

In [6]:
##tags we want
styles = ["anime", "photorealistic", "cartoon", "modern art", "realistic"]
subjects = ["man", "woman", "animal", "child"]

# (rating label, "Image"."nsfwLevel" value) pairs. One sub-select is emitted
# per pair, in this order, and the label becomes the `original_level` column.
NSFW_LEVELS = [("PG", 1), ("PG13", 2), ("R", 4), ("X", 8), ("XXX", 16)]

# Maximum rows fetched per (style, subject, rating) combination.
ROWS_PER_LEVEL = 1000

# Prefix used to build each image's download URL.
IMAGE_BASE_URL = 'https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/'

# CTE shared by every query: one row per image with its enabled,
# non-Rekognition tag names joined into a single ', '-separated string.
IMAGE_TAGS_CTE = """
        WITH ImageTags AS (
          SELECT
            toi."imageId",
            string_agg(t.name, ', ') AS tags
          FROM "TagsOnImage" toi
          JOIN "Tag" t ON t.id = toi."tagId"
          WHERE NOT toi.disabled
            AND toi.source != 'Rekognition'
          GROUP BY toi."imageId"
        )
"""


def build_level_subquery(level_name, nsfw_level, style, subject, where_clause,
                         limit=ROWS_PER_LEVEL):
    """Return the parenthesised SELECT for one rating level.

    Args:
        level_name: Rating label written to the ``original_level`` column.
        nsfw_level: Value compared against ``"Image"."nsfwLevel"``.
        style: Style tag, written to the ``style`` column.
        subject: Subject tag, written to the ``subject`` column.
        where_clause: Extra SQL predicate selecting the style/subject.
        limit: Maximum number of rows returned by this sub-select.

    Returns:
        SQL text for one sub-select, suitable for joining with UNION ALL.
    """
    # NOTE(review): LIMIT is applied without ORDER BY, so which rows come
    # back is up to the database and may differ between runs.
    return f"""
        -- Subquery for {level_name}
        (SELECT
          CONCAT('{IMAGE_BASE_URL}', i.url, '/width=450/', i.id, '.jpg') AS download_url,
          i."url",
          i."id",
          '{level_name}' AS original_level,
          it.tags,
          i.meta->>'prompt' AS prompt,
          '{style}' AS style,
          '{subject}' AS subject
        FROM "Image" i
        JOIN ImageTags it ON i."id" = it."imageId"
        WHERE i."nsfwLevel" = {nsfw_level}
          AND i.meta->>'prompt' IS NOT NULL
          AND {where_clause}
        LIMIT {limit})
"""


queries = []
for style in styles:
    for subject in subjects:
        # 'child' is matched against the prompt text; every other subject is
        # matched against the aggregated tag string.
        # NOTE(review): LIKE '%...%' is a substring match on the joined tag
        # string, so e.g. '%man%' also matches the tag 'woman'. Confirm
        # whether that overlap is acceptable for the 'man' subject.
        if subject == 'child':
            where_clause = f"i.meta->>'prompt' LIKE '%{subject}%' AND it.tags LIKE '%{style}%'"
        else:
            where_clause = f"it.tags LIKE '%{style}%' AND it.tags LIKE '%{subject}%'"

        level_subqueries = [
            build_level_subquery(level_name, nsfw_level, style, subject, where_clause)
            for level_name, nsfw_level in NSFW_LEVELS
        ]
        sql_query = (
            IMAGE_TAGS_CTE
            + "\n        UNION ALL\n".join(level_subqueries).rstrip()
            + ";\n"
        )
        queries.append(sql_query)


In [7]:
# Run every generated query on the open connection, keeping one DataFrame per query
results = []
n_queries = len(queries)
for index, query in enumerate(queries):
    print(f"working on query # {index} out of {n_queries}")
    results.append(pd.read_sql_query(query, conn))

# Stack the per-query frames into one DataFrame with a fresh 0..n-1 index
image_prompt_tag_data = pd.concat(results, ignore_index=True)

working on query # 0 out of 20


  result = pd.read_sql_query(query, conn)


working on query # 1 out of 20
working on query # 2 out of 20
working on query # 3 out of 20


### Notes about queries

- The line below is for running a single query (the original `sql_query` or an `updated_query`). The query built in this notebook is more advanced, so this code is not used here:
  `# image_with_ids = pd.read_sql_query(sql_query, conn)`

In [None]:
# Preview the first rows of the combined query results
image_prompt_tag_data.head()

Unnamed: 0,download_url,url,id,label,tags,prompt
0,https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...,a36fbc42-8cfa-47bb-95f8-91df2aa11ab8,1186294,PG,"anime, woman, blonde hair, blue eyes, chair, c...","<lora:minigirls-000002:1>, 1girl, tiny, minigi..."
1,https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...,b86782d4-adc2-49d4-3c85-11349297d900,129043,PG,"woman, general purpose, digital art, digital i...",A happy little boy playing with toys in a park...
2,https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...,defc97ed-e845-4a76-ec9b-9ce277eb2600,517280,PG,"man, woman, anime coloring, bangs, black-frame...","<conanAndhaibara>, 1girl, 1boy, brown hair, gl..."
3,https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...,2b8347cb-ab5a-4569-a1eb-ab9932c657a0,13552448,PG,"woman, solo, black hair, bandana, black eyes, ...","source_anime, score_9, score_8_up, score_7_up,..."
4,https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7...,ec396562-e65b-460c-357f-3cbac36beb00,129048,PG,"general purpose, digital art, digital illustra...",A happy little boy playing with toys in class ...


In [None]:
# Row count per rating level. The query aliases the rating column as
# `original_level`; there is no `label` column in the fetched frame.
image_prompt_tag_data.groupby('original_level')['id'].count()

label
PG      18
PG13     1
R        1
Name: id, dtype: int64

In [None]:
# Write the combined frame to CSV in the working directory, without the index column.
# NOTE(review): presumably consumed by the next notebook (downloadData) — confirm.
image_prompt_tag_data.to_csv('multiquery_image_prompt_tag_data.csv', index=False)