In [5]:
import json
import urllib.request
from urllib.error import HTTPError
import time
import math

In [6]:
franchise = 'mythology'
franchise_name = 'Mythology'
page_limit = 25

franchises_file = '../wildcards/Franchises.yaml'
franchises_text = """
'<[model][illu]>source_anime, __Characters/${FRANCHISE}__**(yellow skin, orange skin, red skin:1.5)**':
  Description:
    - ${FRANCHISE} Characters
  Tags:
    - Characters
    - ${FRANCHISE}
"""
characterFile = f'../wildcards/characters/{franchise_name}.txt'

In [7]:
def myurlopen(url, max_wait=300):
    i = 2
    while True:
        try:
            req = urllib.request.Request(
                                            url,
                                            data=None,
                                            headers={
                                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0'
                                            }
                                        )
            r = urllib.request.urlopen(req)
            return r
        except HTTPError as e:
            print(e)
            sleep_time = 2 + 2**i / 1000
            if sleep_time > max_wait:
                print('ABORT!')
                raise TimeoutError(f'Timed out with max wait time of {max_wait} s')
            print(f'Some HTTP error occurred. Waiting for {sleep_time}...')
            time.sleep(sleep_time)
            i += 1
            continue


def cosine_similarity(tag1_count, tag2_count, tag12_count):
    return tag12_count / math.sqrt(tag1_count * tag2_count)

def get_e621_count(search_string):
    url = f"https://search.yiff.today/count?query={search_string}"
    print(f'now working on url: {url}')
    response = myurlopen(url)
    return json.loads(response.read())['count']

def get_similarity(tag1, tag2):
    tag1_count = get_e621_count(tag1)
    #tag2_count = get_e621_count(tag2)
    combined_count = get_e621_count(f'{tag1}+{tag2}')
    #return cosine_similarity(tag1_count, tag2_count, combined_count)
    return combined_count / tag1_count

In [8]:
jdata = []
for page in range(1, page_limit):
    url = f"https://e621.net/posts.json?limit=320&tags={franchise}&page={page}"
    print(f'now working on url: {url}')
    response = myurlopen(url)
    rjson = json.loads(response.read())
    if not rjson['posts']:
        break
    jdata += rjson['posts']
    time.sleep(1)

now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=1
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=2
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=3
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=4
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=5
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=6
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=7
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=8
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=9
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=10
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=11
now working on url: https://e621.net/posts.json?limit=320&tags=mythology&page=12
now working on url: https://e621.net/

In [9]:
character_tag_dict = {}

for post in jdata:
    post_tags = post['tags']
    characters = post_tags['character']

    for c in characters:
        if c in character_tag_dict.keys():
            character_tag_dict[c] += 1
        else:
            character_tag_dict[c] = 1

In [10]:
reject_if_below_threshold = 0.7
reject_if_above_threshold = 0.6
franchise_overlap_threshold = 0.8
total_post_threshold = 15
tag_overlap = 0.66

In [11]:
filtered_dict = {}

for c in character_tag_dict.keys():
    if character_tag_dict[c] < total_post_threshold:
        continue
    filtered_dict[c] = character_tag_dict[c]

In [12]:
accept_criteria = [
    'female',
    '1girl',
    '1girls'
]

reject_criteria = [
    'male',
    '1boy'
]

In [13]:
accepted_characters = []

for c in filtered_dict.keys():
    hits = 0
    similarity = get_similarity(c, franchise)
    if similarity < franchise_overlap_threshold:
        hits += 1
    for a in accept_criteria:
        similarity = get_similarity(c, a)
        if similarity < reject_if_below_threshold:
            hits += 1
    for a in reject_criteria:
        similarity = get_similarity(c, a)
        if similarity > reject_if_above_threshold:
            hits += 1
    if hits == 0:
        accepted_characters.append(c)

now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)+mythology
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)+female
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)+1girl
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)+1girls
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)+male
now working on url: https://search.yiff.today/count?query=princess_celestia_(mlp)
now working on url: https://search.yiff.today/count?query=princ

In [14]:
accepted_characters

['princess_celestia_(mlp)',
 'loona_(helluva_boss)',
 'twilight_sparkle_(mlp)',
 'princess_cadance_(mlp)',
 'rainbow_dash_(mlp)',
 'malice_loona_(mangakitsune2)',
 'rarity_(mlp)',
 'scootaloo_(mlp)',
 'sweetie_belle_(mlp)',
 'princess_luna_(mlp)',
 'agape_(petruz)',
 'sunset_shimmer_(eg)',
 'trixie_(mlp)',
 'smolder_(mlp)',
 'cynder',
 'derpy_hooves_(mlp)',
 'princess_ember_(mlp)',
 'sisu_(ratld)',
 'lilana_rose']

In [15]:
characters = []

for c in accepted_characters:

    character = c.strip().replace('_', ' ').replace('(', '\\(').replace(')', '\\)')
    the_franchise = franchise.strip().replace('_', ' ').replace('(', '\\(').replace(')', '\\)')
    final_string = f'({character}:1.3), {the_franchise}\n'
    characters.append(final_string)

In [16]:
if len(characters) > 0:
    with open(characterFile, 'w+') as f:
        f.writelines( characters )
    with open(franchises_file, 'a+') as f:
        new_text = franchises_text.replace('${FRANCHISE}', franchise_name)
        f.write(new_text)