In [67]:
import json
import math
import time
import urllib.parse
import urllib.request
from urllib.error import HTTPError

In [68]:
# Tag sent to e621 as the `tags=` search term; it is also the second tag in
# every overlap query.
franchise = 'japanese_mythology'
# Value substituted for ${FRANCHISE} in the template below and used as the
# file name of the generated tag list.
franchise_name = 'Mythology'
# Stop value of the page range requested from e621 (the range starts at 1).
page_limit = 25

# File that the filled-in template is appended to.
franchises_file = '../wildcards/Franchises.yaml'
# Template appended to `franchises_file`; each ${FRANCHISE} placeholder is
# replaced by `franchise_name` before writing.
# NOTE(review): the template refers to `Species/...` while `characterFile`
# lives under `species/` — confirm the difference in case is harmless.
franchises_text = """
'<[model][pony]>source_furry, __Species/${FRANCHISE}__':
  Description:
    - ${FRANCHISE} Characters
  Tags:
    - Myspecies
    - ${FRANCHISE}
"""
# Text file the generated prompt lines are appended to, one line per accepted tag.
characterFile = f'../wildcards/species/{franchise_name}.txt'

In [69]:
def myurlopen(url, max_wait=300):
    """Open ``url`` with a browser-like User-Agent, retrying on HTTP errors.

    Every ``HTTPError`` is printed and followed by a pause of
    ``2 + 2**i / 1000`` seconds, where ``i`` starts at 2 and grows by one per
    failed attempt. Only ``HTTPError`` triggers a retry; any other exception
    raised while opening the URL propagates unchanged.

    Args:
        url: Address to fetch.
        max_wait: Longest single pause, in seconds, that is still acceptable.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The caller
        is responsible for closing it.

    Raises:
        TimeoutError: If the next pause would be longer than ``max_wait``.
    """
    # The request is identical for every attempt, so build it once.
    request = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0'
        },
    )
    i = 2
    while True:
        try:
            return urllib.request.urlopen(request)
        except HTTPError as e:
            print(e)
            # An HTTPError doubles as the (open) error response; release it
            # now rather than holding one more handle per failed attempt.
            e.close()
            sleep_time = 2 + 2**i / 1000
            if sleep_time > max_wait:
                print('ABORT!')
                raise TimeoutError(f'Timed out with max wait time of {max_wait} s')
            print(f'Some HTTP error occurred. Waiting for {sleep_time}...')
            time.sleep(sleep_time)
            i += 1


def cosine_similarity(tag1_count, tag2_count, tag12_count):
    """Return the shared count divided by the geometric mean of both counts.

    Args:
        tag1_count: Number of posts carrying the first tag.
        tag2_count: Number of posts carrying the second tag.
        tag12_count: Number of posts carrying both tags.

    Returns:
        ``tag12_count / sqrt(tag1_count * tag2_count)``, or ``0.0`` when
        either individual count is zero.
    """
    # A tag without any posts cannot overlap with anything; answer 0.0
    # instead of dividing by zero.
    if tag1_count == 0 or tag2_count == 0:
        return 0.0
    return tag12_count / math.sqrt(tag1_count * tag2_count)

def get_e621_count(search_string):
    """Return the ``count`` field reported by search.yiff.today for a query.

    Args:
        search_string: Raw (not yet percent-encoded) query text. Several tags
            may be joined with ``+``, which is passed through unchanged.

    Returns:
        The value stored under ``'count'`` in the JSON response.
    """
    # Percent-encode the query so tags containing non-ASCII or reserved
    # characters still form a valid URL; '+' stays literal because callers
    # use it to join tags.
    query = urllib.parse.quote(search_string, safe='+')
    url = f"https://search.yiff.today/count?query={query}"
    print(f'now working on url: {url}')
    # Close the response as soon as the body has been read.
    with myurlopen(url) as response:
        return json.loads(response.read())['count']

def get_similarity(tag1, tag2):
    """Return the fraction of ``tag1`` posts that also carry ``tag2``.

    The measure is asymmetric: it divides the combined count by the count of
    ``tag1`` only, so ``tag2`` is never looked up on its own.

    Args:
        tag1: Tag whose posts form the denominator.
        tag2: Tag that must appear alongside ``tag1``.

    Returns:
        ``count(tag1 + tag2) / count(tag1)``, or ``0.0`` when ``tag1`` has no
        posts at all.
    """
    tag1_count = get_e621_count(tag1)
    if tag1_count == 0:
        # Nothing to divide by, and the combined query cannot match either,
        # so skip the second request.
        return 0.0
    combined_count = get_e621_count(f'{tag1}+{tag2}')
    return combined_count / tag1_count

In [70]:
# Download the posts tagged with `franchise` page by page, stopping early at
# the first page that comes back empty.
jdata = []
# NOTE(review): range() excludes its stop value, so at most page_limit - 1
# pages are requested — confirm that this is intended.
for page in range(1, page_limit):
    url = f"https://e621.net/posts.json?limit=320&tags={franchise}&page={page}"
    print(f'now working on url: {url}')
    # Close each response once its body has been read instead of leaving the
    # connection to the garbage collector.
    with myurlopen(url) as response:
        rjson = json.loads(response.read())
    if not rjson['posts']:
        break
    jdata += rjson['posts']
    # Pause between pages so consecutive requests are at least a second apart.
    time.sleep(1)

now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=1
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=2
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=3
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=4
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=5
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=6
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=7
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=8
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=9
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=10
now working on url: https://e621.net/posts.json?limit=320&tags=japanese_mythology&page=11
now working on url:

In [71]:
# Tally how many of the downloaded posts carry each species tag.
character_tag_dict = {}

for entry in jdata:
    for species_tag in entry['tags']['species']:
        # A tag seen for the first time starts from zero.
        character_tag_dict[species_tag] = character_tag_dict.get(species_tag, 0) + 1

In [72]:
# Minimum share of a tag's posts that must also carry the franchise tag for
# the tag to be accepted.
franchise_overlap_threshold = 0.9
# Minimum number of occurrences among the downloaded posts for a tag to be
# considered at all.
total_post_threshold = 100

In [73]:
# Keep only the tags that occur at least `total_post_threshold` times in the
# downloaded posts.
filtered_dict = {
    tag: occurrences
    for tag, occurrences in character_tag_dict.items()
    if occurrences >= total_post_threshold
}

In [74]:
# Accept the frequent tags whose posts overlap with the franchise tag at
# least as much as `franchise_overlap_threshold` demands.
accepted_characters = []

for tag in filtered_dict:
    overlap = get_similarity(tag, franchise)
    if overlap >= franchise_overlap_threshold:
        accepted_characters.append(tag)

now working on url: https://search.yiff.today/count?query=bear
now working on url: https://search.yiff.today/count?query=bear+japanese_mythology
now working on url: https://search.yiff.today/count?query=foo_dog
now working on url: https://search.yiff.today/count?query=foo_dog+japanese_mythology
now working on url: https://search.yiff.today/count?query=komainu
now working on url: https://search.yiff.today/count?query=komainu+japanese_mythology
now working on url: https://search.yiff.today/count?query=mammal
now working on url: https://search.yiff.today/count?query=mammal+japanese_mythology
now working on url: https://search.yiff.today/count?query=yokai
now working on url: https://search.yiff.today/count?query=yokai+japanese_mythology
now working on url: https://search.yiff.today/count?query=canid
now working on url: https://search.yiff.today/count?query=canid+japanese_mythology
now working on url: https://search.yiff.today/count?query=canine
now working on url: https://search.yiff.today

In [75]:
# Show the tags that passed the overlap filter.
accepted_characters

['komainu',
 'yokai',
 'oni',
 'kappa',
 'nekomata',
 'kamaitachi',
 'bakeneko',
 'tengu']

In [76]:
# Turn every accepted tag into one prompt line: underscores become spaces,
# parentheses are backslash-escaped, and the tag is weighted 1.3 ahead of the
# franchise name.
prompt_escapes = str.maketrans({'_': ' ', '(': '\\(', ')': '\\)'})

# The franchise part is the same for every line, so format it once.
the_franchise = franchise.strip().translate(prompt_escapes)

characters = [
    f'({tag.strip().translate(prompt_escapes)}:1.3), {the_franchise}\n'
    for tag in accepted_characters
]

In [77]:
# Append the generated prompt lines and the filled-in franchise template to
# their files. Nothing is written when no tag was accepted. Both files are
# opened in append mode, so running this cell again adds the same content a
# second time.
if characters:
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; plain 'a' is enough because nothing is read back.
    with open(characterFile, 'a', encoding='utf-8') as f:
        f.writelines(characters)
    new_text = franchises_text.replace('${FRANCHISE}', franchise_name)
    with open(franchises_file, 'a', encoding='utf-8') as f:
        f.write(new_text)