In [None]:
from pathlib import Path

import cv2
import pandas as pd
import pytesseract
from matplotlib import pyplot as plt
from itertools import chain
from tqdm.notebook import tqdm

# Folder holding the screenshots to process; passed to run(), which reads its *.png files.
path_screenshots = Path.home() / "Pictures/Screenshots/20220905"
# Intended CSV destination. NOTE(review): not referenced anywhere else in the visible code --
# the export cell writes to a "../out/members.csv" literal instead; confirm which is intended.
path_out = Path("./out/members.csv")
# Tell pytesseract where the Tesseract executable lives (a Windows install path).
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

# Replacements applied by parse_clans(): an OCR'd clan line equal to a key is swapped for
# its value (presumably to restore names truncated in the member table -- confirm).
clan_map = {"Cowkings Desc": "Cowkings Descendants"}

In [None]:
def parse_base(items):
    """Split OCR text into its non-empty lines.

    Args:
        items: Raw text, with entries separated by newline characters.

    Returns:
        A list of the lines of ``items`` in their original order, with
        empty lines removed.
    """
    return [line for line in items.split("\n") if line]


def parse_names(names):
    """Parse alternating name / level lines into two parallel tuples.

    Non-empty lines are consumed two at a time: the first of each pair is the
    member name, the second is the level text. A trailing unpaired line is
    ignored.

    Args:
        names: Raw OCR text for the name column.

    Returns:
        ``[names_tuple, levels_tuple]`` when at least one pair was parsed,
        otherwise an empty list.

    Raises:
        ValueError: If the trimmed level text is not an integer.
    """
    lines = [line for line in names.split("\n") if line]

    pairs = []
    for idx in range(0, len(lines) - 1, 2):
        raw_name = lines[idx]
        raw_level = lines[idx + 1]

        # The profile frame is sometimes read as a leading dash or underscore.
        if raw_name.startswith(("-", "_")):
            raw_name = raw_name[1:]

        # Drop the 9-character prefix and the closing character around the number.
        pairs.append((raw_name, int(raw_level[9:-1])))

    return list(zip(*pairs))


def parse_clans(clans):
    """Parse the clan column, applying the ``clan_map`` replacements.

    Args:
        clans: Raw OCR text for the clan column, one clan per line.

    Returns:
        A list with one entry per non-empty line; a line that is a key of
        ``clan_map`` is replaced by the mapped value, any other line is kept
        as read.
    """
    return [clan_map.get(line, line) for line in clans.split("\n") if line]


def parse_roles(roles):
    """Parse the role column into a list of its non-empty lines.

    Args:
        roles: Raw OCR text for the role column, one role per line.

    Returns:
        The result of ``parse_base`` applied to ``roles``.
    """
    role_lines = parse_base(roles)
    return role_lines


def parse_contributions(contributions):
    """Parse the contribution column into integers.

    Args:
        contributions: Raw OCR text for the contribution column, one value
            per line.

    Returns:
        A list of ints, one per non-empty line.

    Raises:
        ValueError: If a line cannot be converted with ``int``.
    """
    return list(map(int, parse_base(contributions)))


def parse_member_data(img, parser, config=None):
    """OCR an image, parse the text and check that five members were found.

    Args:
        img: Image handed to ``pytesseract.image_to_string``.
        parser: Callable that turns the OCR text into member data.
        config: Optional Tesseract configuration string.

    Returns:
        Whatever ``parser`` returned, once its length has been validated.

    Raises:
        Exception: If a validated sequence does not contain exactly 5 items.
    """

    def _require_five(entries):
        count = len(entries)
        if count != 5:
            raise Exception(f"Expected 5 members but found {count}: {entries}.")

    ocr_text = pytesseract.image_to_string(img, config=config)
    member_data = parser(ocr_text)

    # A parser may return several parallel tuples (e.g. names and levels);
    # in that case every element is validated instead of the outer container.
    has_tuples = any(isinstance(entry, tuple) for entry in member_data)
    groups = member_data if has_tuples else [member_data]
    for group in groups:
        _require_five(group)

    return member_data


def process_member_crop(img, y1, y2, x1, x2, parser, config=None):
    """Crop a region of the member table, binarize it and OCR it.

    The crop is thresholded (inverted binary, Otsu) and passed to
    ``parse_member_data``. If that fails, the crop is enlarged to twice its
    size and the OCR is attempted once more.

    Args:
        img: Grayscale image array indexed as ``img[y, x]``.
        y1, y2: Row bounds of the crop.
        x1, x2: Column bounds of the crop.
        parser: Callable forwarded to ``parse_member_data``.
        config: Optional Tesseract configuration string.

    Returns:
        The parsed member data returned by ``parse_member_data``.

    Raises:
        Exception: Whatever the second attempt raises when it also fails.
    """

    def _binarize(gray):
        # Index [1] is the thresholded image; [0] is the threshold value Otsu chose.
        return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    img_crop = img[y1:y2, x1:x2]

    try:
        return parse_member_data(_binarize(img_crop), parser, config)
    except Exception:
        # Catch Exception only (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the run instead of triggering a second OCR pass.
        # Retry at 2x scale to handle crops with mostly single digit words.
        (h, w) = img_crop.shape[:2]
        img_scaled = cv2.resize(img_crop, (w * 2, h * 2))
        return parse_member_data(_binarize(img_scaled), parser, config)


def process_members(path):
    """Extract the member rows shown in one member-table screenshot.

    Four column crops (name/level, clan, role, contribution) are OCR'd with
    ``process_member_crop`` and combined row by row. The crop coordinates are
    hard-coded pixel positions (presumably tied to one screenshot resolution
    -- confirm before using other captures).

    Args:
        path: Path of the screenshot file.

    Returns:
        A DataFrame with columns ``Name``, ``Level``, ``Clan``, ``Role`` and
        ``Contribution``.

    Raises:
        OSError: If the image cannot be read.
        Exception: If OCR of any column fails on both attempts.
    """
    img = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising, which
        # would otherwise surface later as an opaque "not subscriptable" error.
        raise OSError(f"Unable to read image: {path}")

    # All column crops share the same vertical extent.
    row_top, row_bottom = 607, 1715

    # Names and levels are read from one crop and come back as two tuples.
    names, levels = process_member_crop(
        img, row_top, row_bottom, 1215, 1700, parse_names
    )

    # Clans: letters and spaces only.
    clans = process_member_crop(
        img,
        row_top,
        row_bottom,
        1700,
        2250,
        parse_clans,
        config=r'--psm 6 -c tessedit_char_whitelist="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "',
    )

    # Roles: default Tesseract configuration.
    roles = process_member_crop(img, row_top, row_bottom, 2250, 2600, parse_roles)

    # Contributions: digits only.
    contributions = process_member_crop(
        img,
        row_top,
        row_bottom,
        2800,
        3100,
        parse_contributions,
        config="--psm 6 -c tessedit_char_whitelist=0123456789",
    )

    # Transpose the per-column sequences into one record per member.
    rows = list(zip(names, levels, clans, roles, contributions))

    return pd.DataFrame(
        rows, columns=["Name", "Level", "Clan", "Role", "Contribution"]
    )


def parse_attrib(img, expected_count, config=None):
    """OCR an attribute crop and check how many values were read.

    Args:
        img: Image handed to ``pytesseract.image_to_string``.
        expected_count: Number of non-empty lines the crop must yield.
        config: Optional Tesseract configuration string.

    Returns:
        The non-empty OCR lines as a list of strings.

    Raises:
        Exception: If the number of lines differs from ``expected_count``.
    """
    values = parse_base(pytesseract.image_to_string(img, config=config))
    found = len(values)

    if found == expected_count:
        return values

    # Debugging aid: plt.imshow(img, cmap="gray") displays the crop that failed.
    raise Exception(
        f"Expected {expected_count} attributes but found {found}: {values}."
    )


def process_attrib_crop(path, y1, y2, x1, x2, expected_count, config=None):
    """Load a profile screenshot, crop one attribute column and OCR it.

    The crop is thresholded (inverted binary, Otsu) and passed to
    ``parse_attrib``. If that fails, the crop is enlarged to twice its size
    and the OCR is attempted once more.

    Args:
        path: Path of the screenshot file.
        y1, y2: Row bounds of the crop.
        x1, x2: Column bounds of the crop.
        expected_count: Number of values the crop must contain.
        config: Optional Tesseract configuration string.

    Returns:
        The list of attribute strings returned by ``parse_attrib``.

    Raises:
        OSError: If the image cannot be read.
        Exception: Whatever the second attempt raises when it also fails.
    """
    img = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f"Unable to read image: {path}")

    def _binarize(gray):
        # Index [1] is the thresholded image; [0] is the threshold value Otsu chose.
        return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    img_crop = img[y1:y2, x1:x2]

    try:
        return parse_attrib(_binarize(img_crop), expected_count, config)
    except Exception:
        # Catch Exception only (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the run instead of triggering a second OCR pass.
        (h, w) = img_crop.shape[:2]
        img_scaled = cv2.resize(img_crop, (w * 2, h * 2))
        return parse_attrib(_binarize(img_scaled), expected_count, config)


def process_member_attributes(paths):
    """Read every attribute of one member from their five profile screenshots.

    Each screenshot contributes one block of values, cropped from the same
    columns (x 3000-3210) at block-specific rows and converted per field.

    Args:
        paths: Indexable sequence of five screenshot paths, in capture order.

    Returns:
        A dict mapping attribute name to its parsed value. Integer fields are
        ints; the remaining fields are the OCR'd number divided by 100.

    Raises:
        Exception: If a crop cannot be OCR'd into the expected number of values.
        ValueError: If an OCR'd value cannot be converted to a number.
    """
    digits_only = "--psm 6 -c tessedit_char_whitelist=0123456789"
    digits_and_dot = "--psm 6 -c tessedit_char_whitelist=0123456789."

    def hundredth(text):
        return float(text) / 100

    # One entry per screenshot:
    # (index into paths, y1, y2, Tesseract config, ((attribute key, converter), ...))
    sections = (
        # Primary attributes
        (
            0,
            530,
            1660,
            digits_only,
            (
                ("Damage", int),
                ("Life", int),
                ("Strength", int),
                ("Intelligence", int),
                ("Fortitude", int),
                ("Vitality", int),
                ("Willpower", int),
            ),
        ),
        # Secondary attributes: Combat Rating ... Resistance
        (
            1,
            720,
            1450,
            digits_only,
            (
                ("CombatRating", int),
                ("Armor", int),
                ("ArmorPenetration", int),
                ("Potency", int),
                ("Resistance", int),
            ),
        ),
        # Additional attributes: Accuracy Rating ... Attack Speed
        (
            2,
            600,
            1490,
            digits_and_dot,
            (
                ("AccuracyRating", hundredth),
                ("EvasionRating", hundredth),
                ("CriticalHitChance", hundredth),
                ("CriticalResistance", hundredth),
                ("CriticalHitDamage", hundredth),
                ("AttackSpeed", hundredth),
            ),
        ),
        # Additional attributes: Cooldown Reduction ... Life Regeneration
        (
            3,
            690,
            1590,
            digits_and_dot,
            (
                ("CooldownReduction", hundredth),
                ("MovementSpeed", hundredth),
                ("BlockChance", hundredth),
                ("LifeDrain", hundredth),
                ("CheatDeath", hundredth),
                ("LifeRegeneration", hundredth),
            ),
        ),
        # Additional attributes: Damage Increase ... Resonance
        (
            4,
            1060,
            1630,
            digits_and_dot,
            (
                ("DamageIncrease", hundredth),
                ("DefenseIncrease", hundredth),
                ("MagicFind", hundredth),
                ("Resonance", int),
            ),
        ),
    )

    attrib = {}
    for shot_index, top, bottom, ocr_config, fields in sections:
        raw_values = process_attrib_crop(
            paths[shot_index],
            top,
            bottom,
            3000,
            3210,
            len(fields),
            config=ocr_config,
        )
        for (key, convert), text in zip(fields, raw_values):
            attrib[key] = convert(text)

    return attrib


def run(path):
    """OCR every screenshot batch in a folder into one member table.

    Screenshots are processed in sorted filename order, in batches of 26:
    1 screenshot showing 5 rows of the member table followed by 5 profile
    screenshots for each of those members. The first batch has only 21
    screenshots because the current user is always the first member and
    opening their profile from the member table is unsupported; their
    attribute columns are therefore left empty.

    Screenshots that do not fill a complete batch are ignored.

    Args:
        path: Directory (``pathlib.Path``) containing the ``*.png`` screenshots.

    Returns:
        A DataFrame with one row per member (table columns joined with the
        attribute columns), de-duplicated on ``Name``.

    Raises:
        ValueError: If the folder does not contain a single complete batch.
        Exception: Re-raised from processing, after the offending file(s)
            have been printed.
    """
    first_batch_size = 21
    batch_size = 26
    shots_per_profile = 5
    rows_per_page = 5
    max_members = 300

    screenshots = sorted(path.glob("*.png"))
    screenshot_batches = chain(
        zip(*[iter(screenshots[:first_batch_size])] * first_batch_size),
        zip(*[iter(screenshots[first_batch_size:])] * batch_size),
    )

    # Collect one frame per page and concatenate once at the end, instead of
    # repeatedly concatenating onto a growing (initially empty) DataFrame.
    page_frames = []
    is_first_page = True

    for member_table_screenshot, *member_profile_screenshots in tqdm(
        screenshot_batches, total=max_members // rows_per_page, desc="Member pages"
    ):
        try:
            members = process_members(member_table_screenshot)
        except Exception:
            print("Error processing file: " + str(member_table_screenshot))
            raise

        # On the first page an empty record stands in for the current user,
        # keeping the attribute rows aligned with the table rows.
        member_attrib = [{}] if is_first_page else []
        for member_profile_screenshot_batch in tqdm(
            zip(*[iter(member_profile_screenshots)] * shots_per_profile),
            total=(4 if is_first_page else 5),
        ):
            try:
                member_attrib.append(
                    process_member_attributes(member_profile_screenshot_batch)
                )
            except Exception:
                print(
                    "Error processing file(s): "
                    + ", ".join([str(s) for s in member_profile_screenshot_batch])
                )
                raise

        page_frames.append(members.join(pd.DataFrame(member_attrib)))
        is_first_page = False

    if not page_frames:
        raise ValueError(f"No complete screenshot batch found in {path}.")

    members_all = pd.concat(page_frames, ignore_index=True)

    # De-dupe to handle the case when there are less than 300 members.
    return members_all.drop_duplicates("Name", ignore_index=True)


# Run the whole OCR pipeline over the screenshot folder; `df` is written to CSV by the
# export cell that follows.
df = run(path_screenshots)


In [None]:
# Write the scraped member table to CSV without the index column.
# NOTE(review): this literal path differs from `path_out` ("./out/members.csv") defined in
# the configuration cell, which is otherwise unused -- confirm which location is intended.
df.to_csv("../out/members.csv", index=False)