In [None]:
from pathlib import Path

import cv2
import pandas as pd
import pytesseract
from matplotlib import pyplot as plt
from itertools import chain

# Directory whose "*.png" files are processed by run().
path_screenshots = Path.home() / "Pictures/Screenshots"
# Destination of the CSV export written at the end of the notebook.
path_out = Path("./out/members.csv")
# Tell pytesseract where the Tesseract executable lives.
# NOTE(review): this is a machine-specific Windows install path; adjust it when
# running elsewhere.
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

In [None]:
def parse_base(items):
    """Split a block of text into its non-empty lines.

    Args:
        items: Text with one value per line.

    Returns:
        A list holding every non-empty line of ``items`` in its original
        order. Lines are not stripped, so a whitespace-only line is kept.
    """
    return [line for line in items.split("\n") if line]


def parse_names(names):
    """Parse the text of the name column into member names and levels.

    Non-empty lines are consumed in consecutive pairs: the first line of a
    pair is the name, the second is the level line. A trailing line without a
    partner is ignored.

    Args:
        names: Newline-separated text of the name column.

    Returns:
        A 2-tuple ``(names, levels)`` of equally long tuples of strings. Both
        tuples are empty when the text holds no complete pair, so callers can
        always unpack the result into two values.
    """
    lines = [line for line in names.split("\n") if line]

    parsed_names = []
    parsed_levels = []
    # zip stops at the shorter slice, which drops an unpaired trailing line.
    for name, level in zip(lines[0::2], lines[1::2]):
        # remove errant dash and underscores due to profile frame
        if name[:1] in ("-", "_"):
            name = name[1:]

        # Drop the 9-character prefix and the closing character around the
        # level value.
        parsed_levels.append(level[9:-1])
        parsed_names.append(name)

    return tuple(parsed_names), tuple(parsed_levels)


def parse_clans(clans):
    """Parse the text of the clan column into a list of clan names.

    Args:
        clans: Newline-separated text of the clan column.

    Returns:
        A list of the non-empty lines of ``clans``, where the truncated name
        "Cowkings Desc..." is replaced by its full form.
    """
    truncated_name = "Cowkings Desc..."
    full_name = "Cowkings Descendants"
    return [
        full_name if line == truncated_name else line
        for line in clans.split("\n")
        if line
    ]


def parse_roles(roles):
    """Parse the text of the role column.

    Args:
        roles: Newline-separated text of the role column.

    Returns:
        Whatever ``parse_base`` returns for ``roles``.
    """
    role_lines = parse_base(roles)
    return role_lines


def parse_contributions(contributions):
    """Parse the text of the contribution column.

    Args:
        contributions: Newline-separated text of the contribution column.

    Returns:
        Whatever ``parse_base`` returns for ``contributions``.
    """
    contribution_lines = parse_base(contributions)
    return contribution_lines


def _binarize_crop(img, y1, y2, x1, x2):
    """Crop ``img[y1:y2, x1:x2]`` and binarize it with inverted Otsu thresholding."""
    return cv2.threshold(
        img[y1:y2, x1:x2], 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]


def process_member_table(path):
    """OCR one member-table screenshot into a DataFrame of five members.

    The screenshot is loaded as grayscale, and a fixed pixel region per column
    is cropped, binarized and passed to Tesseract.
    NOTE(review): the crop coordinates are hard-coded and presumably tuned for
    one specific screenshot resolution - confirm before using other sizes.

    Args:
        path: Path of the screenshot file.

    Returns:
        A DataFrame with the columns "Name", "Level", "Clan", "Role" and
        "Contribution" and one row per member.

    Raises:
        OSError: If the screenshot cannot be read.
        ValueError: If any column did not yield exactly 5 values.
    """
    # Load image
    img = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read screenshot: {path}")

    # Names and levels share one crop
    names_text = pytesseract.image_to_string(
        _binarize_crop(img, 607, 1715, 1215, 1700)
    )
    # An empty parse yields nothing to unpack; fall back to two empty columns
    # so the length check below reports the problem.
    parsed_names = tuple(parse_names(names_text))
    names, levels = parsed_names if parsed_names else ((), ())

    # Clans
    clans_text = pytesseract.image_to_string(
        _binarize_crop(img, 607, 1715, 1700, 2250)
    )
    clans = parse_clans(clans_text)

    # Roles
    roles_text = pytesseract.image_to_string(
        _binarize_crop(img, 607, 1715, 2250, 2600)
    )
    roles = parse_roles(roles_text)

    # Contribution levels
    img_crop_contributions = _binarize_crop(img, 607, 1715, 2800, 3100)
    # Resize crop to improve accuracy of 1-2 digit words
    (h, w) = img_crop_contributions.shape[:2]
    img_crop_contributions = cv2.resize(img_crop_contributions, (w * 2, h * 2))
    contributions_text = pytesseract.image_to_string(
        img_crop_contributions, config="--psm 6 digits"
    )
    contributions = parse_contributions(contributions_text)

    columns = ["Name", "Level", "Clan", "Role", "Contribution"]
    data = [names, levels, clans, roles, contributions]

    # Ensure we parsed 5 rows from each crop
    if not all(len(x) == 5 for x in data):
        counts = dict(zip(columns, (len(x) for x in data)))
        raise ValueError(
            "Mismatched column lengths detected. "
            f"Expected 5 values per column in {path}, got {counts}."
        )

    return pd.DataFrame(list(zip(*data)), columns=columns)


def get_attrib_crop(path, y1, y2, x1, x2):
    """Load a screenshot as grayscale and return a binarized crop of it.

    Args:
        path: Path of the screenshot file.
        y1, y2: Row range of the crop (``img[y1:y2, ...]``).
        x1, x2: Column range of the crop (``img[..., x1:x2]``).

    Returns:
        The cropped region after inverted binary thresholding with Otsu's
        method.

    Raises:
        OSError: If the screenshot cannot be read.
    """
    img = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read screenshot: {path}")

    _, binarized = cv2.threshold(
        img[y1:y2, x1:x2], 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    return binarized


def _parse_percent(text):
    """Drop the last character of ``text`` and return the rest divided by 100."""
    return float(text[:-1]) / 100


# One entry per profile screenshot, in the order the screenshots are passed in:
# (crop box as (y1, y2, x1, x2), Tesseract config, ((attribute, parser), ...)).
# An empty config string is pytesseract's default, i.e. no extra options.
_ATTRIBUTE_GROUPS = (
    # Primary attributes
    (
        (530, 1660, 3000, 3220),
        "--psm 6 digits",
        (
            ("Damage", int),
            ("Life", int),
            ("Strength", int),
            ("Intelligence", int),
            ("Fortitude", int),
            ("Vitality", int),
            ("Willpower", int),
        ),
    ),
    # Secondary attributes: Combat Rating ... Resistance
    (
        (720, 1450, 3000, 3220),
        "--psm 6 digits",
        (
            ("CombatRating", int),
            ("Armor", int),
            ("ArmorPenetration", int),
            ("Potency", int),
            ("Resistance", int),
        ),
    ),
    # Additional attributes: Accuracy Rating ... Attack Speed
    (
        (600, 1490, 3000, 3220),
        "",
        (
            ("AccuracyRating", _parse_percent),
            ("EvasionRating", _parse_percent),
            ("CriticalHitChance", _parse_percent),
            ("CriticalResistance", _parse_percent),
            ("CriticalHitDamage", _parse_percent),
            ("AttackSpeed", _parse_percent),
        ),
    ),
    # Additional attributes: Cooldown Reduction ... Life Regeneration
    (
        (690, 1590, 3000, 3220),
        "",
        (
            ("CooldownReduction", _parse_percent),
            ("MovementSpeed", _parse_percent),
            ("BlockChance", _parse_percent),
            ("LifeDrain", _parse_percent),
            ("CheatDeath", _parse_percent),
            ("LifeRegeneration", _parse_percent),
        ),
    ),
    # Additional attributes: Damage Increase ... Resonance
    (
        (1060, 1630, 3000, 3220),
        "",
        (
            ("DamageIncrease", _parse_percent),
            ("DefenseIncrease", _parse_percent),
            ("MagicFind", _parse_percent),
            ("Resonance", int),
        ),
    ),
)


def process_member_attributes(paths):
    """OCR the five profile screenshots of one member into an attribute dict.

    Each screenshot contributes one group of attributes: a fixed region is
    cropped and binarized, read with Tesseract, and every resulting line is
    converted with the parser registered for its attribute.
    NOTE(review): the crop coordinates are hard-coded and presumably tuned for
    one specific screenshot resolution - confirm before using other sizes.

    Args:
        paths: Indexable collection of the five screenshot paths, in group
            order (primary, secondary, then the three "additional" pages).

    Returns:
        A dict mapping attribute name to its parsed value (``int`` for plain
        numbers, ``float`` fraction for percentage values), in group order.

    Raises:
        ValueError: If a screenshot yields a different number of values than
            its group expects, or a value cannot be converted. Pairing names
            with a wrong number of values would silently mislabel them.
        IndexError: If fewer than five paths are supplied.
    """
    attrib = dict()

    for index, (box, config, fields) in enumerate(_ATTRIBUTE_GROUPS):
        path = paths[index]
        crop = get_attrib_crop(path, *box)
        values = parse_base(pytesseract.image_to_string(crop, config=config))

        if len(values) != len(fields):
            raise ValueError(
                f"Expected {len(fields)} attribute values in {path}, "
                f"but OCR returned {len(values)}: {values!r}"
            )

        for (name, parse), value in zip(fields, values):
            attrib[name] = parse(value)

    return attrib


def run(path):
    """OCR every screenshot batch in ``path`` into one member DataFrame.

    Screenshots are the "*.png" files of ``path`` in sorted order. They are
    processed in batches of 26: 1 screenshot for 5 rows from the member table
    and 5 screenshots per member for attributes. The first batch is only 21
    screenshots due to the current user always being the first member and
    opening their profile from the member table is unsupported. Screenshots
    that do not fill a complete batch are ignored.

    Args:
        path: Directory (a ``pathlib.Path``) containing the screenshots.

    Returns:
        A DataFrame with one row per member: the member-table columns joined
        with the attribute columns. It is empty when no complete batch exists.
    """
    screenshots = sorted(path.glob("*.png"))
    # zip(*[iterator] * n) pulls n consecutive items into each tuple and drops
    # a trailing incomplete group.
    screenshot_batches = chain(
        zip(*[iter(screenshots[:21])] * 21),
        zip(*[iter(screenshots[21:])] * 26),
    )

    pages = []
    for page_number, batch in enumerate(screenshot_batches):
        member_table_screenshot, *member_profile_screenshots = batch
        members = process_member_table(member_table_screenshot)

        # The first page has no profile for the current user (row 0), so an
        # empty record keeps the attribute rows aligned with the table rows.
        member_attrib = [{}] if page_number == 0 else []
        member_attrib.extend(
            process_member_attributes(member_profile_screenshot_batch)
            for member_profile_screenshot_batch in zip(
                *[iter(member_profile_screenshots)] * 5
            )
        )

        pages.append(members.join(pd.DataFrame(member_attrib)))

    # TODO: de-dupe
    if not pages:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pages, ignore_index=True)

# Build the member table from every screenshot batch found in path_screenshots.
df = run(path_screenshots)


In [None]:
# Show the assembled member table as this cell's output.
df

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
path_out.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path_out, index=False)