In [87]:
from pathlib import Path

import cv2
import pandas as pd
import pytesseract
from matplotlib import pyplot as plt

# Folder holding the input screenshots; run() picks up every "*.PNG" file in it.
# Tested only on screenshots exported as PNGs
path_screenshots = Path('./in')

In [86]:
# Pixel bounds used by extract_table() to crop the screenshot with
# img[y1:y2, x1_<column>:x2_<column>].
# Coordinates tested on an iPad Air 4th generation, YMMV

# Row range shared by all four column crops.
y1 = 520
y2 = 1300

# Column range of the name crop (parsed by parse_names).
x1_name = 850
x2_name = 1150

# Column range of the clan crop; starts where the name crop ends.
x1_clan = 1150
x2_clan = 1550

# Column range of the role crop; starts where the clan crop ends.
x1_role = 1550
x2_role = 1850

# Column range of the contribution crop; starts where the role crop ends.
x1_contribution = 1850
x2_contribution = 2320


def parse_names(names):
    """Split the OCR text of the name column into ``(name, level)`` pairs.

    The text is expected to alternate between a name line and a rank line.
    The value returned as ``level`` is whatever sits between the parentheses
    on the rank line.

    Args:
        names: Raw string returned by the OCR engine for the name crop.

    Returns:
        A list of ``(name, level)`` string tuples, in reading order. A
        trailing unpaired line is ignored.
    """
    # Example input:
    # 'Saahir\nRank: 60(188)\n\nTheDarknessZ1\nRank: 60(187)\n\nHammer\nRank: 60(182)\n\nWizzex\nRank: 60(185)\n\nCritRubs\nRank: 60(177)\n'
    out = list()

    # Strip each line and drop the ones that end up empty, so whitespace-only
    # lines (e.g. a trailing form-feed page separator) cannot shift the
    # name/rank pairing below.
    lines = [line.strip() for line in names.split('\n')]
    lines = [line for line in lines if line]

    # Walk the lines two at a time: (name line, rank line).
    for name, level in zip(*[iter(lines)] * 2):
        # remove errant dash and underscores due to profile frame
        if name[:1] == "-" or name[:1] == "_":
            name = name[1:]

        # Take the text between the parentheses instead of a fixed slice, so
        # values with fewer or more digits than 'Rank: 60(188)' still parse.
        _, paren, rest = level.partition('(')
        if paren:
            level = rest.partition(')')[0].strip()
        else:
            # No parenthesis recognised: fall back to the historical slice.
            level = level[9:12]

        out.append((name, level))

    return out


def parse_clans(clans):
    """Turn the OCR text of the clan column into a list of clan names.

    Args:
        clans: Raw string returned by the OCR engine for the clan crop.

    Returns:
        A list with one clan name per non-blank line, in reading order. The
        truncated label "Cowkings Desc..." is expanded to its full name.
    """
    # Example input:
    # 'Truck It\n\nTruck It\n\nCowkings Desc...\n\nTrauma\n\nCowkings Desc...\n'
    out = list()

    for clan in clans.split('\n'):
        # Whitespace-only lines (including a trailing form-feed page
        # separator) are not rows; keeping them would add a phantom entry.
        clan = clan.strip()
        if not clan:
            continue

        if clan == "Cowkings Desc...":
            clan = "Cowkings Descendants"
        out.append(clan)

    return out


def parse_roles(roles):
    """Turn the OCR text of the role column into a list of role labels.

    Args:
        roles: Raw string returned by the OCR engine for the role crop.

    Returns:
        A list with one role string per non-blank line, in reading order.
    """
    # Example input:
    # 'Officer\n\nClan Leader\n\nClan Leader\n\nOfficer\n\nOfficer\n'
    # Strip each line and skip whitespace-only ones (including a trailing
    # form-feed page separator) so they are not counted as rows.
    stripped = (role.strip() for role in roles.split('\n'))
    return [role for role in stripped if role]


def parse_contributions(contributions):
    """Turn the OCR text of the contribution column into a list of values.

    Args:
        contributions: Raw string returned by the OCR engine for the
            contribution crop.

    Returns:
        A list with one string per non-blank line, in reading order. The
        values are kept as strings exactly as recognised (no numeric cast).
    """
    # Example input:
    # '525\n\n400\n\n810\n\n520\n\n'
    # Strip each line and skip whitespace-only ones (including a trailing
    # form-feed page separator) so they are not counted as rows.
    stripped = (value.strip() for value in contributions.split('\n'))
    return [value for value in stripped if value]


def parse(names, clans, roles, contributions):
    """Combine the OCR text of the four column crops into one DataFrame.

    Each argument is the raw OCR string for one column; it is handed to the
    matching ``parse_*`` helper and the results are zipped row by row.

    Args:
        names: OCR text of the name column (name and rank lines).
        clans: OCR text of the clan column.
        roles: OCR text of the role column.
        contributions: OCR text of the contribution column.

    Returns:
        A DataFrame with the columns ``Name``, ``Level``, ``Clan``, ``Role``
        and ``Contribution``. It has zero rows when every column is empty.

    Raises:
        ValueError: If the columns do not all contain the same number of
            entries. The message lists the per-column counts.
    """
    names_levels = parse_names(names)
    clans = parse_clans(clans)
    roles = parse_roles(roles)
    contributions = parse_contributions(contributions)

    # zip(*[]) yields nothing, so unpacking it into two names would fail with
    # an unrelated "not enough values to unpack" error on empty OCR output.
    if names_levels:
        names, levels = zip(*names_levels)
    else:
        names, levels = (), ()

    columns = ['Name', 'Level', 'Clan', 'Role', 'Contribution']
    data = [names, levels, clans, roles, contributions]

    if not all(len(x) == len(names) for x in data):
        # Report how many entries each column produced so the faulty crop
        # can be identified without re-running with debug prints.
        counts = ", ".join(f"{column}={len(x)}" for column, x in zip(columns, data))
        raise ValueError("Mismatched column lengths detected. (" + counts + ")")

    df = pd.DataFrame(list(zip(*data)), columns=columns)

    return df


def _binarize_otsu(img_crop):
    """Return an inverted binary version of ``img_crop`` thresholded with Otsu's method."""
    return cv2.threshold(img_crop, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]


def extract_table(path):
    """OCR one screenshot and return its table rows as a DataFrame.

    The image is loaded in grayscale and cropped into four column regions
    using the module-level ``y1``/``y2`` and ``x1_*``/``x2_*`` bounds. Each
    crop is binarised and passed to Tesseract, and the recognised text is
    handed to ``parse``.

    Args:
        path: Path of the screenshot file to process.

    Returns:
        The DataFrame produced by ``parse`` for this screenshot.

    Raises:
        ValueError: If the image cannot be read.
        Exception: Any error raised by ``parse`` is re-raised after the
            offending file name has been printed.
    """
    img = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals a missing or undecodable file by returning None
        # instead of raising; fail here with a message that names the file.
        raise ValueError("Could not read image: " + str(path))

    img_crop_names = _binarize_otsu(img[y1:y2, x1_name:x2_name])
    img_crop_clans = _binarize_otsu(img[y1:y2, x1_clan:x2_clan])
    img_crop_roles = _binarize_otsu(img[y1:y2, x1_role:x2_role])

    # The contribution crop uses a fixed threshold rather than Otsu.
    # Resize the crop to improve accuracy of 1-2 digit words
    img_crop_contributions = img[y1:y2, x1_contribution:x2_contribution]
    img_crop_contributions = cv2.threshold(img_crop_contributions, 55, 255, cv2.THRESH_BINARY_INV)[1]
    (h, w) = img_crop_contributions.shape[:2]
    img_crop_contributions = cv2.resize(img_crop_contributions, (w*2, h*2))

    names = pytesseract.image_to_string(img_crop_names)
    clans = pytesseract.image_to_string(img_crop_clans)
    roles = pytesseract.image_to_string(img_crop_roles)
    contributions = pytesseract.image_to_string(img_crop_contributions, config="--psm 6 digits")

    try:
        df = parse(names, clans, roles, contributions)
    except Exception:
        # Name the screenshot that failed, then re-raise the original error.
        print("Error processing file: " + str(path))
        raise

    return df


def run(path):
    """OCR every screenshot in a folder and return the combined table.

    Args:
        path: ``pathlib.Path`` of the folder to scan. Files matching
            ``*.PNG`` are processed in sorted order.

    Returns:
        A DataFrame with the rows of all screenshots concatenated in file
        order, with later rows that repeat an earlier ``Name`` removed and
        the index renumbered. An empty DataFrame is returned when the folder
        contains no matching files.
    """
    # Collect the per-file frames and concatenate once rather than growing a
    # DataFrame inside the loop.
    frames = [extract_table(screenshot_path) for screenshot_path in sorted(path.glob("*.PNG"))]

    if not frames:
        # Nothing to concatenate, and no "Name" column to de-duplicate on.
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)

    # Handle case where the last screenshot overlaps the penultimate screenshot
    df = df.drop_duplicates("Name", ignore_index=True)
    return df


In [None]:

# Process every screenshot under path_screenshots and keep the result of run() as df.
df = run(path_screenshots)

In [88]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing.
path_out = Path('./out/members.csv')
path_out.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path_out, index=False)