In [1]:
import pandas as pd
import os
from pathlib import Path

working_dir = '../yas-dump/v1.3/dumps-v1.3-4k-relabel/'
p = Path(working_dir)

# Column order matters: the frame is exported with DataFrame.to_excel and the
# image-embedding cell addresses the processed-image path by its spreadsheet
# column letter.
COLUMNS = ['Label', 'IsValid', 'Position', 'DataId', 'ArtifactId',
           'PathImageProcessed', 'PathImageRaw', 'PathLabel']
SUB_STAT_POSITIONS = {'sub_stat_1', 'sub_stat_2', 'sub_stat_3', 'sub_stat_4'}

# Collect one dict per label file and build the DataFrame once at the end.
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.)
records = []

# sorted() makes the row order reproducible; iterdir() order is arbitrary.
for fpath in sorted(p.iterdir()):
    # Only real "*.txt" label files; a substring test would also match
    # names such as "foo.txt.bak".
    if fpath.suffix != '.txt':
        continue

    # NOTE(review): the file is opened with the platform default encoding,
    # as before. The labels contain Chinese text -- confirm the dump encoding
    # and pass encoding=... explicitly if this has to run on other machines.
    with open(fpath, 'r') as f:
        # The label is the first line; drop the line terminator that
        # readline() keeps so it does not leak into the label text.
        label = f.readline().rstrip('\r\n')

    # File names look like "<position>_<artifact id>.txt".
    data_id = fpath.stem
    position, _, art_id_text = data_id.rpartition('_')
    art_id = int(art_id_text)  # raises ValueError on an unexpected file name

    raw_image_name = data_id + '.png'
    processed_image_name = 'p_' + raw_image_name

    # Drop invalid data outright (false positives are not handled here).
    if position == 'equip':
        if '已装备' not in label:
            continue

        # Skip known-bad data: in the old version "珊瑚宫心海" was
        # recognised as this string at 4k resolution.
        if '神里绫已利已装备' in label:
            continue

    # A sub-stat label without "+" means the sub-stat does not exist
    # (a cropping problem), so skip it.
    if position in SUB_STAT_POSITIONS and '+' not in label:
        continue

    records.append({
        'IsValid': 'Y',
        'Label': label,
        'PathImageProcessed': str(fpath.parent / processed_image_name),
        'PathImageRaw': str(fpath.parent / raw_image_name),
        'PathLabel': str(fpath),
        'ArtifactId': art_id,
        'DataId': data_id,
        'Position': position,
    })

# Passing columns= keeps the column order fixed (and gives an empty frame the
# expected columns when no label file survived the filters).
df = pd.DataFrame(records, columns=COLUMNS)


In [2]:
# Coarse filtering: flag rows whose label cannot be right for their position.
#
# The rows yielded by DataFrame.iterrows() may be copies, so assigning to
# row['IsValid'] is not guaranteed to change df. Build boolean masks and
# write through df.loc instead.

labels = df['Label'].astype(str)
is_equip = df['Position'] == 'equip'
is_sub_stat = df['Position'].isin(['sub_stat_1', 'sub_stat_2', 'sub_stat_3', 'sub_stat_4'])

# Wrong "equipped" labels: an equip row must contain '已装备'.
bad_equip = is_equip & ~labels.str.contains('已装备', regex=False)

# Wrong sub-stat labels: a sub-stat row must contain '+'.
# regex=False because '+' is a regex metacharacter.
bad_sub_stat = is_sub_stat & ~labels.str.contains('+', regex=False)

df.loc[bad_equip | bad_sub_stat, 'IsValid'] = 'N'

# Save df to excel
df.to_excel('tmp.xlsx')

In [3]:
# Use openpyxl to embed the processed images into the Excel workbook
from openpyxl import Workbook, load_workbook
from openpyxl.drawing.image import Image

from PIL import Image as PLImage
import PIL.ImageOps    

wb = load_workbook('tmp.xlsx')
# Get first sheet
ws = wb.active

# Spreadsheet column that holds the processed-image path in tmp.xlsx
# (column A is the DataFrame index written by to_excel).
path_column = 'G'

# Row 1 is the header, so data starts at row 2. Stop at the last row that
# actually contains data instead of a hard-coded bound: a fixed limit silently
# drops rows beyond it and touches many empty cells when the sheet is shorter.
for i in range(2, ws.max_row + 1):
    path_processed_img = ws[f"{path_column}{i}"].value
    if path_processed_img:
        # Image here is openpyxl.drawing.image.Image; anchor it on column A
        # of the same row so it sits next to its label.
        img = Image(path_processed_img)
        ws.add_image(img, f"A{i}")

# Export the workbook used for manual annotation (correcting the labels).
wb.save('out.xlsx')



In [28]:
# 读取手工标注好的excel，生成dataset

import pandas as pd
from PIL import Image
import torchvision.transforms as transforms
import torch

# Prefix shared by every dataset file written further down.
dataset_name = 'realdata-labeled'

# Load the manually annotated workbook; the dataset is built from it next.
df = pd.read_excel('out.xlsx')
# df.to_csv('data-v13.csv')

def img_to_tensor(img: Image.Image) -> torch.Tensor:
    """Convert a PIL image to a tensor with torchvision's ``ToTensor``.

    Args:
        img: An opened PIL image (``PIL.Image.Image``). Note that ``Image`` in
            this cell is the PIL *module*, so the class is ``Image.Image``.

    Returns:
        The tensor produced by ``transforms.ToTensor()``. Presumably laid out
        as (channels, height, width) with values scaled to [0, 1], per the
        torchvision documentation -- confirm for the image modes used here.
    """
    return transforms.ToTensor()(img)
    

# Suffix convention: g means grayscale, b means binarized.
#
# Only the binarized images are exported. The grayscale variants (the
# 'PathImageRaw' column and its 900p counterpart) are not supported for now
# because their sizes are not uniform, so they could not be concatenated
# into a single tensor.


def _load_image_tensor(path: str) -> torch.Tensor:
    """Open the image at ``path``, convert it, and close the file again.

    Returns the result of ``img_to_tensor`` with a leading batch dimension
    added, ready to be concatenated along dim 0.
    """
    # The context manager releases the file handle as soon as the pixels
    # have been converted, instead of leaving it to garbage collection.
    with Image.open(path) as img:
        return torch.unsqueeze(img_to_tensor(img), dim=0)


x_4k_b = []
x_900p_b = []

y = []
for index, row in df.iterrows():
    y.append(row['Label'])

    path_4k_b = row['PathImageProcessed']
    # The 900p dump mirrors the 4k one; only the directory name differs.
    path_900p_b = path_4k_b.replace('4k-relabel', '900p')

    x_4k_b.append(_load_image_tensor(path_4k_b))
    x_900p_b.append(_load_image_tensor(path_900p_b))

# Labels are saved as a plain Python list, in the same order as the images.
torch.save(y, f"{dataset_name}-label.pt")

# torch.cat requires every image in a set to have the same size.
torch.save(torch.cat(x_4k_b, dim=0), f"{dataset_name}-4k-b.pt")
torch.save(torch.cat(x_900p_b, dim=0), f"{dataset_name}-900p-b.pt")


torch.Size([1, 1, 63, 622])