# Asset 추출 및 매핑

이 노트북은 AWS Asset 패키지에서 추출된 아이콘들을 매핑하고 검증합니다.

## 개요

- AWS Asset 패키지에서 아이콘 추출
- Coarse/Fine 클래스 매핑
- 매핑 결과 검증 및 통계

**원본 노트북**: `coarse-fine-classses.ipynb`


In [1]:
import os
from pathlib import Path
import re

import pandas as pd

# 데이터 루트
DATA_ROOT = Path("data")

# Asset-Package 디렉터리 자동 탐색
asset_roots = list(DATA_ROOT.glob("Asset-Package*"))
if not asset_roots:
    raise RuntimeError("data/ 아래에 Asset-Package* 디렉터리가 없습니다. ZIP 먼저 풀어주세요.")

ASSET_ROOT = asset_roots[0]

print("ASSET_ROOT:", ASSET_ROOT)

SERVICE_ICON_ROOT  = ASSET_ROOT / "Architecture-Service-Icons_02072025"
RESOURCE_ICON_ROOT = ASSET_ROOT / "Resource-Icons_02072025"
CATEGORY_ICON_ROOT = ASSET_ROOT / "Category-Icons_02072025"
GROUP_ICON_ROOT    = ASSET_ROOT / "Architecture-Group-Icons_02072025"

print("SERVICE_ICON_ROOT :", SERVICE_ICON_ROOT.exists())
print("RESOURCE_ICON_ROOT:", RESOURCE_ICON_ROOT.exists())
print("CATEGORY_ICON_ROOT:", CATEGORY_ICON_ROOT.exists())
print("GROUP_ICON_ROOT   :", GROUP_ICON_ROOT.exists())

ASSET_ROOT: data/Asset-Package_02072025.dee42cd0a6eaacc3da1ad9519579357fb546f803
SERVICE_ICON_ROOT : True
RESOURCE_ICON_ROOT: True
CATEGORY_ICON_ROOT: True
GROUP_ICON_ROOT   : True


In [2]:
import pandas as pd
from pathlib import Path

DATA_ROOT = Path("data")
df = pd.read_csv(DATA_ROOT / "aws_icons_mapped_coarse20_fine.csv")
df_unmatched = pd.read_csv(DATA_ROOT / "aws_icons_unmatched.csv")

print(df.shape, df_unmatched.shape)
df["matched"].value_counts()
df["coarse_class"].value_counts(dropna=False).head(30)
df["canonical_service_name"].value_counts(dropna=False).head(30)

(1882, 10) (1126, 10)


canonical_service_name
NaN                          1126
amazon s3                      25
amazon vpc                     23
amazon ec2                     23
amazon fsx                     22
amazon sagemaker               22
aws backup                     21
aws systems manager            20
amazon aurora                  18
amazon braket                  17
aws iot greengrass             15
amazon connect                 15
amazon cloudwatch              15
amazon dynamodb                14
amazon rds                     14
amazon eks                     13
amazon eventbridge             13
amazon redshift                12
aws glue                       12
amazon route 53                11
amazon opensearch service      11
aws waf                        11
aws storage gateway            10
aws trusted advisor             9
aws snowball                    9
amazon efs                      9
aws app mesh                    9
aws iot sitewise                9
aws iot analytics        

In [3]:
fine_counts = df[df["matched"]].groupby("canonical_service_name")["file_path"].count().sort_values(ascending=False)
usable_fine = fine_counts[fine_counts >= 5].index.tolist()  # 예: 아이콘 5개 이상만 사용
len(usable_fine), usable_fine[:20]

(64,
 ['amazon s3',
  'amazon ec2',
  'amazon vpc',
  'amazon sagemaker',
  'amazon fsx',
  'aws backup',
  'aws systems manager',
  'amazon aurora',
  'amazon braket',
  'aws iot greengrass',
  'amazon connect',
  'amazon cloudwatch',
  'amazon dynamodb',
  'amazon rds',
  'amazon eventbridge',
  'amazon eks',
  'amazon redshift',
  'aws glue',
  'aws waf',
  'amazon route 53'])

In [4]:
COARSE_CLASSES = [
    "Compute",
    "Networking",
    "Storage",
    "Database",
    "Containers & Orchestration",
    "Serverless & Event-driven",
    "Application Integration",
    "Analytics",
    "AI & Machine Learning",
    "Security & Identity",
    "Monitoring & Logging",
    "Management & Governance",
    "DevOps & Developer Tools",
    "Migration & Transfer",
    "Business Applications",
    "IoT",
    "Media Services",
    "Blockchain",
    "Quantum",
    "Robotics / AR-VR",
]

df_coarse = pd.DataFrame({"coarse_class": COARSE_CLASSES})
df_coarse


Unnamed: 0,coarse_class
0,Compute
1,Networking
2,Storage
3,Database
4,Containers & Orchestration
5,Serverless & Event-driven
6,Application Integration
7,Analytics
8,AI & Machine Learning
9,Security & Identity


In [5]:
# Cell 3: Fine-grained 서비스 리스트 (수동 정의, canonical + service_code + coarse_class + include_fine)

FINE_SERVICES = [
    # =========================
    # Compute
    # =========================
    {
        "canonical_service_name": "Amazon EC2",
        "service_code": "ec2",
        "coarse_class": "Compute",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Batch",
        "service_code": "batch",
        "coarse_class": "Compute",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Elastic Beanstalk",
        "service_code": "elasticbeanstalk",
        "coarse_class": "Compute",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Lightsail",
        "service_code": "lightsail",
        "coarse_class": "Compute",
        "include_fine": True,
    },

    # =========================
    # Networking
    # =========================
    {
        "canonical_service_name": "Amazon VPC",
        "service_code": "ec2",  # VPC API는 ec2 하위
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Elastic Load Balancing",
        "service_code": "elasticloadbalancing",
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Route 53",
        "service_code": "route53",
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Global Accelerator",
        "service_code": "globalaccelerator",
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Direct Connect",
        "service_code": "directconnect",
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Transit Gateway",
        "service_code": "ec2",
        "coarse_class": "Networking",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Cloud Map",
        "service_code": "servicediscovery",
        "coarse_class": "Networking",
        "include_fine": True,
    },

    # =========================
    # Storage
    # =========================
    {
        "canonical_service_name": "Amazon S3",
        "service_code": "s3",
        "coarse_class": "Storage",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon EFS",
        "service_code": "efs",
        "coarse_class": "Storage",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon FSx",
        "service_code": "fsx",
        "coarse_class": "Storage",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Backup",
        "service_code": "backup",
        "coarse_class": "Storage",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Storage Gateway",
        "service_code": "storagegateway",
        "coarse_class": "Storage",
        "include_fine": True,
    },

    # =========================
    # Database
    # =========================
    {
        "canonical_service_name": "Amazon RDS",
        "service_code": "rds",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Aurora",
        "service_code": "rds",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon DynamoDB",
        "service_code": "dynamodb",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Redshift",
        "service_code": "redshift",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon ElastiCache",
        "service_code": "elasticache",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon DocumentDB",
        "service_code": "docdb",
        "coarse_class": "Database",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Neptune",
        "service_code": "neptune",
        "coarse_class": "Database",
        "include_fine": True,
    },

    # =========================
    # Containers & Orchestration
    # =========================
    {
        "canonical_service_name": "Amazon ECS",
        "service_code": "ecs",
        "coarse_class": "Containers & Orchestration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon EKS",
        "service_code": "eks",
        "coarse_class": "Containers & Orchestration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Fargate",
        "service_code": "fargate",
        "coarse_class": "Containers & Orchestration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS App Runner",
        "service_code": "apprunner",
        "coarse_class": "Containers & Orchestration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon ECR",
        "service_code": "ecr",
        "coarse_class": "Containers & Orchestration",
        "include_fine": True,
    },

    # =========================
    # Serverless & Event-driven
    # =========================
    {
        "canonical_service_name": "AWS Lambda",
        "service_code": "lambda",
        "coarse_class": "Serverless & Event-driven",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon EventBridge",
        "service_code": "events",
        "coarse_class": "Serverless & Event-driven",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Step Functions",
        "service_code": "states",
        "coarse_class": "Serverless & Event-driven",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon SNS",
        "service_code": "sns",
        "coarse_class": "Serverless & Event-driven",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon SQS",
        "service_code": "sqs",
        "coarse_class": "Serverless & Event-driven",
        "include_fine": True,
    },

    # =========================
    # Application Integration
    # =========================
    {
        "canonical_service_name": "Amazon API Gateway",
        "service_code": "apigateway",
        "coarse_class": "Application Integration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS AppSync",
        "service_code": "appsync",
        "coarse_class": "Application Integration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon MQ",
        "service_code": "mq",
        "coarse_class": "Application Integration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon MSK",
        "service_code": "kafka",
        "coarse_class": "Application Integration",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS App Mesh",
        "service_code": "appmesh",
        "coarse_class": "Application Integration",
        "include_fine": True,
    },

    # =========================
    # Analytics
    # =========================
    {
        "canonical_service_name": "Amazon Kinesis Data Streams",
        "service_code": "kinesis",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Kinesis Data Firehose",
        "service_code": "firehose",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Kinesis Data Analytics",
        "service_code": "kinesisanalytics",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Glue",
        "service_code": "glue",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Athena",
        "service_code": "athena",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon EMR",
        "service_code": "emr",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon OpenSearch Service",
        "service_code": "opensearch",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon QuickSight",
        "service_code": "quicksight",
        "coarse_class": "Analytics",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Lake Formation",
        "service_code": "lakeformation",
        "coarse_class": "Analytics",
        "include_fine": True,
    },

    # =========================
    # AI & Machine Learning
    # =========================
    {
        "canonical_service_name": "Amazon SageMaker",
        "service_code": "sagemaker",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Comprehend",
        "service_code": "comprehend",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Rekognition",
        "service_code": "rekognition",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Textract",
        "service_code": "textract",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Transcribe",
        "service_code": "transcribe",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Polly",
        "service_code": "polly",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Translate",
        "service_code": "translate",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Bedrock",
        "service_code": "bedrock",
        "coarse_class": "AI & Machine Learning",
        "include_fine": True,
    },

    # =========================
    # Security & Identity
    # =========================
    {
        "canonical_service_name": "AWS Identity and Access Management (IAM)",
        "service_code": "iam",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Cognito",
        "service_code": "cognito-idp",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Key Management Service (KMS)",
        "service_code": "kms",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Secrets Manager",
        "service_code": "secretsmanager",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Shield",
        "service_code": "shield",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS WAF",
        "service_code": "waf",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon GuardDuty",
        "service_code": "guardduty",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Security Hub",
        "service_code": "securityhub",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Macie",
        "service_code": "macie",
        "coarse_class": "Security & Identity",
        "include_fine": True,
    },

    # =========================
    # Monitoring & Logging
    # =========================
    {
        "canonical_service_name": "Amazon CloudWatch",
        "service_code": "cloudwatch",
        "coarse_class": "Monitoring & Logging",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CloudTrail",
        "service_code": "cloudtrail",
        "coarse_class": "Monitoring & Logging",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS X-Ray",
        "service_code": "xray",
        "coarse_class": "Monitoring & Logging",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Config",
        "service_code": "config",
        "coarse_class": "Monitoring & Logging",
        "include_fine": True,
    },

    # =========================
    # Management & Governance
    # =========================
    {
        "canonical_service_name": "AWS Systems Manager",
        "service_code": "ssm",
        "coarse_class": "Management & Governance",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Control Tower",
        "service_code": "controltower",
        "coarse_class": "Management & Governance",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Organizations",
        "service_code": "organizations",
        "coarse_class": "Management & Governance",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS License Manager",
        "service_code": "license-manager",
        "coarse_class": "Management & Governance",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Trusted Advisor",
        "service_code": "trustedadvisor",
        "coarse_class": "Management & Governance",
        "include_fine": True,
    },

    # =========================
    # DevOps & Developer Tools
    # =========================
    {
        "canonical_service_name": "AWS CodeCommit",
        "service_code": "codecommit",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CodeBuild",
        "service_code": "codebuild",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CodeDeploy",
        "service_code": "codedeploy",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CodePipeline",
        "service_code": "codepipeline",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CodeArtifact",
        "service_code": "codeartifact",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS CloudFormation",
        "service_code": "cloudformation",
        "coarse_class": "DevOps & Developer Tools",
        "include_fine": True,
    },

    # =========================
    # Migration & Transfer
    # =========================
    {
        "canonical_service_name": "AWS Database Migration Service",
        "service_code": "dms",
        "coarse_class": "Migration & Transfer",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS DataSync",
        "service_code": "datasync",
        "coarse_class": "Migration & Transfer",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Snowball",
        "service_code": "snowball",
        "coarse_class": "Migration & Transfer",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Transfer Family",
        "service_code": "transfer",
        "coarse_class": "Migration & Transfer",
        "include_fine": True,
    },

    # =========================
    # Business Applications
    # =========================
    {
        "canonical_service_name": "Amazon WorkSpaces",
        "service_code": "workspaces",
        "coarse_class": "Business Applications",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon WorkDocs",
        "service_code": "workdocs",
        "coarse_class": "Business Applications",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon WorkMail",
        "service_code": "workmail",
        "coarse_class": "Business Applications",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Connect",
        "service_code": "connect",
        "coarse_class": "Business Applications",
        "include_fine": True,
    },
    {
        "canonical_service_name": "Amazon Chime",
        "service_code": "chime",
        "coarse_class": "Business Applications",
        "include_fine": True,
    },

    # =========================
    # IoT
    # =========================
    {
        "canonical_service_name": "AWS IoT Core",
        "service_code": "iot",
        "coarse_class": "IoT",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS IoT Greengrass",
        "service_code": "greengrass",
        "coarse_class": "IoT",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS IoT Analytics",
        "service_code": "iotanalytics",
        "coarse_class": "IoT",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS IoT SiteWise",
        "service_code": "sitewise",
        "coarse_class": "IoT",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS IoT Events",
        "service_code": "iotevents",
        "coarse_class": "IoT",
        "include_fine": True,
    },

    # =========================
    # Media Services
    # =========================
    {
        "canonical_service_name": "AWS Elemental MediaConvert",
        "service_code": "mediaconvert",
        "coarse_class": "Media Services",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Elemental MediaLive",
        "service_code": "medialive",
        "coarse_class": "Media Services",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Elemental MediaPackage",
        "service_code": "mediapackage",
        "coarse_class": "Media Services",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Elemental MediaStore",
        "service_code": "mediastore",
        "coarse_class": "Media Services",
        "include_fine": True,
    },
    {
        "canonical_service_name": "AWS Elemental MediaTailor",
        "service_code": "mediatailor",
        "coarse_class": "Media Services",
        "include_fine": True,
    },

    # =========================
    # Blockchain
    # =========================
    {
        "canonical_service_name": "Amazon Managed Blockchain",
        "service_code": "managedblockchain",
        "coarse_class": "Blockchain",
        "include_fine": True,
    },

    # =========================
    # Quantum
    # =========================
    {
        "canonical_service_name": "Amazon Braket",
        "service_code": "braket",
        "coarse_class": "Quantum",
        "include_fine": True,
    },

    # =========================
    # Robotics / AR-VR
    # =========================
    {
        "canonical_service_name": "AWS RoboMaker",
        "service_code": "robomaker",
        "coarse_class": "Robotics / AR-VR",
        "include_fine": True,
    },
]

df_fine = pd.DataFrame(FINE_SERVICES)
print("fine services 개수:", len(df_fine))
df_fine.head()


fine services 개수: 101


Unnamed: 0,canonical_service_name,service_code,coarse_class,include_fine
0,Amazon EC2,ec2,Compute,True
1,AWS Batch,batch,Compute,True
2,AWS Elastic Beanstalk,elasticbeanstalk,Compute,True
3,Amazon Lightsail,lightsail,Compute,True
4,Amazon VPC,ec2,Networking,True


In [6]:
def normalize_name(text: str) -> str:
    if not isinstance(text, str):
        return ""
    t = text.lower().strip()
    # AWS/Amazon prefix 제거
    t = re.sub(r"^(aws|amazon)\s+", "", t)
    # 특수문자 정리
    t = t.replace("&", "and")
    t = t.replace("/", " ")
    t = t.replace("_", " ")
    t = t.replace("-", " ")
    # service/services 제거
    drop = {"service", "services"}
    tokens = [w for w in re.split(r"\s+", t) if w and w not in drop]
    return " ".join(tokens).strip()


# canonical -> coarse/service_code 매핑 dict
SERVICE_TO_COARSE = {
    row["canonical_service_name"].lower(): row["coarse_class"]
    for _, row in df_fine.iterrows()
}

SERVICE_TO_CODE = {
    row["canonical_service_name"].lower(): row["service_code"]
    for _, row in df_fine.iterrows()
}

# Stage2 라벨 목록
FINE_LABELS = sorted(df_fine["canonical_service_name"].tolist())

print("Stage1 coarse classes:", len(COARSE_CLASSES))
print("Stage2 fine labels   :", len(FINE_LABELS))


Stage1 coarse classes: 20
Stage2 fine labels   : 101


In [7]:
def resolve_service_to_canonical(name: str):
    """
    아이콘 파일명에서 추출한 candidate_name을
    사람이 정의한 canonical_service_name 중 하나로 매핑.
    (단순 정규화 + 부분 매칭만, fuzzy는 나중 단계)
    """
    if not isinstance(name, str):
        return None

    norm = normalize_name(name)

    # 1) canonical 전체 정규화와 완전 일치
    for canonical in SERVICE_TO_COARSE.keys():
        if normalize_name(canonical) == norm:
            return canonical

    # 2) canonical 정규화가 candidate 안에 포함되는 경우
    for canonical in SERVICE_TO_COARSE.keys():
        if normalize_name(canonical) in norm:
            return canonical

    # 3) candidate가 canonical 안에 포함되는 경우
    for canonical in SERVICE_TO_COARSE.keys():
        if norm and norm in normalize_name(canonical):
            return canonical

    return None


def get_coarse_from_canonical(canonical: str):
    if not canonical:
        return None
    return SERVICE_TO_COARSE.get(canonical.lower())


def get_service_code_from_canonical(canonical: str):
    if not canonical:
        return None
    return SERVICE_TO_CODE.get(canonical.lower())

In [8]:
SIZE_SUFFIX_RE = re.compile(r"_(16|24|32|48|64|128)$")

def extract_name_from_stem(stem: str) -> str:
    """
    ex) Amazon-EC2_Instance_48 -> Amazon EC2 Instance
        AWS-Lambda_64          -> AWS Lambda
    """
    s = stem
    s = SIZE_SUFFIX_RE.sub("", s)
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s


def iter_icons(root: Path, icon_type: str):
    if not root.exists():
        return
    for svg in root.rglob("*.svg"):
        rel = svg.relative_to(ASSET_ROOT)
        yield {
            "icon_type": icon_type,
            "file_path": str(rel),
            "file_name": svg.name,
            "stem": svg.stem,
            "dir_name": svg.parent.name,
            "candidate_name": extract_name_from_stem(svg.stem),
        }


rows = []
rows.extend(list(iter_icons(SERVICE_ICON_ROOT,  "service")))
rows.extend(list(iter_icons(RESOURCE_ICON_ROOT, "resource")))
rows.extend(list(iter_icons(CATEGORY_ICON_ROOT, "category")))
rows.extend(list(iter_icons(GROUP_ICON_ROOT,    "group")))

df_icons = pd.DataFrame(rows)
print("Total icons scanned:", len(df_icons))
df_icons.head(20)


Total icons scanned: 1882


Unnamed: 0,icon_type,file_path,file_name,stem,dir_name,candidate_name
0,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_64.svg,Arch_AWS-Marketplace_Dark_64,64,Arch AWS Marketplace Dark
1,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_64.svg,Arch_AWS-Marketplace_Light_64,64,Arch AWS Marketplace Light
2,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_16.svg,Arch_AWS-Marketplace_Dark_16,16,Arch AWS Marketplace Dark
3,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_16.svg,Arch_AWS-Marketplace_Light_16,16,Arch AWS Marketplace Light
4,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_32.svg,Arch_AWS-Marketplace_Dark_32,32,Arch AWS Marketplace Dark
5,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_32.svg,Arch_AWS-Marketplace_Light_32,32,Arch AWS Marketplace Light
6,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_48.svg,Arch_AWS-Marketplace_Light_48,48,Arch AWS Marketplace Light
7,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_48.svg,Arch_AWS-Marketplace_Dark_48,48,Arch AWS Marketplace Dark
8,service,Architecture-Service-Icons_02072025/Arch_Quant...,Arch_Amazon-Braket_64.svg,Arch_Amazon-Braket_64,64,Arch Amazon Braket
9,service,Architecture-Service-Icons_02072025/Arch_Quant...,Arch_Amazon-Braket_16.svg,Arch_Amazon-Braket_16,16,Arch Amazon Braket


In [9]:
df_icons["canonical_service_name"] = df_icons["candidate_name"].apply(resolve_service_to_canonical)
df_icons["coarse_class"] = df_icons["canonical_service_name"].apply(get_coarse_from_canonical)
df_icons["service_code_mapped"] = df_icons["canonical_service_name"].apply(get_service_code_from_canonical)
df_icons["matched"] = df_icons["canonical_service_name"].notna()

print("매핑 성공 개수:", df_icons["matched"].sum())
print("매핑 실패 개수:", (~df_icons["matched"]).sum())

df_icons.head(30)

매핑 성공 개수: 756
매핑 실패 개수: 1126


Unnamed: 0,icon_type,file_path,file_name,stem,dir_name,candidate_name,canonical_service_name,coarse_class,service_code_mapped,matched
0,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_64.svg,Arch_AWS-Marketplace_Dark_64,64,Arch AWS Marketplace Dark,,,,False
1,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_64.svg,Arch_AWS-Marketplace_Light_64,64,Arch AWS Marketplace Light,,,,False
2,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_16.svg,Arch_AWS-Marketplace_Dark_16,16,Arch AWS Marketplace Dark,,,,False
3,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_16.svg,Arch_AWS-Marketplace_Light_16,16,Arch AWS Marketplace Light,,,,False
4,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_32.svg,Arch_AWS-Marketplace_Dark_32,32,Arch AWS Marketplace Dark,,,,False
5,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_32.svg,Arch_AWS-Marketplace_Light_32,32,Arch AWS Marketplace Light,,,,False
6,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Light_48.svg,Arch_AWS-Marketplace_Light_48,48,Arch AWS Marketplace Light,,,,False
7,service,Architecture-Service-Icons_02072025/Arch_Gener...,Arch_AWS-Marketplace_Dark_48.svg,Arch_AWS-Marketplace_Dark_48,48,Arch AWS Marketplace Dark,,,,False
8,service,Architecture-Service-Icons_02072025/Arch_Quant...,Arch_Amazon-Braket_64.svg,Arch_Amazon-Braket_64,64,Arch Amazon Braket,amazon braket,Quantum,braket,True
9,service,Architecture-Service-Icons_02072025/Arch_Quant...,Arch_Amazon-Braket_16.svg,Arch_Amazon-Braket_16,16,Arch Amazon Braket,amazon braket,Quantum,braket,True


In [10]:
out_all = DATA_ROOT / "aws_icons_mapped_coarse20_fine.csv"
out_unmatched = DATA_ROOT / "aws_icons_unmatched.csv"

df_icons.to_csv(out_all, index=False)
df_icons[~df_icons["matched"]].to_csv(out_unmatched, index=False)

print("전체 아이콘 매핑 결과:", out_all)
print("매핑 실패 아이콘 목록 :", out_unmatched)

전체 아이콘 매핑 결과: data/aws_icons_mapped_coarse20_fine.csv
매핑 실패 아이콘 목록 : data/aws_icons_unmatched.csv


In [11]:
import pandas as pd
from pathlib import Path

DATA_ROOT = Path("data")
df = pd.read_csv(DATA_ROOT / "aws_icons_mapped_coarse20_fine.csv")
df_unmatched = pd.read_csv(DATA_ROOT / "aws_icons_unmatched.csv")

print(df.shape, df_unmatched.shape)
df["matched"].value_counts()
df["coarse_class"].value_counts(dropna=False).head(30)
df["canonical_service_name"].value_counts(dropna=False).head(30)


(1882, 10) (1126, 10)


canonical_service_name
NaN                          1126
amazon s3                      25
amazon vpc                     23
amazon ec2                     23
amazon fsx                     22
amazon sagemaker               22
aws backup                     21
aws systems manager            20
amazon aurora                  18
amazon braket                  17
aws iot greengrass             15
amazon connect                 15
amazon cloudwatch              15
amazon dynamodb                14
amazon rds                     14
amazon eks                     13
amazon eventbridge             13
amazon redshift                12
aws glue                       12
amazon route 53                11
amazon opensearch service      11
aws waf                        11
aws storage gateway            10
aws trusted advisor             9
aws snowball                    9
amazon efs                      9
aws app mesh                    9
aws iot sitewise                9
aws iot analytics        

In [12]:
fine_counts = df[df["matched"]].groupby("canonical_service_name")["file_path"].count().sort_values(ascending=False)
usable_fine = fine_counts[fine_counts >= 5].index.tolist()  # 예: 아이콘 5개 이상만 사용
len(usable_fine), usable_fine[:20]

(64,
 ['amazon s3',
  'amazon ec2',
  'amazon vpc',
  'amazon sagemaker',
  'amazon fsx',
  'aws backup',
  'aws systems manager',
  'amazon aurora',
  'amazon braket',
  'aws iot greengrass',
  'amazon connect',
  'amazon cloudwatch',
  'amazon dynamodb',
  'amazon rds',
  'amazon eventbridge',
  'amazon eks',
  'amazon redshift',
  'aws glue',
  'aws waf',
  'amazon route 53'])

In [13]:
mask_exclude = df_unmatched["icon_type"].isin(["category", "group"])
df_unmatched_filtered = df_unmatched[~mask_exclude].copy()
df_unmatched_filtered.shape

(1011, 10)

In [14]:
from pathlib import Path
from PIL import Image
import cairosvg  # svg -> png 변환용 (설치 필요)

OUT_IMG_ROOT = Path("dataset/icons")

def export_icon(row, label_type="coarse"):
    src = ASSET_ROOT / row["file_path"]
    rel_dir = f"{label_type}/{row[label_type + '_class']}"
    dst_dir = OUT_IMG_ROOT / "images" / rel_dir
    dst_dir.mkdir(parents=True, exist_ok=True)

    stem = row["file_name"].rsplit(".", 1)[0]
    dst_path = dst_dir / f"{stem}.png"

    if src.suffix.lower() == ".svg":
        cairosvg.svg2png(url=str(src), write_to=str(dst_path), output_width=256, output_height=256)
    else:
        img = Image.open(src).convert("RGBA")
        img = img.resize((256, 256))
        img.save(dst_path)

    return dst_path

# 예: coarse용 데이터셋 이미지 생성
for _, r in df_train_coarse.iterrows():
    export_icon(r, label_type="coarse")

NameError: name 'df_train_coarse' is not defined