#Crawling KAAC symbols list

Crawling Korean AAC symbols from the following webpage : http://symbol.ksaac.or.kr/searchsymbols/introduction.jsp


##0. Setup

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1. Crawl category list

In [16]:
url = 'http://symbol.ksaac.or.kr/searchsymbols'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    buttons = soup.find_all('button', class_='w3-large w3-left-align category-l1-name')
    categories = []
    # Iterate through each button and extract the category name and id
    for button in buttons:
        content = button.text.strip()
        button_id = button.get('id')
        categories.append([content, button_id])

    category_df = pd.DataFrame(categories, columns=['Category', 'ID'])
    print(category_df)

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

                 Category    ID
0                   기본기능키    c2
1                      음식    c3
2                      학교    c4
3                       집    c5
4                  놀이, 여가    c6
5                      건강    c7
6                      교통    c8
7                    지역사회    c9
8                     스케줄   c10
9                      종교   c11
10                    스포츠   c12
11                      색   c13
12                 동물, 식물   c14
13                     인물   c15
14                     사람   c16
15                     국가   c17
16                    미분류   c18
17             결혼, 출산, 육아   c19
18                   대학생활   c20
19           복지, 지원, 주민센터   c21
20                 장애피해상황   c22
21                     직업   c23
22  휠체어, 휴대폰, 자동차정비, AS센터   c24
23                위톡 그림상징  c156
24                 커뮤니 상징  c157


##2. Crawl symbols

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Set to store the crawled symbols (no duplicates)
symbols = set()

for id in category_df['ID']:
  category = id[1:]  # remove 'c' in front of ids
  url = f'http://symbol.ksaac.or.kr/searchsymbols/searchbycategory?cago={category}'
  response = requests.get(url)

  if response.status_code == 200:
      soup = BeautifulSoup(response.content, 'html.parser')

      # Each symbol is stored with the tag below
      expressions = soup.find_all('span', class_='s-expression')

      # Iterate through each <span> and extract the text content
      for expression in expressions:
          content = expression.text.strip()
          symbols.add(content)

  else:
      print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

symbol_df = pd.DataFrame(list(symbols), columns=['Symbol'])
symbol_df.to_csv('symbol_df.csv', index=False)
print(symbol_df)

         Symbol
0            73
1            실수
2         따끔거려요
3     교통사고가-났어요
4        정리해주세요
...         ...
7265        동식물
7266         복어
7267       평면도형
7268       페브리즈
7269    무슨과목좋아해

[7270 rows x 1 columns]


Without removing duplicates, there are 10359 symbols in total.