In [119]:
# Scraping target and name of the final output file.
#
# Country codes -- Korea: kr / United States: us ...
# For other country codes see https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2

# Country code.
country = 'us'
# App ID; it can be found on the app's App Store web page.
app_id = '1163786766'
# Free-form label for the app; any value is fine.
app_name = 'alarmy'
# Scraping date (it does not matter if it differs from the actual date).
scrap_date = '20230129'
# Last page to fetch; currently only up to 10 pages are supported.
last_page = 10
# Name of the CSV file the results are saved to.
file_name = 'scrap_apple_{app_name}_{country}_{scrap_date}.csv'.format(
    app_name=app_name,
    country=country,
    scrap_date=scrap_date,
)

In [113]:
# Define the scraping function

def _entry_text(entry, tag_name, default, **find_kwargs):
    """Return the string of the first ``tag_name`` tag in *entry*, or *default*.

    *default* is returned both when no matching tag exists and when the tag
    has no single string child (``Tag.string`` is ``None``).
    """
    tag = entry.find(tag_name, **find_kwargs)
    text = None if tag is None else tag.string
    return default if text is None else text


def scrapping_alarmy_apple(page_num, app_id, country):
    """Fetch one page of customer reviews from the iTunes RSS feed.

    Args:
        page_num: Page number inserted into the feed URL.
        app_id: App ID inserted into the feed URL.
        country: Country code inserted into the feed URL.

    Returns:
        A tuple of six parallel lists ``(title, content, rating, date,
        version, name)`` with one element per ``<entry>`` in the feed.
        Each element is a bs4 ``NavigableString`` (a ``str`` subclass);
        a field that is missing from an entry is represented by the
        string ``'None'``.

    Raises:
        urllib.error.URLError: If the feed cannot be fetched (propagated
            from ``urllib.request.urlopen``).
    """
    from bs4 import BeautifulSoup, NavigableString
    import urllib.request

    url = (
        f'https://itunes.apple.com/{country}/rss/customerreviews/page={page_num}'
        f'/id={app_id}/sortby=mostrecent/xml'
        f'?urlDesc=/customerreviews/id={app_id}/sortBy=mostRecent/xml'
    )

    # Parse inside the ``with`` block so the HTTP response is always closed,
    # even if parsing fails.
    with urllib.request.urlopen(url) as response:
        feed = BeautifulSoup(response, 'html.parser')

    # The placeholder is wrapped in NavigableString so that every returned
    # element has the same type as a real value (e.g. supports ``.string``)
    # while still comparing equal to the plain string 'None'.
    missing = NavigableString('None')

    title = []
    content = []
    rating = []
    date = []
    version = []
    name = []

    for entry in feed.find_all('entry'):
        title.append(_entry_text(entry, 'title', missing))
        content.append(_entry_text(entry, 'content', missing, attrs={'type': 'text'}))
        rating.append(_entry_text(entry, 'im:rating', missing))
        date.append(_entry_text(entry, 'updated', missing))
        version.append(_entry_text(entry, 'im:version', missing))
        name.append(_entry_text(entry, 'name', missing))

    return title, content, rating, date, version, name

In [114]:
# Run the scrape

# Per-page results: each ls_* list receives one list per fetched page.
ls_title = []
ls_content = []
ls_rating = []
ls_date = []
ls_version = []
ls_name = []

for i in range(1, last_page + 1):
    title, content, rating, date, version, name = scrapping_alarmy_apple(i, app_id, country)

    ls_title.append(title)
    ls_content.append(content)
    ls_rating.append(rating)
    ls_date.append(date)
    ls_version.append(version)
    ls_name.append(name)

# Flatten the per-page lists into one flat list per field.
# str(x) is used instead of x.string: an element may be a bs4 string, a plain
# str placeholder or None, and `.string` raises AttributeError on the latter
# two. Converting to plain str also stops the results from referencing the
# parsed documents.
title = [str(x) for comps in ls_title for x in comps]
content = [str(x) for comps in ls_content for x in comps]
rating = [str(x) for comps in ls_rating for x in comps]
date = [str(x) for comps in ls_date for x in comps]
version = [str(x) for comps in ls_version for x in comps]
name = [str(x) for comps in ls_name for x in comps]

In [117]:
# Save the results as a CSV file

import pandas as pd

# Build the frame with one labelled row per field, then transpose it so that
# each review becomes a row and the field labels become the column names.
df = pd.DataFrame(
    [title, content, rating, date, version, name],
    index=['title', 'content', 'rating', 'date', 'version', 'author'],
).transpose()
df.to_csv(file_name)