In [None]:
import requests
from bs4 import BeautifulSoup

# 1. 웹페이지 가져오기
url = 'https://abs.gov.au/census/find-census-data/quickstats/2021/1GSYD'
response = requests.get(url)

# 2. 파싱(Parsing)하기
soup = BeautifulSoup(response.text, 'html.parser')

# 3. 데이터 추출 (예: 모든 제목 h1 태그 가져오기)
titles = soup.find_all('div')
h2_list = []
for title in titles:
    h2_list.append(title.text) # print(title.text)

print(h2_list)



['ABS Main Menu', 'Breadcrumb', 'Search all persons QuickStats for another area', 'People and population', 'Education', 'Cultural diversity', 'Income and work', 'Unpaid work and care', 'Health', 'Australian Defence Force service', 'Families', 'Dwellings', 'Housing', 'Aboriginal and/or Torres Strait Islander peoples', 'Statistics', 'About', 'Social', 'Footer - Bottom']


In [7]:
target_section = None
for h2 in soup.find_all('h2'):
    if 'People and population' in h2.get_text():
        print(f'{h2} found')
        target_section = h2
        break

extracted_data = []

if target_section:
        # 3. 해당 h2 다음 형제 요소들을 순회하며 테이블 수집
        # 다음 h2(다른 섹션)가 나오기 전까지의 모든 'qstable' 클래스 테이블을 찾습니다.
        current_node = target_section.find_next_sibling()
        
        while current_node and current_node.name != 'h2':
            # 테이블이고 클래스명이 'qstable'인 경우만 처리
            if current_node.name == 'table' and 'qstable' in current_node.get('class', []):
                print("table found")
                rows = current_node.find_all('tr')
                
                for row in rows:
                    # th(항목명)와 td(수치 데이터) 추출
                    cols = row.find_all(['th', 'td'])
                    cols = [ele.get_text(separator=" ").strip() for ele in cols]
                    
                    # 빈 줄이 아닌 경우에만 추가
                    if cols:
                        extracted_data.append(cols)
            
            # 다음 요소로 이동
            current_node = current_node.find_next_sibling()

<h2 class="tw-text-xl" id="people-and-population">People and population</h2> found


In [4]:
import pandas as pd 
if extracted_data:
        df = pd.DataFrame(extracted_data)
        
        # CSV 파일로 저장 (한글이나 특수기호 깨짐 방지를 위해 utf-8-sig 사용)
        output_file = 'greater_sydney_people_data.csv'
        df.to_csv(output_file, index=False, header=False, encoding='utf-8-sig')
        print(f"성공적으로 데이터를 추출하여 '{output_file}'에 저장했습니다.")
else:
    print("데이터를 찾지 못했습니다. 태그 구조를 다시 확인해주세요.")

데이터를 찾지 못했습니다. 태그 구조를 다시 확인해주세요.


In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://abs.gov.au/census/find-census-data/quickstats/2021/1GSYD"
headers = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

all_data = []

# 1. 'qsTable' 클래스를 가진 모든 div를 찾습니다.
tables = soup.select('div.qsTable')

for table_div in tables:
    # 2. 각 div 안에 있는 실제 table 태그를 찾습니다.
    table = table_div.find('table')
    if not table:
        continue
    
    # 3. 모든 행(tr)을 순회합니다.
    rows = table.find_all('tr')
    for row in rows:
        # th(항목명)와 td(수치)를 모두 찾습니다.
        cells = row.find_all(['th', 'td'])
        
        # 텍스트만 추출하고, 내부의 <br>이나 공백을 정리합니다.
        row_data = [cell.get_text(separator=" ").strip() for cell in cells]
        
        if row_data:
            all_data.append(row_data)

# 4. 데이터프레임으로 변환
# 데이터의 첫 줄이 헤더(People, Greater Sydney...)이므로 이를 활용하거나, 
# 여러 테이블이 합쳐질 경우를 대비해 컬럼명을 수동으로 지정할 수 있습니다.
df = pd.DataFrame(all_data)

# 5. CSV 저장
df.to_csv('greater_sydney_census.csv', index=False, header=False, encoding='utf-8-sig')

print(f"총 {len(all_data)}행의 데이터를 수집하여 'greater_sydney_census.csv'로 저장했습니다.")

총 278행의 데이터를 수집하여 'greater_sydney_census.csv'로 저장했습니다.


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://abs.gov.au/census/find-census-data/quickstats/2021/1GSYD"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

final_data = []
current_category = "Unknown"

# 전체 본문 컨테이너 내의 요소들을 순차적으로 탐색합니다.
# 'main' 또는 특정 컨테이너를 타겟팅하면 더 정확합니다.
main_content = soup.find('main') or soup.find('body')

# 순차적 탐색을 위해 모든 자식 요소를 확인
for element in main_content.find_all(['div', 'h2']):
    
    # 1. Category 업데이트: h2 태그를 만나면 카테고리명을 변경
    if element.name == 'h2':
        current_category = element.get_text().strip()
        continue

    # 2. qsTable 클래스를 가진 div를 만났을 때
    if 'qsTable' in element.get('class', []):
        table = element.find('table')
        if not table:
            continue
            
        rows = table.find_all('tr')
        if not rows:
            continue
            
        # --- Feature(소분류) 추출 ---
        # 첫 번째 행(Header Row)의 첫 번째 th에서 Feature명을 가져옵니다.
        # 예: "People <br><span><em>All people</em></span>" -> "People"만 추출
        header_row = rows[0]
        first_th = header_row.find('th')
        
        if first_th:
            # <br>이나 <span> 이전의 순수 텍스트만 가져오기 위해 
            # 첫 번째 텍스트 노드만 추출하거나 구분자로 분리합니다.
            feature_name = first_th.get_text(separator="|").split('|')[0].strip()
        else:
            feature_name = "Unknown Feature"

        # --- 데이터 행 추출 ---
        # 두 번째 행부터 실제 데이터를 추출합니다.
        for row in rows[1:]:
            cells = row.find_all(['th', 'td'])
            row_values = [cell.get_text().strip() for cell in cells]
            
            if row_values:
                # [Category, Feature, Label(Male/Female 등), 나머지 수치들...] 순서로 결합
                full_row = [current_category, feature_name] + row_values
                final_data.append(full_row)

# 3. 데이터프레임 생성 및 컬럼명 설정
columns = [
    'Category', 'Feature', 'Label', 
    'Greater_Sydney_cnt', 'Greater_Sydney_ratio', 
    'NSW_cnt', 'NSW_ratio', 
    'Australia_cnt', 'Australia_ratio'
]

# 실제 추출된 데이터의 컬럼 수와 맞는지 확인 후 생성
df = pd.DataFrame(final_data)

# 컬럼 수 조절 (추출된 데이터 열 개수에 따라 columns 리스트를 유연하게 적용)
if not df.empty:
    df.columns = columns[:len(df.columns)]

# 4. CSV 저장
output_filename = 'abs_census_greater_sydney_structured.csv'
df.to_csv(output_filename, index=False, encoding='utf-8-sig')

print(f"저장이 완료되었습니다: {output_filename}")
print(df.head(10)) # 결과물 상위 10개 미리보기

저장이 완료되었습니다: abs_census_greater_sydney_structured.csv
                Category            Feature  \
0  People and population             People   
1  People and population             People   
2  People and population  Indigenous status   
3  People and population  Indigenous status   
4  People and population  Indigenous status   
5  People and population                Age   
6  People and population                Age   
7  People and population                Age   
8  People and population                Age   
9  People and population                Age   

                                      Label Greater_Sydney_cnt  \
0                                      Male          2,585,238   
1                                    Female          2,645,912   
2  Aboriginal and/or Torres Strait Islander             90,939   
3                            Non-Indigenous          4,916,487   
4              Indigenous status not stated            223,727   
5                               

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_abs_census(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 1. 지역명 추출 (예: Greater Sydney, Melbourne 등)
        # 페이지 상단의 h1 태그나 특정 요소를 통해 지역명을 가져옵니다.
        region_name = soup.find('h1').get_text().replace('2021 Census QuickStats:', '').strip()
        
        extracted_rows = []
        current_category = "Unknown"
        main_content = soup.find('main') or soup.find('body')

        for element in main_content.find_all(['div', 'h2']):
            if element.name == 'h2':
                current_category = element.get_text().strip()
                continue

            if 'qsTable' in element.get('class', []):
                table = element.find('table')
                if not table: continue
                
                rows = table.find_all('tr')
                if not rows: continue
                
                # Feature명 추출
                first_th = rows[0].find('th')
                feature_name = first_th.get_text(separator="|").split('|')[0].strip() if first_th else "Unknown"

                for row in rows[1:]:
                    cells = row.find_all(['th', 'td'])
                    row_values = [cell.get_text().strip() for cell in cells]
                    if row_values:
                        # [지역명, 카테고리, 피처, 라벨, 데이터...] 순으로 구성
                        extracted_rows.append([region_name, current_category, feature_name] + row_values)
        
        return extracted_rows
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return []

# --- 실행 부분 ---

# 2. 수집하고 싶은 지역 URL 리스트
urls = [
    "https://abs.gov.au/census/find-census-data/quickstats/2021/1GSYD", # Greater Sydney
    # Lower group
    "https://abs.gov.au/census/find-census-data/quickstats/2021/IQS116031313", #Bidwill
    "https://abs.gov.au/census/find-census-data/quickstats/2021/IQS116031316", #Lethbridge park
    "https://abs.gov.au/census/find-census-data/quickstats/2021/IQS124051581", #St Marys
    "https://abs.gov.au/census/find-census-data/quickstats/2021/IQS127011504", #Ashcroft
    # Higher group
    "https://abs.gov.au/census/find-census-data/quickstats/2021/115011296", #West Pennant Hills
    "https://abs.gov.au/census/find-census-data/quickstats/2021/118011346", #Rose Bay
    "https://abs.gov.au/census/find-census-data/quickstats/2021/121031408", #Lindfield
    "https://abs.gov.au/census/find-census-data/quickstats/2021/126021499", # Hunters Hill
    # Abnormal group
    "https://abs.gov.au/census/find-census-data/quickstats/2021/117031642", #Redfern 
    "https://abs.gov.au/census/find-census-data/quickstats/2021/117031645", #Haymarket 
    "https://abs.gov.au/census/find-census-data/quickstats/2021/117031646", #Ultimo 
    "https://abs.gov.au/census/find-census-data/quickstats/2021/117031647", #Waterloo 
    "https://abs.gov.au/census/find-census-data/quickstats/2021/120031678", #Burwood 

]

all_region_data = []

for url in urls:
    print(f"수집 중: {url}")
    data = scrape_abs_census(url)
    all_region_data.extend(data)
    # 서버 부하 방지를 위해 잠깐 대기 (매너 크롤링)
    time.sleep(1) 

# 3. 데이터프레임 생성
columns = [
    'Region', 'Category', 'Feature', 'Label', 
    'Area_cnt', 'Area_ratio', 
    'State_cnt', 'State_ratio', 
    'Country_cnt', 'Country_ratio'
]

df = pd.DataFrame(all_region_data)
if not df.empty:
    df.columns = columns[:len(df.columns)]
    # CSV 저장
    df.to_csv('combined_census_data.csv', index=False, encoding='utf-8-sig')
    print(f"\n총 {len(urls)}개 지역 데이터 수집 완료! 'combined_census_data.csv' 확인해보세요.")

수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/1GSYD
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS116031313
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS116031316
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS124051581
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS127011504
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS115011296
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS118011346
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS121031408
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS126021499
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS117031642
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS117031645
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS117031646
수집 중: https://abs.gov.au/census/find-census-data/quickstats/2021/IQS117

In [13]:
import pandas as pd

df = pd.read_csv('combined_census_data.csv')

df.head()

Unnamed: 0,Region,Category,Feature,Label,Area_cnt,Area_ratio,State_cnt,State_ratio,Country_cnt,Country_ratio
0,Greater Sydney,People and population,People,Male,2585238,49.4,3984166,49.4,12545154,49.3
1,Greater Sydney,People and population,People,Female,2645912,50.6,4087995,50.6,12877635,50.7
2,Greater Sydney,People and population,Indigenous status,Aboriginal and/or Torres Strait Islander,90939,1.7,278043,3.4,812728,3.2
3,Greater Sydney,People and population,Indigenous status,Non-Indigenous,4916487,94.0,7404499,91.7,23375949,91.9
4,Greater Sydney,People and population,Indigenous status,Indigenous status not stated,223727,4.3,389616,4.8,1234112,4.9


In [None]:
lower_suburb = [] 