# Web scraping

In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import re

def click_page_number(driver, page_num):
    """
    Switch the list view to a specific page through the site's pagination.

    Args:
        driver: Selenium WebDriver currently showing the list page.
        page_num: Page number to switch to.

    Returns:
        True if a pagination link for ``page_num`` exists and the page-change
        script was triggered; False if the pagination bar has no such link
        (callers treat this as having reached the last page).
    """
    try:
        # Existence check only: the element itself is not clicked, so there is
        # no need to keep a reference to it.
        driver.find_element(
            By.XPATH,
            f"//div[@class='pagination']//a[contains(@onclick, 'fn_egov_link_page({page_num})')]"
        )
    except NoSuchElementException:
        return False

    # Call the page's own pagination function instead of clicking the anchor.
    driver.execute_script(f"fn_egov_link_page({page_num}); return false;")
    time.sleep(1)  # Give the new page time to load
    return True

# Entry URL of the rental-space list.
_LIST_URL = "https://youth.seoul.go.kr/orang/rent/list.do?key=2309210006"

# Matches the goView('<digits>') call in a list anchor's onclick attribute.
_GO_VIEW_RE = re.compile(r"goView\('(\d+)'\)")

# Detail-page labels to capture; each is matched as a substring of the <strong>
# text and reused verbatim as the key in the result dict.
_DETAIL_LABELS = ('공간유형', '수용인원', '예약가능시간', '예약인원', '대관료')


def _collect_space_ids(driver):
    """Return the space IDs referenced by the goView(...) links on the current list page."""
    space_ids = []
    for link in driver.find_elements(By.CSS_SELECTOR, "div.gallery-list-st1 .list a"):
        # get_attribute returns None when the anchor has no onclick handler.
        match = _GO_VIEW_RE.search(link.get_attribute('onclick') or "")
        if match:
            space_ids.append(match.group(1))
    return space_ids


def _attribute_or_none(driver, css_selector, attribute):
    """Return ``attribute`` of the first element matching ``css_selector``, or None if absent."""
    try:
        return driver.find_element(By.CSS_SELECTOR, css_selector).get_attribute(attribute)
    except NoSuchElementException:
        return None


def _extract_space_detail(driver):
    """Build the info dict for the space detail page the driver is currently showing."""
    info = {}

    try:
        info['이름'] = driver.find_element(By.CSS_SELECTOR, "div.text strong.ti").text.strip()
    except NoSuchElementException:
        info['이름'] = None

    for item in driver.find_elements(By.CSS_SELECTOR, "ul.info li"):
        try:
            label = item.find_element(By.TAG_NAME, "strong").text
            value = item.find_element(By.TAG_NAME, "em").text
        except NoSuchElementException:
            # Row without a label/value pair: nothing to record.
            continue
        for key in _DETAIL_LABELS:
            if key in label:
                info[key] = value.strip()

    info['이미지URL'] = _attribute_or_none(driver, "div.img img", 'src')
    info['홈페이지'] = _attribute_or_none(driver, "a.btn.btn-bace2.bg-main-color.fc-500", 'href')
    return info


def _return_to_list_page(driver, page_num):
    """Reload the list from its entry URL and step forward to ``page_num``."""
    driver.get(_LIST_URL)
    time.sleep(1)
    # Step through the pages one by one, starting from page 2.
    for page in range(2, page_num + 1):
        click_page_number(driver, page)


def scrape_space_info():
    """
    Scrape every rental space listed on the site, page by page.

    For each list page, the space IDs are read from the links first and each
    detail page is then opened by ID. Holding on to the link elements instead
    would be fragile, because the browser leaves and re-enters the list page
    for every space and element references may not survive that.

    Progress and errors are reported with ``print``.

    Returns:
        A list with one dict per space. Every dict has the keys '이름',
        '이미지URL' and '홈페이지' (value None when the element is missing),
        plus whichever of '공간유형', '수용인원', '예약가능시간', '예약인원',
        '대관료' were present on the detail page. An empty list is returned
        if an unrecoverable error aborts the run.
    """
    # Set up the Chrome WebDriver
    driver = webdriver.Chrome()
    driver.implicitly_wait(10)  # Implicit wait for elements to appear

    try:
        driver.get(_LIST_URL)
        time.sleep(1)  # Allow time for the page to load

        spaces_info = []
        current_page = 1

        while True:
            print(f"Current Page: {current_page}")

            for space_id in _collect_space_ids(driver):
                try:
                    # Open the detail page through the site's own JS function.
                    driver.execute_script(f"goView('{space_id}')")
                    time.sleep(1)  # Wait for page transition

                    info = _extract_space_detail(driver)
                    spaces_info.append(info)
                    print(f"Collected Info: {info.get('이름') or '이름 없음'}")

                    # Return to the list so the next goView call is available.
                    driver.back()
                    time.sleep(1)
                except Exception as e:
                    # Best effort: report, restore the list page, and carry on
                    # with the next space.
                    print(f"Error while collecting space info: {str(e)}")
                    _return_to_list_page(driver, current_page)

            # Move to the next page; a missing pagination link means we are done.
            current_page += 1
            if not click_page_number(driver, current_page):
                print("Reached the last page")
                break

        return spaces_info

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        return []

    finally:
        driver.quit()  # Close the browser


In [32]:
# Run the scraper and print everything it collected
if __name__ == "__main__":
    results = scrape_space_info()

    # Print each record under a 1-based heading
    print("\n=== 수집된 전체 결과 ===")
    for idx, space in enumerate(results, start=1):
        print(f"\n공간 {idx} 정보:")
        for key in space:
            value = space[key]
            print(f"{key}: {value}")

    print(f"\n총 {len(results)}개의 공간 정보가 수집되었습니다.")
    


Current Page: 1
Collected Info: 강서 ( [서울청년센터 강서] 오픈스페이스 )
Collected Info: 강서 ( [서울청년센터 강서] 스튜디오 )
Collected Info: 강서 ( [서울청년센터 강서] 회의실1 )
Collected Info: 강서 ( [서울청년센터 강서] 회의실3 )
Collected Info: 강서 ( [서울청년센터 강서] 회의실2 )
Collected Info: 강북 ( B1 스튜디오실 )
Collected Info: 도봉 ( 취업 스튜디오(2층) )
Collected Info: 동대문 ( 세미나실 )
Collected Info: 양천 ( 회의실A )
Collected Info: 양천 ( 다목적실B )
Collected Info: 양천 ( 다목적실A )
Collected Info: 성동 ( 서울청년센터 성동 4층 공유공간 )
Current Page: 2
Collected Info: 강동 ( 소강의실 )
Collected Info: 금천 ( 멀티미디어실 )
Collected Info: 금천 ( 회의실 )
Collected Info: 마포 ( 연결해랑 )
Collected Info: 마포 ( 이루어랑 )
Collected Info: 마포 ( 모여랑 )
Collected Info: 성동 ( 회의실 )
Collected Info: 노원 ( 배우다 )
Collected Info: 노원 ( 함께하다 )
Collected Info: 광진 ( 촬영소(所) )
Collected Info: 광진 ( 모여보소(所) )
Collected Info: 노원 ( 해보다 - 컴퓨터 이용 )
Current Page: 3
Collected Info: 노원 ( 해보다 - VR면접 )
Collected Info: 강북 ( 4F 회의실 )
Collected Info: 강북 ( B1 대교육실 )
Collected Info: 동대문 ( 미디어실(온라인면접실) )
Collected Info: 관악 ( 서재 )
Collected Info: 도봉 ( 회

# Save the scraped data to a .csv file

In [None]:
import csv

def save_places_to_csv(places, filename="space4youth.csv"):
    """
    Write scraped space records to a UTF-8 CSV file with a fixed column order.

    Args:
        places: Iterable of dicts keyed by the column names below. Keys that
            are missing from a record, and None values, are written as empty
            cells.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = [
        "이름",
        "공간유형",
        "수용인원",
        "예약가능시간",
        "예약인원",
        "대관료",
        "이미지URL",
        "홈페이지",
    ]

    # newline='' lets the csv module control the line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for place in places:
            writer.writerow(place)

    print(f"CSV file saved as {filename}")
    
    
# Write the scraped records to the default CSV file. `results` is assigned
# inside the `if __name__ == "__main__":` block, so that block must run first.
save_places_to_csv(results)




CSV file saved as space4youth.csv


In [34]:
import pandas as pd
# Load the CSV written by save_places_to_csv back into a DataFrame.
df_space_scraped= pd.read_csv("space4youth.csv")
# Copy the table to the system clipboard so it can be pasted elsewhere.
df_space_scraped.to_clipboard()

# Supabase