In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

import datetime
import time
import csv
import re

In [2]:
# Start a Chrome session; later cells drive this same `driver` instance.
driver = webdriver.Chrome()

# Fixed viewport of 1400 x 1000 pixels for the whole session.
driver.set_window_size(width=1400, height=1000)

# Open the site's landing page.
driver.get(url='http://www.okfactory.com/')

### 해결방안
- ID, CLASS, XPATH로 검색해도 요소를 찾지 못하고 있었음
- parser(BeautifulSoup)를 통해서 html 태그들이 잘 있는지 확인
- 확인 결과, 찾으려는 태그가 최상위 문서에는 바로 존재하지 않음. iframe으로 추정
- parser 결과를 분석하니 frameset / frame 태그를 확인 -> 특정 프레임으로 이동해야 함(driver.switch_to.frame)
- frame의 식별자(name)를 분석하니 main으로 나옴. 따라서, frame을 main으로 전환한 후 요소 찾기

In [3]:
import requests
from bs4 import BeautifulSoup

# Fetch the landing page over plain HTTP (no browser) so the raw markup can be
# inspected independently of Selenium.
url = 'http://www.okfactory.com/'
# requests has no default timeout; without one this cell would block forever
# if the server accepted the connection but never answered.
response = requests.get(url, timeout=10)
response.raise_for_status()  # raise if the server returned an HTTP error status

# Parse the HTML with the standard-library parser backend
soup = BeautifulSoup(response.text, 'html.parser')

# Dump the parsed document to check which tags the top-level page contains
print(soup)

<html>
<head>
<script async="" crossorigin="anonymous" src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-9104854776427493"></script>
<title>『공장통닷컴』공장부동산포털</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="./tong.ico" rel="shortcut icon"/>
<meta content="no-cache" http-equiv="Cache-Control">
<meta content="-1" http-equiv="Expires"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="aa739c736ca399c9b469521014705d66b55763f8" name="naver-site-verification"/>
<meta content="『공장통닷컴』공장부동산포털" name="description"/>
<meta content="공장매매,공장임대,시화공단,반월공단,지식산업센터,공장통" name="keywords"/>
<link href="http://okfactory.com/data/designImages/" rel="SHORTCUT ICON"/>
<script async="" data-ad-client="ca-pub-9104854776427493" src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
</meta></head>
<script language="JavaScript">
<!--
window.status='';
//-->
</script>
<frameset border="0" frameborder="NO" frames

### 조회버튼 클릭하기

In [4]:
# Return to the top-level document first so this cell can be re-run: once the
# driver is inside the "main" frame, that frame can no longer be located from
# within itself.
driver.switch_to.default_content()

wait = WebDriverWait(driver, 5)

# Wait until the frame named "main" is available and switch into it in one
# step, instead of looking it up immediately and racing the page load.
wait.until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "main")))

# Wait until the search button is actually clickable (present, visible and
# enabled) rather than merely present in the DOM, then click it.
input_button = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html/body/table/tbody/tr[1]/td[1]/table[2]/tbody/tr/td[3]/table/tbody/tr[1]/td/table/tbody/tr/td/table/tbody/tr/td[2]/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[6]/td[4]/table/tbody/tr/td[2]/input'))
)
input_button.click()

In [5]:
# Helpers and entry point for extracting listing data from the current page
def _split_address_and_type(text):
    """Split a location cell into ``(address, type_label)``.

    Only the first newline separates the two parts, so a cell containing more
    than one newline no longer fails on unpacking. Without any newline the
    type label is the empty string. Square brackets in the label are rewritten:
    ``[`` becomes ``-`` and ``]`` is removed.
    """
    address, _, type_info = text.partition('\n')
    return address, type_info.replace('[', '-').replace(']', '')


def _status_from_icon(img_src):
    """Map a status icon URL to the status label stored in the CSV.

    "icon_go.gif" -> "진행" (in progress), "icon_stop.gif" -> "완료" (completed),
    anything else (including an empty string) -> "중단" (stopped).
    """
    if "icon_go.gif" in img_src:
        return "진행"
    if "icon_stop.gif" in img_src:
        return "완료"
    return "중단"


def extract_data():
    """Collect the listing columns from the page currently loaded in ``driver``.

    The elements with class ``estate_M`` are walked in document order and
    treated as groups of eight; within each group the position decides the
    column: 0 = number, 2 = location (+ optional type on a second line),
    3 = title / link / status icon, 5 = price, 7 = land area. Positions 1, 4
    and 6 are ignored. Three more columns are collected with separate
    page-wide lookups ("방식" cells, padded reference cells, view counts).

    Returns:
        A ``zip`` iterator of 13-tuples in this order: number, address part 1,
        address part 2, address part 3, type, title, status, price, land,
        way, reference, view count, url. Because the columns are gathered
        independently, ``zip`` stops at the shortest one; rows are only
        aligned if every lookup yields one entry per listing.
    """
    cells_per_row = 8
    # Number of leading matches of the reference XPath that are skipped;
    # presumably layout cells that precede the listing table -- TODO confirm.
    reference_cells_to_skip = 24

    checks = driver.find_elements(By.CLASS_NAME, 'estate_M')

    numbers = []
    addresses_1 = []
    addresses_2 = []
    addresses_3 = []
    types = []
    titles = []
    prices = []
    lands = []
    urls = []
    statuses = []

    for index, cell in enumerate(checks):
        text = cell.text.strip()
        column = index % cells_per_row

        if column == 0:
            numbers.append(text)
        elif column == 2:
            address, type_info = _split_address_and_type(text)
            types.append(type_info)
            # The address is split on whitespace into up to three parts;
            # missing parts become empty strings.
            address_parts = address.split()
            addresses_1.append(address_parts[0] if len(address_parts) > 0 else '')
            addresses_2.append(address_parts[1] if len(address_parts) > 1 else '')
            addresses_3.append(address_parts[2].replace('..', '') if len(address_parts) > 2 else '')
        elif column == 3:
            # Drop any "[...]" segments from the title text.
            titles.append(re.sub(r'\[.*?\]', '', text).strip())

            # find_elements returns an empty list when nothing matches, so a
            # missing link or icon is handled explicitly instead of through a
            # bare `except:` that would also hide unrelated errors.
            anchors = cell.find_elements(By.TAG_NAME, 'a')
            urls.append((anchors[0].get_attribute('href') or '') if anchors else '')

            images = cell.find_elements(By.TAG_NAME, 'img')
            img_src = (images[0].get_attribute('src') or '') if images else ''
            statuses.append(_status_from_icon(img_src))
        elif column == 5:
            # Remove the "매매가격:" (sale price) and "월세가격:" (monthly rent) labels.
            cleaned_price = re.sub(r'(매매가격\s*:\s*|월세가격\s*:\s*)', '', text).strip()
            prices.append(cleaned_price)
        elif column == 7:
            lands.append(text)

    # "방식" column: join the alt texts of every image inside each matching cell.
    # A missing alt attribute contributes an empty string instead of breaking join().
    ways = [
        ", ".join(img.get_attribute('alt') or '' for img in td.find_elements(By.TAG_NAME, 'img'))
        for td in driver.find_elements(By.XPATH, '//td[@name="방식"]')
    ]

    # "참고" column: text of the padded cells with hyphens removed.
    td_elements_references = driver.find_elements(By.XPATH, '//td[contains(@style, "padding-top:5px;")]')
    references = [
        td.text.replace('-', '').strip()
        for td in td_elements_references[reference_cells_to_skip:]
    ]

    # "조회수" column: raw text of each view-count element.
    review_texts = [review.text for review in driver.find_elements(By.CLASS_NAME, 'estat_numberM')]

    return zip(numbers, addresses_1, addresses_2, addresses_3, types, titles, statuses, prices, lands, ways, references, review_texts, urls)


In [6]:
# Crawl the result pages one after another and stream every row into tong.csv.

def _js_click_then_pause(element):
    """Blur and click ``element`` through JavaScript, then pause for the page load."""
    driver.execute_script("arguments[0].blur(); arguments[0].click();", element)
    time.sleep(2)  # fixed pause; adjust as needed


def _advance_from(page):
    """Move from ``page`` to the following page; return False when that fails.

    The numbered pager link is tried first. If anything goes wrong with it,
    a notice is printed and the "next" arrow image link is tried instead.
    """
    try:
        _js_click_then_pause(driver.find_element(By.LINK_TEXT, f"[{page + 1}]"))
        return True
    except Exception:
        print(f"Page {page + 1} not found. Trying to click the next button.")

    try:
        _js_click_then_pause(driver.find_element(By.XPATH, '//img[@src="./images/next.gif"]/parent::a'))
        return True
    except Exception as e:
        print(f"Failed to go to the next page: {e}")
        return False


# UTF-8 output file; the header row is written before any listing rows.
with open('tong.csv', 'w', newline='', encoding='utf-8', errors='ignore') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['순번', '매물위치(1)', '매물위치(2)', '매물위치(3)', '종류', '제목', '상태', '가격', '면적', '방식', '참고', '조회수', '매물위치(url)'])

    for page in range(1, 450):  # upper bound on the number of pages visited
        # Write every row scraped from the page currently shown.
        csv_writer.writerows(extract_data())

        # Stop crawling as soon as neither navigation route works.
        if not _advance_from(page):
            break

print('크롤링 완료')

Page 11 not found. Trying to click the next button.
Page 21 not found. Trying to click the next button.
Page 31 not found. Trying to click the next button.
Page 41 not found. Trying to click the next button.
Page 51 not found. Trying to click the next button.
Page 61 not found. Trying to click the next button.
Page 71 not found. Trying to click the next button.
Page 81 not found. Trying to click the next button.
Page 91 not found. Trying to click the next button.
Page 101 not found. Trying to click the next button.
Page 111 not found. Trying to click the next button.
Page 121 not found. Trying to click the next button.
Page 131 not found. Trying to click the next button.
Page 141 not found. Trying to click the next button.
Page 151 not found. Trying to click the next button.
Page 161 not found. Trying to click the next button.
Page 171 not found. Trying to click the next button.
Page 181 not found. Trying to click the next button.
Page 191 not found. Trying to click the next button.
Pa