In [None]:
# Build the list of product IDs that are known to be retrievable
import pandas as pd

# Load the page catalogue; the columns used below are `source` and `filename`.
df = pd.read_csv('/home/ubuntu/data_value/KDX+국가교통데이터오픈마켓+해양수산_20250806.csv')

# Keep only the bigdata-sea rows and strip 'page_' and '.html' from each file name.
# regex=False makes the '.' in '.html' a literal dot on every pandas version
# (before pandas 2.0 the default was regex=True, where '.' matches any character).
ids = (
    df.loc[df['source'] == 'bigdata-sea', 'filename']
    .str.replace('page_', '', regex=False)
    .str.replace('.html', '', regex=False)
    .to_list()
)

ids

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import traceback
from multiprocessing import Pool, current_process

# Re-download the detail page of every known product ID into base_dir.
base_dir = '/home/ubuntu/data_value/bigdata-sea-html'
os.makedirs(base_dir, exist_ok=True)

def crawl_single(i):
    """Fetch the detail page of product ``PROD_<i>`` and save its rendered HTML.

    A headless Chrome instance is started for this single page. The page source
    is written to ``<base_dir>/<i>.html`` (UTF-8) once an element whose class
    contains ``tui-grid-cell-content`` is present, waiting at most 20 seconds.

    Args:
        i: Product ID suffix that is appended to ``PROD_`` in the URL.

    Returns:
        None. Any failure (browser start-up, navigation, timeout, file write)
        is printed together with its traceback instead of being raised, so one
        bad ID does not abort the surrounding ``Pool.map`` call.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-infobars')

    # Created inside the try block so that a failed browser start-up is reported
    # like every other per-ID error rather than propagating out of the worker.
    driver = None
    try:
        driver = webdriver.Chrome(options=options)

        url = f"https://www.bigdata-sea.kr/datasearch/base/view.do?prodId=PROD_{i}"
        driver.get(url)

        # Wait until sample data has been rendered
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[contains(@class, "tui-grid-cell-content")]'))
        )

        html_source = driver.page_source

        file_path = os.path.join(base_dir, f'{i}.html')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_source)

        print(f"[{current_process().name}] Finished id {i}")

    except Exception:
        print(f"[{current_process().name}] Error on id {i}")
        traceback.print_exc()
    finally:
        # driver stays None when Chrome itself could not be started.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    # Number of worker processes; every task starts its own headless Chrome.
    num_processes = 12

    with Pool(num_processes) as pool:
        # chunksize=1 dispatches IDs one at a time, so a page that runs into the
        # 20 s wait timeout does not hold up a pre-assigned batch of other IDs.
        pool.map(crawl_single, ids, chunksize=1)


In [None]:
# Product IDs whose pages have no sample data and therefore cannot be measured.
error_ids = [
    '001345', '001286', '001344', '001343',
    '001347', '001348', '001340', '001339',
    '001341', '001342', '001346',
]

In [None]:
import os
from bs4 import BeautifulSoup

html_dir = '/home/ubuntu/data_value/bigdata-sea-html'
results = []


def _read_saved_page(path):
    """Return the text of a saved page, trying UTF-8 first and EUC-KR second."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='euc-kr') as f:
            return f.read()


def _find_table_value(soup, label):
    """Return the value text of the first info-table row whose header contains ``label``.

    Returns None when no row matches or the matching row has no value cell.
    """
    for row in soup.select('.inner-table .inner-tr'):
        th = row.find('div', class_='th')
        if th and label in th.get_text(strip=True):
            td = row.find('div', class_='td')
            return td.get_text(strip=True) if td else None
    return None


for prod_id in ids:
    # `ids` holds bare product IDs (the '.html' suffix was stripped when the
    # list was built), while the crawler saved each page as '<id>.html'.
    # Filtering the IDs on '.html' therefore skipped every single entry.
    filename = prod_id if prod_id.endswith('.html') else f'{prod_id}.html'
    file_path = os.path.join(html_dir, filename)

    # The crawler writes no file for an ID whose download failed.
    if not os.path.isfile(file_path):
        print(f'Skipping {filename}: file not found in {html_dir}')
        continue

    soup = BeautifulSoup(_read_saved_page(file_path), 'html.parser')

    # Category badge
    category_tag = soup.select_one('.warp-left .cate .badge')
    category = category_tag.get_text(strip=True) if category_tag else None

    # File type (extension) and provider (company) come from the same info table
    extension = _find_table_value(soup, '파일타입')
    company = _find_table_value(soup, '제공자')

    item = {
        '파일명': filename,
        '카테고리': category,
        '확장자': extension,
        '회사명': company
    }

    results.append(item)

    # Catalogue rows are keyed as 'page_<id>.html'.
    row_mask = df['filename'] == f'page_{filename}'
    df.loc[row_mask, 'category'] = category
    df.loc[row_mask, 'extension'] = extension
    df.loc[row_mask, 'company'] = company

    print(item)


In [None]:
import os
from bs4 import BeautifulSoup

# Directory with the saved detail pages (path was updated).
html_dir = '/home/ubuntu/data_value/bigdata-sea-html'
all_tags = []

for filename in os.listdir(html_dir):
    # Anything that is not a saved HTML page is ignored.
    if not filename.endswith('.html'):
        continue

    filepath = os.path.join(html_dir, filename)
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            html = f.read()
    except UnicodeDecodeError:
        # Second attempt with the Korean Windows code page.
        with open(filepath, 'r', encoding='cp949') as f:
            html = f.read()

    soup = BeautifulSoup(html, 'html.parser')
    hashtag_div = soup.find('div', class_='hashtag')

    tags = []
    # Pages without a hashtag container leave the catalogue untouched.
    if not hashtag_div:
        continue

    # Keep only anchors whose text starts with '#', stripped of '#', spaces and newlines.
    for a in hashtag_div.find_all('a'):
        anchor_text = a.text
        if anchor_text.startswith('#'):
            tags.append(anchor_text.strip('# \n'))

    # The tag list is stored as its string representation.
    df.loc[df['filename'] == f'page_{filename}', 'tags'] = str(tags)

    print(tags)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import traceback
from multiprocessing import Pool, current_process

# Output directory for this crawl run.
base_dir = '/home/ubuntu/data_value/bigdata-sea-html-2'
os.makedirs(base_dir, exist_ok=True)

def crawl_single(i):
    """Fetch the detail page of product ``PROD_<i>`` and save its rendered HTML.

    A headless Chrome instance is started for this single page. The page source
    is written to ``<base_dir>/<i>.html`` (UTF-8) once an element whose class
    contains ``tui-grid-cell-content`` is present, waiting at most 20 seconds.

    Args:
        i: Product ID suffix that is appended to ``PROD_`` in the URL.

    Returns:
        None. Any failure (browser start-up, navigation, timeout, file write)
        is printed together with its traceback instead of being raised, so one
        bad ID does not abort the surrounding ``Pool.map`` call.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-infobars')

    # Created inside the try block so that a failed browser start-up is reported
    # like every other per-ID error rather than propagating out of the worker.
    driver = None
    try:
        driver = webdriver.Chrome(options=options)

        url = f"https://www.bigdata-sea.kr/datasearch/base/view.do?prodId=PROD_{i}"
        driver.get(url)

        # Block until the sample-data grid has at least one rendered cell.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, '//*[contains(@class, "tui-grid-cell-content")]'))
        )

        html_source = driver.page_source

        file_path = os.path.join(base_dir, f'{i}.html')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_source)

        print(f"[{current_process().name}] Finished id {i}")

    except Exception:
        print(f"[{current_process().name}] Error on id {i}")
        traceback.print_exc()
    finally:
        # driver stays None when Chrome itself could not be started.
        if driver is not None:
            driver.quit()

if __name__ == "__main__":
    # Adjust the number of processes to the CPU core count or any desired value.
    num_processes = 12

    with Pool(num_processes) as pool:
        # chunksize=1 dispatches IDs one at a time, so a page that runs into the
        # 20 s wait timeout does not hold up a pre-assigned batch of other IDs.
        pool.map(crawl_single, error_ids, chunksize=1)


In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Every column is read as a string.
df = pd.read_csv('/home/ubuntu/data_value/KDX+국가교통데이터오픈마켓+해양수산_20250806_2.csv', dtype=str)

# -1: Unknown
df['data_count'] = -1

base_dir = '/home/ubuntu/data_value/bigdata-sea-html-2'

for root, dirs, files in os.walk(base_dir):
    for filename in files:
        # Only saved pages are parsed; other files in the tree are ignored.
        if not filename.endswith('.html'):
            continue

        file_path = os.path.join(root, filename)

        # The crawler writes these pages as UTF-8. Without an explicit encoding
        # the locale default is used, which can garble the Korean label below.
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'html.parser')

        # Locate the header cell whose text is exactly '파일용량(건수)'.
        label_div = soup.find('div', class_='th', string='파일용량(건수)')
        if not label_div:
            continue

        # The value is in the next sibling cell with class 'td'.
        data_div = label_div.find_next_sibling('div', class_='td')
        if not data_div:
            continue

        row_count_text = data_div.get_text(strip=True).replace('건', '').replace(',','').replace(' ', '')
        try:
            row_count = int(row_count_text)
        except ValueError:
            # Leave the -1 marker in place instead of aborting the whole run.
            print(f"Could not parse row count {row_count_text!r} in {filename}")
            continue

        print(f"Found row count: {row_count_text}")
        df.loc[df['filename'] == f'page_{filename}', 'data_count'] = row_count

df.to_csv('/home/ubuntu/data_value/KDX+국가교통데이터오픈마켓+해양수산_20250806_3.csv', index=False)

In [None]:
import os
from bs4 import BeautifulSoup
from collections import defaultdict

# Every column is read as a string, so data_count has to be converted before use.
df = pd.read_csv('/home/ubuntu/data_value/KDX+국가교통데이터오픈마켓+해양수산_20250806_3.csv', dtype=str)

base_dir = '/home/ubuntu/data_value/bigdata-sea-html-2'

for root, dirs, files in os.walk(base_dir):
    for filename in files:
        # Only saved pages are parsed; other files in the tree are ignored.
        if not filename.endswith('.html'):
            continue

        file_path = os.path.join(root, filename)

        # The crawler writes these pages as UTF-8; do not rely on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'html.parser')

        # 1. Find all <td> elements with data-row-key attribute
        td_elements = soup.find_all('td', attrs={'data-row-key': True})

        # 2. Group text contents by data-row-key
        rows = defaultdict(list)

        for td in td_elements:
            row_key = td['data-row-key']
            div = td.find('div', class_='tui-grid-cell-content')
            if div and div.text:
                text = div.text.strip()
                rows[row_key].append(text)

        # 3. For each row, sum the length of all column texts
        row_lengths = []
        for key, texts in rows.items():
            total_len = sum(len(t) for t in texts)
            row_lengths.append(total_len)
            print(f'Row {key} total length: {total_len}')

        # 4. Calculate average length per row (0 when the page has no sample rows)
        average_length = sum(row_lengths) / len(row_lengths) if row_lengths else 0

        # int() on a Series raises TypeError unless exactly one row matches, so
        # the matching rows are converted explicitly and the first value is used.
        row_mask = df['filename'] == f'page_{filename}'
        counts = pd.to_numeric(df.loc[row_mask, 'data_count'], errors='coerce').dropna()
        if counts.empty:
            print(f'No usable data_count for {filename}; skipping')
            continue
        data_count = int(counts.iloc[0])

        # Negative values are the "unknown" marker and produce no size estimate.
        if data_count >= 0:
            data_size = round(average_length * data_count)
            df.loc[row_mask, 'data_size'] = data_size
            print(f'Average total text length per row: {average_length} / size: {data_size}')

df.to_csv('/home/ubuntu/data_value/KDX+국가교통데이터오픈마켓+해양수산_20250806_4.csv', index=False)