In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import os

# === Settings ===
# Folder that holds both the source CSVs and the generated "*_with_info.csv".
FOLDER = "csv_data"
# Request headers; the User-Agent is a desktop Chrome string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

# === Find the most recently modified source CSV ===
# Names containing "_with_info" are skipped so that a previous output of this
# script is never picked up as the next input.
csv_files = [f for f in os.listdir(FOLDER) if f.endswith(".csv") and "_with_info" not in f]
if not csv_files:
    raise FileNotFoundError("csv_dataフォルダに元データのCSVが見つかりませんでした")

latest_csv = max(csv_files, key=lambda x: os.path.getmtime(os.path.join(FOLDER, x)))
INPUT_FILE = os.path.join(FOLDER, latest_csv)
# Insert the suffix in front of the extension only. str.replace(".csv", ...)
# would rewrite every ".csv" occurrence in the path (e.g. "a.csv_export.csv").
_input_root, _input_ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{_input_root}_with_info{_input_ext}"

print(f"📄 処理対象ファイル: {INPUT_FILE}")

# === Company info extraction function ===
# Up to 30 characters on either side of "株式会社", ending right before one of
# the listed labels or at the end of the text. Compiled once rather than on
# every <dt> iteration.
_COMPANY_NAME_RE = re.compile(
    r'([\s\S]{0,30}株式会社[\s\S]{0,30})(?=〒|TEL:|FAX:|代表者:|店舗運営責任者:|店舗セキュリティ責任者:|購入履歴|$)'
)
# Selects text nodes that mention the phone label at all.
_TEL_LABEL_RE = re.compile("TEL:")
# Digits and hyphens that follow the "TEL:" label.
_PHONE_RE = re.compile(r'TEL:\s*([\d\-]+)')


def extract_company_info(info_url):
    """Download a page and extract a company name and phone number from it.

    Args:
        info_url: URL of the page to fetch.

    Returns:
        A ``(company_name, phone_number)`` tuple of strings. An element is
        ``"Not Found"`` when it could not be extracted; both elements are
        ``"Not Found"`` when fetching or parsing raises.
    """
    print(f"🔍 アクセス中: {info_url}")
    try:
        r = requests.get(info_url, headers=HEADERS, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        company_name = "Not Found"
        phone_number = "Not Found"

        # The company name is searched in the <dt> entries of the first <dl>;
        # the first <dt> whose text matches wins.
        dl_tag = soup.find("dl")
        if dl_tag:
            for dt in dl_tag.find_all("dt"):
                dt_text = dt.get_text(" ", strip=True)
                if not dt_text:
                    continue
                match = _COMPANY_NAME_RE.search(dt_text)
                if match:
                    company_name = match.group(1).strip()
                    break

        # Try every text node that mentions "TEL:" instead of only the first
        # one, so a label-only node does not hide a later node that actually
        # carries the number.
        for tel_elem in soup.find_all(string=_TEL_LABEL_RE):
            match = _PHONE_RE.search(tel_elem)
            if match:
                phone_number = match.group(1)
                break

        return company_name, phone_number
    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the batch.
        print(f"❌ 取得失敗: {e}")
        return "Not Found", "Not Found"

# === Main processing ===
# Read every input row, add "company_name" / "telephone" to each one, then
# write the result to OUTPUT_FILE.

# newline='' is what the csv module documentation requires for file objects,
# so that newlines embedded in quoted fields are read back correctly.
with open(INPUT_FILE, 'r', newline='', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    rows = list(reader)

for row in rows:
    info_url = row.get("info_url")
    # Rows without an info_url are not fetched and get placeholders instead.
    if info_url:
        company, tel = extract_company_info(info_url)
    else:
        company, tel = "Not Found", "Not Found"
    row["company_name"] = company
    row["telephone"] = tel

with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8-sig') as f:
    fieldnames = ["shop_url", "info_url", "company_name", "telephone"]
    # extrasaction='ignore': an input CSV with additional columns would
    # otherwise raise ValueError here, after all pages were already fetched.
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows)

print(f"\n💾 出力完了: {OUTPUT_FILE}")


📄 処理対象ファイル: csv_data\2025-04-21_04-51-10_company_info.csv
🔍 アクセス中: https://www.rakuten.co.jp/firn/info.html
🔍 アクセス中: https://www.rakuten.co.jp/asian-asian/info.html
🔍 アクセス中: https://www.rakuten.co.jp/a-k-k/info.html
🔍 アクセス中: https://www.rakuten.co.jp/qzillabymrbliss/info.html
🔍 アクセス中: https://www.rakuten.co.jp/sports-tk/info.html
🔍 アクセス中: https://www.rakuten.co.jp/patrick/info.html
🔍 アクセス中: https://www.rakuten.co.jp/lingerie-lab/info.html
🔍 アクセス中: https://www.rakuten.co.jp/season-tk/info.html
🔍 アクセス中: https://www.rakuten.co.jp/auc-lucanor/info.html
🔍 アクセス中: https://www.rakuten.co.jp/shiftwebshop/info.html
🔍 アクセス中: https://www.rakuten.co.jp/e-monoutteru/info.html
🔍 アクセス中: https://www.rakuten.co.jp/ts-khouse/info.html
🔍 アクセス中: https://www.rakuten.co.jp/harvestmarket/info.html
🔍 アクセス中: https://www.rakuten.co.jp/aries2010/info.html
🔍 アクセス中: https://www.rakuten.co.jp/f231002-nagoya/info.html
🔍 アクセス中: https://www.rakuten.co.jp/fukuske/info.html
🔍 アクセス中: https://www.rakuten.co.jp/queensway/in