# In [None]:
import requests
from bs4 import BeautifulSoup
import re

print("=== Step1: スクレイピング ===")

# Page to scrape.
url = "https://www.aozora.gr.jp/cards/000148/files/2371_13943.html"

# Bind the result up front so the later `if main_text:` checks see None,
# instead of raising NameError, when the request below fails.
main_text = None

try:
    # The timeout keeps an unresponsive server from hanging the script, and
    # raise_for_status() sends HTTP error responses (4xx/5xx) to the handler
    # instead of letting an error page be parsed as the document.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"スクレイピングエラー: {e}")
else:
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, 'html.parser')

    # First <div class="main_text"> element, or None when the page has none.
    main_text = soup.find('div', class_='main_text')

    if main_text:
        print("✓ スクレイピング成功")
        print(f"取得したHTML（最初の200文字）: {str(main_text)[:200]}...")
    else:
        print("✗ 本文部分の取得に失敗")

print("\n=== Step2: HTMLタグや不要な文字列の削除 ===")

# Default to an empty sentence so the `if first_sentence:` check in Step3
# works even when there is no body text or the pattern does not match.
first_sentence = ""

if main_text:
    # Text content of the element with the markup stripped.
    raw_text = main_text.get_text()

    # Collapse every whitespace run (newlines included) into one space.
    # A separate newline-only pass is unnecessary because \s already
    # matches newlines.
    cleaned_text = re.sub(r'\s+', ' ', raw_text).strip()

    print("✓ HTMLタグと不要な文字列を除去完了")
    print(f"前処理後のテキスト（最初の300文字）:\n{cleaned_text[:300]}...")

    # Non-greedy match: stop at the first occurrence of the closing phrase.
    first_sentence_match = re.search(r'近頃は.*?思っています。', cleaned_text)

    if first_sentence_match:
        first_sentence = first_sentence_match.group()
        print(f"\n最初の1文:\n{first_sentence}")
    else:
        print("\n最初の1文の抽出に失敗")

print("\n=== Step3: ストップワードの除去 ===")

if first_sentence:
    print(f"対象文: {first_sentence}")

    # Hard-coded (word, part-of-speech) pairs; this list is written out by
    # hand and is not computed from `first_sentence`.
    words_with_pos = [
        ("近頃", "名詞"), ("は", "助詞"), ("文壇", "名詞"), ("の", "助詞"),
        ("趨勢", "名詞"), ("が", "助詞"), ("大変", "副詞"), ("面白く", "形容詞"),
        ("なっ", "動詞"), ("て", "助詞"), ("き", "動詞"), ("まし", "助動詞"),
        ("た", "助動詞"), ("ので", "助詞"), ("私", "代名詞"), ("も", "助詞"),
        ("一つ", "名詞"), ("所見", "名詞"), ("を", "助詞"), ("述べ", "動詞"),
        ("て", "助詞"), ("見", "動詞"), ("たい", "助動詞"), ("と", "助詞"),
        ("思っ", "動詞"), ("て", "助詞"), ("い", "動詞"), ("ます", "助動詞")
    ]

    # A token is dropped when its part of speech or its surface form is listed.
    stop_pos = ["助詞", "助動詞"]
    stop_words = ["は", "の", "が", "を", "に", "で", "と", "も", "て"]

    print("\n品詞ごとの分割結果:")
    print(*(f"{surface} ({tag})" for surface, tag in words_with_pos), sep="\n")

    # Keep only the tokens that hit neither stop list.
    filtered_words = [
        (surface, tag)
        for surface, tag in words_with_pos
        if not (tag in stop_pos or surface in stop_words)
    ]

    print("\nストップワード除去後:")
    for word, pos in filtered_words:
        print(f"{word} ({pos})")

    # Space-separated surface forms of the surviving tokens.
    filtered_text = " ".join(token[0] for token in filtered_words)
    print(f"\nストップワード除去後のテキスト:\n{filtered_text}")

print("\n=== 課題完了 ===")

# Report what actually happened instead of printing success unconditionally.
# Steps 1 and 2 succeed only when the body element was found; Step3 runs only
# when a first sentence was extracted.
scraped = bool(main_text)
step_reports = (
    (scraped,
     "✓ スクレイピングで指定した文書が取得できた",
     "✗ スクレイピングで指定した文書が取得できなかった"),
    (scraped,
     "✓ テキストの前処理（HTMLタグなどの削除）ができた",
     "✗ テキストの前処理（HTMLタグなどの削除）ができなかった"),
    (bool(first_sentence),
     "✓ テキストのストップワードの除去ができた",
     "✗ テキストのストップワードの除去ができなかった"),
)
for succeeded, ok_message, ng_message in step_reports:
    print(ok_message if succeeded else ng_message)