In [None]:
# ライブラリーのインストール
pip install tqdm PyMuPDF pdf2image requests beautifulsoup4


In [None]:
# ライブラリーのインポート
import os
import requests
import urllib.parse
from bs4 import BeautifulSoup
import fitz

In [None]:

# pdf links の取得
def download_pdf_links(save_path: str, folder_name: str) -> None:
    """Download the PDFs linked from every page listed in ``links.txt``.

    ``links.txt`` is read from the current working directory as UTF-8, one
    page URL per line.  Each distinct, non-empty URL is passed to
    ``download_pdf_from_url`` together with the two destination arguments.

    Args:
        save_path: Destination base path, forwarded unchanged.
        folder_name: Destination folder name, forwarded unchanged.

    Raises:
        FileNotFoundError: If ``links.txt`` does not exist.
    """
    with open('links.txt', 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines (for example a trailing empty line) must be skipped:
        # an empty string is not a valid URL and requests.get("") raises.
        # dict.fromkeys drops duplicates like a set would, but keeps the
        # order of the file so that runs are reproducible.
        urls = list(dict.fromkeys(url for url in stripped if url))

    for url in urls:
        download_pdf_from_url(url, save_path, folder_name)

# pdf のダウンロード
def download_pdf_from_url(url: str, save_path: str, folder_name: str) -> None:
    """Fetch the HTML page at ``url`` and download every PDF it links to.

    Every ``<a href=...>`` on the page is resolved against ``url``; links
    whose URL path ends in ``.pdf`` (case-insensitively, ignoring any query
    string or fragment) are passed to ``download_pdf``.  A page that cannot
    be fetched is reported on stdout and skipped, so that one bad URL does
    not abort a whole batch.

    Args:
        url: Address of the HTML page to scan.
        save_path: Destination base path, forwarded to ``download_pdf``.
        folder_name: Destination folder name, forwarded to ``download_pdf``.
    """
    try:
        # requests has no default timeout; without one a stalled server
        # would block the run forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch {url} (HTTP {response.status_code}).")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    seen = set()
    for a_tag in soup.find_all('a', href=True):
        # urljoin resolves relative hrefs against the page URL and returns
        # absolute hrefs as they are, so no prefix check is needed (a
        # "startswith('http')" test misclassifies relative paths such as
        # "http-docs/a.pdf").
        href = urllib.parse.urljoin(url, a_tag['href'])

        # Look at the path component only, so "a.PDF" and "a.pdf?v=2" match.
        path = urllib.parse.urlsplit(href).path
        if not path.lower().endswith('.pdf'):
            continue

        # The same document is often linked several times on one page.
        if href in seen:
            continue
        seen.add(href)

        download_pdf(href, save_path, folder_name)

def download_pdf(pdf_link: str, save_path: str, folder_name: str) -> None:
    """Download one PDF and store it under ``save_path/folder_name``.

    The file name is the last component of the URL path.  The destination
    folder is created when missing.  Failures (network error, non-200
    status, URL without a file name) are reported on stdout instead of
    raising, so a batch of downloads can continue.

    Args:
        pdf_link: Absolute URL of the PDF.
        save_path: Base directory; may be empty to use the working directory.
        folder_name: Sub-folder of ``save_path`` that receives the file.
    """
    try:
        # requests has no default timeout; without one a stalled server
        # would block the run forever.
        response = requests.get(pdf_link, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to download {pdf_link}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to download {pdf_link}.")
        return

    filename = os.path.basename(urllib.parse.urlsplit(pdf_link).path)
    if not filename:
        # A path ending in "/" has no base name; joining it would make
        # open() target the folder itself.
        print(f"Failed to download {pdf_link}.")
        return

    folder_path = os.path.join(save_path, folder_name)
    if folder_path:
        # exist_ok avoids the check-then-create race of
        # "if not exists: makedirs".
        os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, filename)

    with open(file_path, 'wb') as f:
        f.write(response.content)

    print(f"Downloaded {filename} successfully.")

# Entry point of the download step.  With an empty save_path the destination
# resolves to the relative folder "pdf_files" in the working directory.
if __name__ == '__main__':
    download_pdf_links(save_path="", folder_name="pdf_files")


In [None]:

# pdf からテキストを抽出

# Folder that main() scans for PDF files.
pdf_directory: str = "pdf_files"
# Text file that main() writes the extracted text to.
output_file: str = "data.txt"

def extract_text_from_pdf(pdf_file: str) -> str:
    """Return the text of all pages of ``pdf_file``, in page order.

    The document is opened with PyMuPDF (``fitz``) and closed again when the
    function returns.  The page texts are concatenated without a separator
    and surrounding whitespace of the combined result is stripped.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The extracted text; an empty string if no page yields any text.
    """
    with fitz.open(pdf_file) as doc:
        # Iterate the pages directly instead of indexing via range(len(doc)),
        # and join once instead of growing a string with "+=" per page.
        return ''.join(page.get_text() for page in doc).strip()

def main() -> None:
    """Extract the text of every PDF in ``pdf_directory`` into ``output_file``.

    Both names are module-level settings.  The output file is overwritten.
    For each PDF, in file-name order, a ``PDF: <name>`` header line is
    written, followed by the extracted text and a blank line.  Progress is
    printed to stdout.

    Raises:
        FileNotFoundError: If ``pdf_directory`` does not exist.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the layout
    # of the output file reproducible.  The suffix test is case-insensitive
    # so that "REPORT.PDF" is not skipped.
    pdf_names = sorted(
        name for name in os.listdir(pdf_directory)
        if name.lower().endswith('.pdf')
    )

    # Opening once in 'w' mode truncates the file and keeps a single handle,
    # replacing the separate truncate step plus one re-open per PDF.
    with open(output_file, 'w', encoding='utf-8') as f:
        for filename in pdf_names:
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"Processing: {pdf_path}")

            pdf_text = extract_text_from_pdf(pdf_path)

            f.write(f"PDF: {filename}\n")
            f.write(pdf_text + '\n\n')

    print(f"All PDFs processed. Text saved to {output_file}")

# Entry point of the extraction step: writes the text of all PDFs found in
# pdf_directory to output_file.
if __name__ == "__main__":
    main()


In [None]:
# data のクリーニング

import re

# Source and destination of the cleaning step.
# NOTE: this rebinds the module-level name output_file, which main() also
# reads; a main() call made after this point writes to cleaned_data.txt.
input_file: str = "data.txt"
output_file: str = "cleaned_data.txt"

def clean_data(input_file: str, output_file: str) -> None:
    """Reduce a UTF-8 text file to Japanese characters and single spaces.

    Everything except hiragana, katakana, CJK ideographs and whitespace is
    removed.  Each run of whitespace is then collapsed to one space and the
    result is stripped and written to ``output_file`` as UTF-8.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write; overwritten if it exists.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = f.read()

    # Kept ranges: hiragana U+3041-3096, katakana U+30A1-30FA, CJK ideographs
    # U+4E00-9FFF.  U+30FC (prolonged sound mark "ー") and U+3005 (iteration
    # mark "々") lie outside those ranges but are part of ordinary words;
    # without them "データ" becomes "デタ" and "人々" becomes "人".
    cleaned_data = re.sub(
        r'[^\u3041-\u3096\u30A1-\u30FA\u30FC\u3005\u4E00-\u9FFF\s]',
        '',
        data,
    )

    # Removed characters leave gaps and line breaks behind; collapse them.
    cleaned_data = re.sub(r'\s+', ' ', cleaned_data)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_data.strip())

    print(f"Cleaned data saved to {output_file}")

# Entry point of the cleaning step: filters input_file into output_file.
if __name__ == "__main__":
    clean_data(input_file, output_file)
