In [15]:
import requests
from bs4 import BeautifulSoup
import time

In [None]:
from urllib.parse import urlparse, urljoin
# --- urlparse ---
# URLを構成要素に分解する
#例 https://user@www.example.co.jp:8080/forum/thread.php;sort=new?id=123&page=2#post-5
#スキームscheme: 'https'
#(補足: http に暗号化(SSL/TLS)を加えた安全なプロトコル)#
#ネットワークロケーションnetloc: 'user@www.example.co.jp:8080'
#(補足: ユーザー名 'user'、ドメイン 'www.example.co.jp'、ポート番号 '8080' を含みます)
#パス/path: '/forum/thread.php'
#(補足: サーバー上の /forum ディレクトリにある thread.php というリソースを指定)
#パラメータparams: 'sort=new'
#(補足: パスの最後の要素 thread.php に付随するパラメータ。この例ではソート順を指定。現代ではあまり使われません)
#クエリquery: 'id=123&page=2'
#(補足: サーバーに渡すデータ。id が 123、page が 2 であることを示します)
#フラグメントfragment: 'post-5'
#(補足: ページ内の post-5 というIDを持つ箇所（例えば特定の投稿）へのアンカーリンク)
# --- urljoin ---
# 相対URLを絶対URLに変換する
# 相対URLの例:
# '/about.html' (ルート相対URL)
# 'contact.html' (ドキュメント相対URL)
# 'images/logo.png' (現在のディレクトリ配下のサブディレクトリを指すドキュメント相対URL)
# './styles/main.css' (現在のディレクトリを明示したドキュメント相対URL。親ディレクトリを指す場合は '../' を使う)

# URLクラス
class URL:
    """Split a base URL into components and resolve a link against it.

    The class exposes two parallel families of accessors:

    * Capitalised getters (``get_Protocol``, ``get_Path``, ...) describe
      ``base_url`` itself.
    * Lower-case getters (``get_protocol``, ``get_path``, ...) describe the
      absolute URL obtained by joining ``path`` onto ``base_url``.

    The ``is_valid_*`` methods are crawl filters built on those accessors.

    Args:
        base_url: URL of the page the link was found on (or the URL to
            inspect when ``path`` is empty).
        path: Link target, absolute or relative, resolved against
            ``base_url``.
        domain: Host that links must belong to; the host itself and any of
            its sub-domains are accepted. An empty string accepts any host.
        exclude_extension_list: File suffixes that must not be crawled.
            ``None`` selects the built-in defaults.
        exclude_params_list: Parameter keys that disqualify a URL.
            ``None`` selects the built-in defaults.
    """

    # Immutable defaults: a list literal in the signature would be created
    # once and shared (and mutable) across every instance.
    _DEFAULT_EXCLUDE_EXTENSIONS = (
        '.jpg', '.jpeg', '.png', '.gif', '.css', '.js', '.ico', '.svg',
        '.woff', '.woff2', '.ttf', '.eot', '.otf',
        '.mp4', '.mp3', '.avi', '.mov', '.wmv', '.flv', '.mkv',
        '.zip', '.rar', '.7z', '.tar.gz',
        '.pdf', '.xlsx', '.xls', '.docx', '.doc',
    )
    _DEFAULT_EXCLUDE_PARAMS = (
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'gclid', 'fbclid', 'msclkid', 'yclid',
        'sessionid', 'sid', 'sessid',
        'sort', 'order', 'sort_by', 'order_by',
        'q', 's', 'search', 'query', 'keyword',
        'filter', 'color', 'size', 'price_min', 'price_max',
    )

    def __init__(self, base_url, path='', domain="musashino-u.ac.jp",
                 exclude_extension_list=None, exclude_params_list=None):
        self.Base_url = base_url
        self.path = path
        self.domain = domain
        # Each instance gets its own list so later mutation cannot leak
        # into other instances.
        self.exclude_extension_list = (
            list(self._DEFAULT_EXCLUDE_EXTENSIONS)
            if exclude_extension_list is None else exclude_extension_list
        )
        self.exclude_params_list = (
            list(self._DEFAULT_EXCLUDE_PARAMS)
            if exclude_params_list is None else exclude_params_list
        )

        # Components of the base URL, cached as attributes.
        self.Protocol = self.get_Protocol()
        self.FQDN = self.get_FQDN()
        self.Path = self.get_Path()
        self.Domain = self.get_Domain()
        self.Params = self.get_Params()
        self.Query = self.get_Query()
        self.Fragment = self.get_Fragment()
        self.Extension = self.get_Extension()

        self.absolute_url = self.get_absolute_url()

    # === private helpers shared by the base/absolute variants ===
    @staticmethod
    def _extension_of(path):
        """Return the lower-cased extension of the last path segment, or ''."""
        last_segment = path.rsplit('/', 1)[-1]
        _, dot, extension = last_segment.rpartition('.')
        return extension.lower() if dot else ''

    def _domain_allowed(self, hostname):
        """Return True when ``hostname`` is ``self.domain`` or a sub-domain of it."""
        if not hostname:
            return False
        if not self.domain:
            # An empty filter accepts every host (same as the substring test
            # this replaces).
            return True
        allowed = self.domain.lower().lstrip('.')
        host = hostname.lower()
        # Compare on label boundaries: a plain substring test would also
        # accept "evil" + domain or domain + ".evil.example".
        return host == allowed or host.endswith('.' + allowed)

    def _params_allowed(self, params):
        """Return False when any key in ``params`` is in the exclusion list."""
        # Path parameters are ';'-separated; '&' is accepted as well so
        # strings in query-string form are handled too.
        keys = (item.split('=')[0] for item in params.replace(';', '&').split('&'))
        return not any(key in self.exclude_params_list for key in keys)

    def _extension_allowed(self, path):
        """Return False when ``path`` ends with an excluded file suffix."""
        # Normalise entries to ".ext" so "xls" and ".xls" behave the same,
        # and match by suffix so multi-dot entries such as ".tar.gz" work.
        suffixes = tuple(
            '.' + ext.lower().lstrip('.')
            for ext in self.exclude_extension_list
            if ext.strip('.')
        )
        return not path.lower().endswith(suffixes)

    # === base_url ===
    def get_Protocol(self):
        """Return the scheme of the base URL."""
        return urlparse(self.Base_url).scheme

    def get_FQDN(self):
        """Return the netloc (userinfo, host and port) of the base URL."""
        return urlparse(self.Base_url).netloc

    def get_Path(self):
        """Return the path of the base URL."""
        return urlparse(self.Base_url).path

    def get_Domain(self):
        """Return the host name of the base URL, or None if absent."""
        return urlparse(self.Base_url).hostname

    def get_Params(self):
        """Return the ';' parameters of the base URL's last path segment."""
        return urlparse(self.Base_url).params

    def get_Query(self):
        """Return the query string of the base URL."""
        return urlparse(self.Base_url).query

    def get_Fragment(self):
        """Return the fragment of the base URL."""
        return urlparse(self.Base_url).fragment

    def get_Extension(self):
        """Return the base URL's file extension without the dot ('' if none)."""
        return self._extension_of(self.Path)

    # === absolute_url ===
    def get_protocol(self):
        """Return the scheme of the absolute URL."""
        return urlparse(self.absolute_url).scheme

    def get_fQDN(self):
        """Return the netloc of the absolute URL."""
        return urlparse(self.absolute_url).netloc

    def get_path(self):
        """Return the path of the absolute URL."""
        return urlparse(self.absolute_url).path

    def get_domain(self):
        """Return the host name of the absolute URL, or None if absent."""
        return urlparse(self.absolute_url).hostname

    def get_params(self):
        """Return the ';' parameters of the absolute URL's last path segment."""
        return urlparse(self.absolute_url).params

    def get_query(self):
        """Return the query string of the absolute URL."""
        return urlparse(self.absolute_url).query

    def get_fragment(self):
        """Return the fragment of the absolute URL."""
        return urlparse(self.absolute_url).fragment

    def get_extension(self):
        """Return the absolute URL's file extension without the dot ('' if none)."""
        return self._extension_of(self.get_path())

    # === is_valid_base_url ===
    def is_valid_params(self):
        """Return True when the base URL carries no excluded parameter key."""
        return self._params_allowed(self.Params)

    def is_valid_domain(self):
        """Return True when the base URL's host belongs to ``self.domain``."""
        return self._domain_allowed(self.Domain)

    def is_valid_extension(self):
        """Return True when the base URL does not end in an excluded suffix."""
        return self._extension_allowed(self.Path)

    # === is_valid_absolute_url ===
    def is_valid_absolute_domain(self):
        """Return True when the absolute URL's host belongs to ``self.domain``."""
        return self._domain_allowed(self.get_domain())

    def is_valid_absolute_params(self):
        """Return True when the absolute URL carries no excluded parameter key."""
        return self._params_allowed(self.get_params())

    def is_valid_absolute_query(self):
        """Return True only when the absolute URL has no query string."""
        return not self.get_query()

    def is_valid_absolute_fragment(self):
        """Return True only when the absolute URL has no fragment."""
        return not self.get_fragment()

    def is_valid_absolute_extension(self):
        """Return True when the absolute URL does not end in an excluded suffix."""
        return self._extension_allowed(self.get_path())

    # === path ===
    def get_absolute_url(self):
        """Resolve ``self.path`` against ``self.Base_url`` and force https.

        Returns:
            The joined URL with a leading ``http://`` upgraded to
            ``https://``. If the join fails, a message is printed and
            ``self.Base_url`` is returned unchanged.
        """
        try:
            absolute_url = urljoin(self.Base_url, self.path)
        except (ValueError, TypeError):
            print("規格外のURLです")
            return self.Base_url
        # NOTE(review): the original replace() call was garbled in this copy
        # of the file; an http -> https upgrade is inferred from its surviving
        # 'https://' argument - confirm. Only the scheme prefix is rewritten so
        # an "http://" inside the path or query is left alone.
        if absolute_url.startswith('http://'):
            absolute_url = 'https://' + absolute_url[len('http://'):]
        return absolute_url

In [None]:
# Demo 1: a URL that uses every component, inspected through the
# capitalised (base-URL) getters and validators.
url_obj = URL("https://user@www.example.co.jp:8080/forum/thread.php;sort=new?id=123&page=2#post-5")
print("https://user@www.example.co.jp:8080/forum/thread.php;sort=new?id=123&page=2#post-5")
print(
    url_obj.get_Protocol(),
    url_obj.get_FQDN(),
    url_obj.get_Path(),
    url_obj.get_Domain(),
    url_obj.get_Params(),
    url_obj.get_Query(),
    url_obj.get_Fragment(),
    url_obj.get_Extension(),
    sep="\n",
)
print(
    url_obj.is_valid_domain(),
    url_obj.is_valid_params(),
    url_obj.is_valid_extension(),
    sep="\n",
)

print()

# Demo 2: a root-relative link resolved against the site root, inspected
# through the lower-case (absolute-URL) getters and validators.
url_obj2 = URL("https://www.musashino-u.ac.jp/", "/about.html")
print("https://www.musashino-u.ac.jp/ , /about.html")
print(
    url_obj2.get_protocol(),
    url_obj2.get_fQDN(),
    url_obj2.get_path(),
    url_obj2.get_domain(),
    url_obj2.get_params(),
    url_obj2.get_query(),
    url_obj2.get_fragment(),
    url_obj2.get_extension(),
    sep="\n",
)
print(
    url_obj2.is_valid_absolute_domain(),
    url_obj2.is_valid_absolute_params(),
    url_obj2.is_valid_absolute_query(),
    url_obj2.is_valid_absolute_fragment(),
    url_obj2.is_valid_absolute_extension(),
    sep="\n",
)

print(url_obj2.get_absolute_url())

https://user@www.example.co.jp:8080/forum/thread.php;sort=new?id=123&page=2#post-5
https
user@www.example.co.jp:8080
/forum/thread.php
www.example.co.jp
sort=new
id=123&page=2
post-5
php
False
False
True

https://www.musashino-u.ac.jp/ , /about.html
https
www.musashino-u.ac.jp
/about.html
www.musashino-u.ac.jp



html
True
True
True
True
True
https://www.musashino-u.ac.jp/about.html


In [18]:
# PAGEクラス
class PAGE:
    """Fetch a single URL and expose its status, title and outgoing links.

    The request is issued in the constructor. Any exception raised while
    fetching or parsing is caught and recorded in ``pagestate`` instead of
    propagating, so constructing a ``PAGE`` does not raise for network
    errors.

    Args:
        url: Address to fetch.
        target_filetype: Substring that must appear in the response's
            ``Content-Type`` header for the page to count as valid.
        timeout: Timeout in seconds passed to ``requests.get``.
        wait: Seconds to sleep before the request (crawl politeness delay).

    Attributes:
        status_code: HTTP status code, or None if the request failed.
        headers: The ``Content-Type`` header value (or a placeholder string
            when the header is missing); None if the request failed.
        pagestate: "[正常]" for a usable page, otherwise a message
            describing why the page is not usable.
        html, soup, title: Body text, parsed tree and title; None unless
            the page is valid.
    """

    def __init__(self, url, target_filetype='text/html', timeout=10, wait=0.03):
        self.url = url
        self.target_filetype = target_filetype

        # Give every attribute a value up front so the getters keep working
        # when the request below fails part-way through.
        self.status_code = None
        self.headers = None
        self.html = None
        self.soup = None
        self.title = None

        try:
            time.sleep(wait)
            res = requests.get(self.url, timeout=timeout)

            self.status_code = res.status_code
            self.headers = res.headers.get('Content-Type', '拡張子が見つかりませんでした。')
            self.pagestate = self.detarmine_pagestate()

            if self.is_valid_pagestate():
                # Charset detection scans the whole body, so it is only done
                # for pages that are actually going to be parsed.
                res.encoding = res.apparent_encoding
                self.html = res.text
                self.soup = BeautifulSoup(self.html, 'html.parser')
                title_tag = self.soup.find('title')
                # .string is None for an empty <title> or one containing
                # nested tags; treat that like a missing title.
                if title_tag is not None and title_tag.string:
                    self.title = title_tag.string
                else:
                    self.title = 'タイトルなし'
        except Exception as e:
            # Deliberately broad: a crawl should survive any single bad page.
            self.pagestate = f"リクエスト失敗: {e}"

    # === page ===

    def get_status_code(self):
        """Return the HTTP status code, or None if the request failed."""
        return self.status_code

    def detarmine_pagestate(self):
        """Classify the response, store the result in ``pagestate`` and return it.

        A non-200 status yields an error message, a ``Content-Type`` that
        does not contain ``target_filetype`` yields a "non-HTML" message,
        and anything else yields "[正常]".
        """
        if self.status_code != 200:
            self.pagestate = f"[{self.status_code}エラー] :"
        elif self.target_filetype not in self.headers:
            self.pagestate = f"[非HTML] : {self.headers}"
        else:
            self.pagestate = "[正常]"
        return self.pagestate

    def get_pagestate(self):
        """Return the current page-state message."""
        return self.pagestate

    def get_title(self):
        """Return the page title, or an explanatory message for invalid pages."""
        if self.is_valid_pagestate():
            return self.title
        return f"{self.pagestate}のためタイトルなし"

    def get_links(self):
        """Return the raw ``href`` values of all ``<a>`` tags on the page.

        For an invalid page a message string is returned instead of a list,
        so callers should check ``is_valid_pagestate()`` before iterating.
        """
        if self.is_valid_pagestate():
            return [a_tag['href'] for a_tag in self.soup.find_all('a', href=True)]
        return f"{self.pagestate}のためリンクなし"

    # === is_valid_page ===
    def is_valid_pagestate(self):
        """Return True when the page was fetched and classified as valid."""
        return self.pagestate == "[正常]"
 


In [19]:
# Demo: fetch the university top page and show what PAGE extracted from it.
page_obj = PAGE("https://www.musashino-u.ac.jp/")
print(page_obj.get_status_code())
print(page_obj.get_pagestate())
print(page_obj.get_title())
print(page_obj.get_links())
# Call the method: without the parentheses this printed the bound-method
# object instead of the True/False result.
print(page_obj.is_valid_pagestate())

200
[正常]
武蔵野大学
['#main', '/', 'https://ef.musashino-u.ac.jp/donation/', '/access.html', '/admission/request.html', '/contact.html', '/', 'https://musashino-u.j-server.com/LUCAIMSSNU/ns/w4/jaen/', 'https://musashino-u.j-server.com/LUCAIMSSNU/ns/w4/jako/', 'https://musashino-u.j-server.com/LUCAIMSSNU/ns/w4/jazh/', '/prospective-students.html', '/students.html', '/alumni.html', '/parents.html', '/business.html', '/guide/', '/guide/', '/guide/profile/', '/guide/activities/', '/guide/campus/', '/guide/facility/', '/guide/information/', '/guide/profile/media/', '/admission/', '/admission/', '/admission/faculty/', '/admission/international_students/', '/admission/graduate_school/', '/admission/advanced_course/', '/admission/short_term_course/', '/admission/event/', '/admission/faq/', '/admission/download/', '/admission/request.html', '/basic/', '/basic/', '/basic/purpose.html', '/basic/policies/', '/basic/learning_cycle.html', '/basic/initial/', '/basic/endowment_course.html', '/basic/consort

In [20]:
#　メインコード
def scraping(start_url):
    """Crawl breadth-first from ``start_url`` and collect page titles.

    Each URL is fetched with ``PAGE``. Links on valid pages are resolved
    with ``URL`` and queued when they pass every filter (allowed domain, no
    excluded params, no query, no fragment, no excluded extension) and have
    not been queued before. One progress line is printed per visited URL.

    Args:
        start_url: URL the crawl begins at.

    Returns:
        dict mapping each successfully fetched URL to its title.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import deque

    to_scrap = deque([start_url])   # FIFO queue: popleft() is O(1), list.pop(0) is O(n)
    seen = {start_url}              # every URL ever queued, for O(1) duplicate checks
    scraped = []
    page_dict = {}
    while to_scrap:

        # Fetch the next page.
        current_url = to_scrap.popleft()
        page_obj = PAGE(current_url)
        scraped.append(current_url)

        # Skip pages that failed to load or are not the target type.
        if not page_obj.is_valid_pagestate():
            print(f" {page_obj.get_pagestate()}(正常{len(page_dict)}/完了{len(scraped)}/未完了{len(to_scrap)}): {current_url}")
            continue

        # Valid page: record its title and queue its links.
        title = page_obj.get_title()
        urls = page_obj.get_links()

        for url in urls:
            url_obj = URL(current_url, url)
            abs_url = url_obj.get_absolute_url()
            # The params check must look at the resolved link;
            # is_valid_params() would inspect current_url instead.
            if (abs_url not in seen and
                    url_obj.is_valid_absolute_params() and
                    url_obj.is_valid_absolute_domain() and
                    url_obj.is_valid_absolute_query() and
                    url_obj.is_valid_absolute_fragment() and
                    url_obj.is_valid_absolute_extension()):
                seen.add(abs_url)
                to_scrap.append(abs_url)
        page_dict[current_url] = title

        print(f" {page_obj.get_pagestate()}(正常{len(page_dict)}/完了{len(scraped)}/未完了{len(to_scrap)}): {current_url}")

    return page_dict

In [None]:
# Entry point of the notebook: crawl the whole site from its top page.
START_URL = "https://www.musashino-u.ac.jp/"

print(f"開始します: {START_URL}")

page_data = scraping(START_URL)

print(f"終了しました:{len(page_data)} ページを取得")

# Dump the result as one (url, title) tuple per line.
print("--- 辞書型変数の内容 ---")
for url, title in page_data.items():
    print((url, title))


開始します: https://www.musashino-u.ac.jp/
 [正常](正常1/完了1/未完了102): https://www.musashino-u.ac.jp/
 [正常](正常2/完了2/未完了122): https://ef.musashino-u.ac.jp/donation/
 [正常](正常3/完了3/未完了129): https://www.musashino-u.ac.jp/access.html
 [正常](正常4/完了4/未完了163): https://www.musashino-u.ac.jp/admission/request.html
 [正常](正常5/完了5/未完了164): https://www.musashino-u.ac.jp/contact.html
 [正常](正常6/完了6/未完了174): https://www.musashino-u.ac.jp/prospective-students.html
 [正常](正常7/完了7/未完了181): https://www.musashino-u.ac.jp/students.html
 [正常](正常8/完了8/未完了182): https://www.musashino-u.ac.jp/alumni.html
 [正常](正常9/完了9/未完了183): https://www.musashino-u.ac.jp/parents.html
 [正常](正常10/完了10/未完了184): https://www.musashino-u.ac.jp/business.html
 [正常](正常11/完了11/未完了204): https://www.musashino-u.ac.jp/guide/
 [正常](正常12/完了12/未完了228): https://www.musashino-u.ac.jp/guide/profile/
 [正常](正常13/完了13/未完了229): https://www.musashino-u.ac.jp/guide/activities/
 [正常](正常14/完了14/未完了229): https://www.musashino-u.ac.jp/guide/campus/
 [正常](正常15/完了15/未完了