In [20]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import re

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=[]):
        self.visited_urls = []
        self.urls_to_visit = urls

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            if "/support/solutions/articles/" in url:
                self.urls_to_visit.append(url)
            if "/support/solutions/folders/" in url:
                self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)


In [26]:
sitemap_xml = requests.get("https://about.vidio.com/post-sitemap.xml")
sitemap = xmltodict.parse(sitemap_xml.text)
url_list = list(map(lambda x: x["loc"], sitemap["urlset"]["url"]))

In [67]:
# 477
fail = []
from tqdm._tqdm_notebook import tqdm_notebook
# url = urls[0]
for url in tqdm_notebook(url_list[476:]):
    page = requests.get(url)
    if page.status_code != 200:
        print(f"FAIL {url}")
        fail.append(url)

    filename = re.search("artikel/([0-9a-zA-Z-]+)", url).group(1)
    with open(f'./data/crawl_info/{filename}.html', 'wb+') as f:
        f.write(page.content)


  0%|          | 0/5788 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [66]:
fail

[]

In [12]:

crawler = Crawler(urls=['https://support.vidio.com/'])
crawler.run()

2024-03-07 12:00:41,613 INFO:Crawling: https://support.vidio.com/
2024-03-07 12:00:44,287 INFO:Crawling: https://support.vidio.com/support/solutions/folders/43000354902
2024-03-07 12:00:46,041 ERROR:Failed to crawl: https://support.vidio.com/support/solutions/folders/43000354902
Traceback (most recent call last):
  File "/tmp/ipykernel_256388/538955814.py", line 44, in run
    self.crawl(url)
  File "/tmp/ipykernel_256388/538955814.py", line 37, in crawl
    self.add_url_to_visit(url)
  File "/tmp/ipykernel_256388/538955814.py", line 29, in add_url_to_visit
    if "/support/solutions/articles/" in url:
TypeError: argument of type 'NoneType' is not iterable
2024-03-07 12:00:46,042 INFO:Crawling: https://support.vidio.com/support/solutions/articles/43000060207-apa-itu-vidio-premier-
2024-03-07 12:00:47,509 ERROR:Failed to crawl: https://support.vidio.com/support/solutions/articles/43000060207-apa-itu-vidio-premier-
Traceback (most recent call last):
  File "/tmp/ipykernel_256388/53895581

In [15]:
len(crawler.visited_urls)

314

In [58]:
from tqdm._tqdm_notebook import tqdm_notebook
urls = list(filter(lambda x: "solutions/articles/" in x,crawler.visited_urls))
# url = urls[0]
for url in tqdm_notebook(urls):
    page = requests.get(url)
    filename = re.search("articles/([0-9a-zA-Z-]+)", url).group(1)
    with open(f'./data/crawl/{filename}.html', 'wb+') as f:
        f.write(page.content)


  0%|          | 0/264 [00:00<?, ?it/s]

In [60]:
import csv

fields = []
rows = []

with open('./data/faq.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    fields = next(csvreader)
    for row in csvreader:
        rows.append(row)


In [68]:
faq_ids = list(map(lambda x: str(re.search("articles/([0-9]+)", x).group(1)),sum(rows, [])))

In [72]:
import os
directory = os.fsencode('./data/crawl/')
crawled_ids = []    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".html"):
        continue

    with open(f"./data/crawl/{filename}", "r") as f:
        id = str(re.search("([0-9]+)", filename).group(1))
        crawled_ids.append(id)

crawled_ids_set = set(crawled_ids)
not_exist_ids = []

for id in faq_ids:
    if id not in crawled_ids:
        print(id + " not exist")
        not_exist_ids.append(id)

43000060216 not exist
43000060213 not exist
43000676647 not exist
43000060393 not exist
43000060395 not exist
43000695111 not exist
43000701809 not exist
43000703200 not exist
43000687214 not exist
43000711808 not exist
43000703205 not exist


In [74]:
not_exist_ids

for id in tqdm_notebook(not_exist_ids):
    url = f"https://support.vidio.com/support/solutions/articles/{id}"
    page = requests.get(url)
    filename = re.search("articles/([0-9a-zA-Z-]+)", url).group(1)
    with open(f'./data/crawl/{filename}.html', 'wb+') as f:
        f.write(page.content)


  0%|          | 0/11 [00:00<?, ?it/s]