In [2]:
# import libraries 

import time
from typing import Optional, Tuple

import pandas as pd
from tqdm import tqdm #creates a smart progress bar for loops
from trafilatura import fetch_url, extract
from trafilatura.sitemaps import sitemap_search

In [3]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Look up the URLs advertised by a website's sitemap.

    Thin wrapper that hands resource_url to trafilatura's sitemap_search()
    and passes its result straight back to the caller.
    """
    return sitemap_search(resource_url)

In [4]:
def extract_article(url: str) -> Optional[str]:
    """
    Download a page and extract its text content.

    Calls fetch_url() to download the page, then extract() with
    favor_precision=True to pull the text out of the downloaded document.

    Args:
        url: Address of the page to download.

    Returns:
        The result of extract() for the downloaded page, or None when
        fetch_url() gave back nothing to extract from.
    """
    downloaded = fetch_url(url)
    if not downloaded:
        # Nothing was downloaded: skip extract() instead of handing it an
        # empty document to parse.
        return None

    return extract(downloaded, favor_precision=True)

In [5]:
def create_dataset(list_of_websites: list, delay: float = 0.5) -> Tuple[pd.DataFrame, list]:
    """
    Scrape every URL found in the sitemaps of the given websites.

    Each website is passed to get_urls_from_sitemap(); every URL it yields is
    passed to extract_article(). Two tqdm progress bars ("Websites" and
    "URLs") report progress.

    Args:
        list_of_websites: Website URLs whose sitemaps should be crawled.
        delay: Seconds to pause after each article request. Defaults to 0.5;
            pass 0 to disable the pause.

    Returns:
        A (df, data) tuple:
        - df: DataFrame with columns "url" and "article", with duplicate rows
          and rows containing missing values removed.
        - data: the unfiltered list of {"url", "article"} dicts, one per
          visited URL, including those whose article is None.
    """
    data = []

    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            data.append({
                'url': url,
                "article": extract_article(url),
            })
            if delay > 0:
                # pause between requests so the target site is not hammered
                time.sleep(delay)

    # Pin the columns so the frame has the same schema even when no URL was found.
    df = pd.DataFrame(data, columns=["url", "article"])
    df = df.drop_duplicates()
    df = df.dropna()
    # hand back both the cleaned DataFrame and the raw list of records
    return df, data

In [9]:
# Scratch cell: a single-site input list wrapped in a tqdm progress bar.
list_of_websites = ["https://hudsonhardwood.com/"]
# NOTE(review): the bar is constructed but never iterated, so it only renders
# at 0% (0/1) -- confirm whether this cell is still needed.
tqdm(list_of_websites)

  0%|          | 0/1 [00:00<?, ?it/s]

<tqdm.std.tqdm at 0x7fce75db0970>

### Broken URLs:
These sites either have an empty HTML tree somewhere, which kills my kernel and forces a restart, or they just return an empty dataframe:


- https://www.fortheculturegames.us/
- https://www.kilcommonslaw.com/
- https://tourguidegames.com/
- https://www.balancebound.co/
- https://mindfulmamamentor.com/
- https://hosthelpr.com/
- https://coreptpilates.com/
- https://pinotspalette.com/danville/

In [6]:
if __name__ == "__main__":

    # Input: the website(s) whose sitemap will be crawled.
    list_of_websites = ["https://hudsonhardwood.com/"]
    # Several websites can be processed in one run, though that may not be
    # the best option: ["url", "url"]

    # df is the de-duplicated DataFrame with missing rows dropped;
    # data is the raw list of {"url", "article"} dicts, one per visited URL.
    df, data = create_dataset(list_of_websites)

URLs: 100%|██████████| 217/217 [02:45<00:00,  1.31it/s]
Websites: 100%|██████████| 1/1 [02:47<00:00, 167.23s/it]


In [7]:
# Peek at the first five raw records (a list of dicts) returned by the scrape.
print("Example of first 5 rows of output as dict:")
print(data[:5])

Example of first 5 rows of output as dict:
[{'url': 'https://hudsonhardwood.com', 'article': "Hudson Hardwood Floors\nSpecializing in hardwood floor refinishing, restoration, and installation of wood floors and stairs in the Greater Philadelphia Area.\nFor over 20 years Hudson Hardwood Floors has used our expertise on commercial and residential hardwood floor projects. We are known for reliable service, expert craftsmanship, and dustless sanding that has earned us 5 star reviews from our clients. Whether you need hardwood floor refinishing or a hardwood floor installation, we leave your floors looking and feeling like new.\nWe can repair your old, worn out, and damaged wood flooring, restoring them to its original beauty. Or add warmth and classic good looks to your space with new hardwood flooring. Hudson Hardwood serves the Mainline, Philadelphia, Montgomery, Bucks, Chester County, & More!\nHardwood Floor Services\nWhy Choose Us?\nTestimonials\nAt Hudson Hardwood we pride ourselves o

In [26]:
# Show the first five rows of the cleaned DataFrame (head() defaults to 5 rows).
print("Example of first 5 rows of output as df:")
df.head()

Example of first 5 rows of output as df:


Unnamed: 0,url,article
0,https://drinkyoro.com,We're interrupting your routine of dragging th...
1,https://drinkyoro.com/blogs/news,Skip to content\nWhat is Yoro?\nThe Flavors\nT...
2,https://drinkyoro.com/collections/all-products,Filter:\n4 products\nThe highest price is $60.00
3,https://drinkyoro.com/collections/buy-yoro,Filter:\n4 products\nThe highest price is $60.00
4,https://drinkyoro.com/collections/gear,Filter:\n2 products\nThe highest price is $51.00


In [8]:
# Persist the scraped articles. index=False keeps the row index out of the
# file so it does not reappear as an "Unnamed: 0" column when the CSV is read back.
df.to_csv('hudsonhardwood.csv', index=False)