In [6]:
# import libraries 

import time
from typing import Optional, Tuple

import pandas as pd
from tqdm import tqdm #creates a smart progress bar for loops
from trafilatura import fetch_url, extract
from trafilatura.sitemaps import sitemap_search

In [18]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Collect the URLs that trafilatura finds for a website via its sitemap.

    Thin wrapper: forwards resource_url to sitemap_search() and hands the
    result back unchanged.
    """
    return sitemap_search(resource_url)

In [17]:
def extract_article(url: str) -> Optional[str]:
    """
    Download a page and extract its text content.

    Calls trafilatura's fetch_url() to download the page, then extract()
    (with favor_precision=True) to pull the text out of the download.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str or None
        Whatever extract() returns for the downloaded page, or None when
        fetch_url() produced nothing to extract from.
    """
    downloaded = fetch_url(url)
    # Nothing came back from the download: skip extract() rather than
    # handing it an empty document.
    if not downloaded:
        return None

    return extract(downloaded, favor_precision=True)

In [19]:
def create_dataset(list_of_websites: list, delay: float = 0.5) -> Tuple[pd.DataFrame, list]:
    """
    Scrape every URL found for each website and collect the extracted text.

    Each website is passed to get_urls_from_sitemap(); every URL it returns
    is passed to extract_article(). tqdm progress bars track both loops.

    Parameters
    ----------
    list_of_websites : list
        Website URLs to look up and scrape.
    delay : float, default 0.5
        Seconds to pause after each page request. Values <= 0 disable the pause.

    Returns
    -------
    df : pd.DataFrame
        Columns "url" and "article", with duplicate rows and rows containing
        missing values removed. The columns are present even when nothing
        was scraped.
    data : list
        One {"url": ..., "article": ...} dict per URL visited, unfiltered;
        "article" is None for pages that yielded no text or raised an error.
    """
    data = []

    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            try:
                article = extract_article(url)
            except Exception as exc:
                # One unreadable page should not throw away everything scraped
                # so far; report it and record the URL without text.
                tqdm.write(f"Skipping {url}: {exc!r}")
                article = None
            data.append({"url": url, "article": article})
            if delay > 0:
                # Pause between requests to avoid hammering the site.
                time.sleep(delay)

    # Explicit columns keep the schema stable when no URL was scraped.
    df = pd.DataFrame(data, columns=["url", "article"])
    df = df.drop_duplicates().dropna()
    # Return both the cleaned DataFrame and the raw list of dicts.
    return df, data

### Broken URLs:
These sites either have an empty HTML tree somewhere, which kills my kernel and forces a restart, or just return an empty dataframe.


- https://www.fortheculturegames.us/
- https://www.kilcommonslaw.com/
- https://tourguidegames.com/
- https://www.balancebound.co/
- https://mindfulmamamentor.com/
- https://hosthelpr.com/
- https://coreptpilates.com/
- https://pinotspalette.com/danville/

In [16]:
if __name__ == "__main__":
    # Input URLs to scrape. Several sites can be listed at once, e.g.
    # ["url", "url"], though that may not be the best option.
    list_of_websites = [
        "https://drinkyoro.com/",
    ]

    # Keep both forms of the scraped content: the pandas DataFrame (df)
    # and the raw list of dicts (data).
    df, data = create_dataset(list_of_websites)

Websites:   0%|                                           | 0/1 [00:00<?, ?it/s]
URLs:   0%|                                              | 0/32 [00:00<?, ?it/s][A
URLs:   3%|█▏                                    | 1/32 [00:00<00:21,  1.44it/s][A
URLs:   6%|██▍                                   | 2/32 [00:01<00:20,  1.49it/s][A
URLs:   9%|███▌                                  | 3/32 [00:02<00:20,  1.44it/s][A
URLs:  12%|████▊                                 | 4/32 [00:02<00:19,  1.43it/s][A
URLs:  16%|█████▉                                | 5/32 [00:03<00:19,  1.40it/s][A
URLs:  19%|███████▏                              | 6/32 [00:04<00:18,  1.40it/s][A
URLs:  22%|████████▎                             | 7/32 [00:05<00:19,  1.28it/s][A
URLs:  25%|█████████▌                            | 8/32 [00:05<00:18,  1.33it/s][A
URLs:  28%|██████████▋                           | 9/32 [00:06<00:16,  1.38it/s][A
URLs:  31%|███████████▌                         | 10/32 [00:07<00:15,  1.41it/s

In [24]:
# Peek at the first few raw records returned by create_dataset.
print("Example of first 5 rows of output as dict:")
print(data[:5])

Example of first 5 rows of output as dict:
[{'url': 'https://drinkyoro.com', 'article': "We're interrupting your routine of dragging through the day and relying on stimulants to stay in the flow. When you pick up a can, you're choosing your state of being with our all-natural, 15 calorie drink that actually tastes good. Never ride the wave. Be the wave."}, {'url': 'https://drinkyoro.com/blogs/news', 'article': 'Skip to content\nWhat is Yoro?\nThe Flavors\nThe Flavors\nSurge\nSurf\nShop\nShop\nBuy Yoro\nFitness Gear\nAbout Us\nWholesale\nFAQ\nFind A Store\nLog in\nWhat is Yoro?\nThe Flavors\nSurge\nSurf\nShop\nBuy Yoro\nFitness Gear\nAbout Us\nWholesale\nFAQ\nFind A Store\nSearch\nLog in\nCart\nItem added to your cart\nCheck out\nContinue shopping\nNews\nChoosing a selection results in a full page refresh.'}, {'url': 'https://drinkyoro.com/collections/all-products', 'article': 'Filter:\n4 products\nThe highest price is $60.00'}, {'url': 'https://drinkyoro.com/collections/buy-yoro', 'art

In [26]:
# Same preview from the DataFrame; head() shows five rows by default.
print("Example of first 5 rows of output as df:")
df.head()

Example of first 5 rows of output as df:


Unnamed: 0,url,article
0,https://drinkyoro.com,We're interrupting your routine of dragging th...
1,https://drinkyoro.com/blogs/news,Skip to content\nWhat is Yoro?\nThe Flavors\nT...
2,https://drinkyoro.com/collections/all-products,Filter:\n4 products\nThe highest price is $60.00
3,https://drinkyoro.com/collections/buy-yoro,Filter:\n4 products\nThe highest price is $60.00
4,https://drinkyoro.com/collections/gear,Filter:\n2 products\nThe highest price is $51.00


In [10]:
# Save the scraped articles; the DataFrame index is written as the first CSV column.
df.to_csv("yoro.csv", index=True)