# Web Scrape
When I was first looking at the original dataset, I found that many of the entries were missing a data channel label. So I decided I'd look into it myself and thought I'd scrape the dates each article was published too. 

In [1]:
# The usual
import numpy as np
import pandas as pd
from datetime import datetime
# BeautifulSoup for the Soul: web scraping modules
import requests
from bs4 import BeautifulSoup
# Scrape was taking too long so multithreading!
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

# The notebook kernel already runs an asyncio event loop; nest_asyncio patches asyncio so that
# loop.run_until_complete() can still be called from a cell.
nest_asyncio.apply()

# Load the original dataset; its 'url' column is the list of article pages that gets scraped.
raw = pd.read_csv("OnlineNewsPopularity.csv")

Start by defining `fetch`, which scrapes a single article, and then set up the multithreaded process that runs it over every url.

In [2]:
def fetch(session, url, timeout=30, total=39644):
    '''Fetch uses the url of the article to get its html and looks up the article's data channel and
    publish date. Some of these articles don't specify that information using the same html as the
    others, or have since been removed; those get the channel 'Unlabeled' and/or NaN for the date.
    The result is saved in the dictionary new_channels as (channel, date), always under the url
    exactly as it was passed in, so every entry can be matched back to its row in the dataset even
    when a rewritten url had to be requested.

    session -- accepted because the caller passes one, but not used: every request is made with
               requests.get.
    url     -- article url taken from the dataset.
    timeout -- seconds to wait on each request before giving up, so that a stalled connection
               cannot block a worker thread forever.
    total   -- number of urls being scraped; only used to scale the progress percentage.

    Returns None. A page that cannot be downloaded is recorded as ('Unlabeled', NaN) instead of
    raising, so one bad url does not abort the whole scrape.'''

    # Defaults used whenever the page, the channel or the date cannot be found.
    channel, date = 'Unlabeled', np.nan

    try:
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html, 'html.parser')

        # It seems that the style of url changed at some point for some articles. The url from the
        # dataset includes the date but now this date has been replaced by the word 'article'. So if
        # I initially get the 'this page is missing' page, I make the change and see if I get
        # anything different. The rewritten url is only used for the request, never as the key.
        if str(soup.find_all('h1')[0]) == '<h1>The Bad News</h1>':
            retry_url = url[:20]+'article'+url[30:]
            html = requests.get(retry_url, timeout=timeout).text
            soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        # Best effort: a failed request or a page without any <h1> is treated as unreadable and
        # keeps the defaults.
        soup = None

    if soup is not None:
        # Find the labeled data channel, if it can't be found then keep "Unlabeled"
        try:
            channel = soup.find_all('article',{'class':'full post story'})[0]['data-channel']
        except (IndexError, KeyError):
            pass

        # Find the publish date, if it can't be found then keep NaN
        try:
            date = soup.find_all('time')[0]['datetime']
        except (IndexError, KeyError):
            pass

    # create a dictionary entry to save the results, keyed by the url from the dataset
    new_channels[url] = (channel,date)

    # A progress bar, just to tell you how much has been completed.
    print('\r%.3f%%' % (len(new_channels)/total * 100),end="")

async def get_data_asynchronus(url_list):
    '''Run fetch once for every url in url_list on a pool of 40 worker threads and wait until all
    of the calls have finished. Whatever fetch returns is ignored.'''
    with ThreadPoolExecutor(max_workers=40) as pool, requests.Session() as session:
        event_loop = asyncio.get_event_loop()

        # Hand one fetch call per url to the thread pool.
        pending = []
        for url in url_list:
            pending.append(event_loop.run_in_executor(pool, fetch, session, url))

        # Wait for every call to finish.
        await asyncio.gather(*pending)

def main(url_list):
    '''Scrape every url in url_list: run get_data_asynchronus on the current event loop until it
    completes, then print a completion message.'''
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(asyncio.ensure_future(get_data_asynchronus(url_list)))
    print("   [Process Complete]")

In [3]:
# Start from an empty new_channels dictionary (fetch fills it in), then scrape every url in the dataset.
new_channels = {}
main(raw['url'])

100.000%   [Process Complete]


Turn that weird dictionary into a pandas dataframe so I can turn it into a csv.

In [4]:
# Unpack the dictionary: the keys are the urls, the values are (channel, date) pairs.
urls = list(new_channels)
scraped_columns = list(zip(*new_channels.values()))
channels = scraped_columns[0]
dates = scraped_columns[1]

# Collect the three columns in a pandas dataframe.
to_csv = pd.DataFrame({'url': urls, 'channel': channels, 'date': dates})

# Separate the time stamp at its first comma: the part before it is the weekday, the rest is
# parsed as the date.
stamp_parts = to_csv['date'].str.split(",", n=1, expand=True)
to_csv['weekday'] = stamp_parts[0]
to_csv['date'] = pd.to_datetime(stamp_parts[1])

# Write the result out as a csv.
to_csv.to_csv('DatesAndChannels.csv', index=False)

In [5]:
# Sanity check: number of rows, and how many of them ended up with a date and weekday.
to_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      39644 non-null  object
 1   channel  39644 non-null  object
 2   date     39456 non-null  object
 3   weekday  39456 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB
