# Web Scrape
When I was first looking at the original dataset, I found that many of the entries were missing a data channel label. So I decided I'd look into it myself and scrape the data channels, as well as the publish dates, of each article.

In [38]:
# The usual
import numpy as np
import pandas as pd
from datetime import datetime
# BeautifulSoup for the Soul
import requests
from bs4 import BeautifulSoup
# Scrape was taking too long so multithreading!
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio

# Patch asyncio so that run_until_complete() can be called later in this notebook;
# presumably required because Jupyter already has an event loop running (TODO confirm).
nest_asyncio.apply()

# The original dataset; its 'url' column supplies the articles to scrape.
raw = pd.read_csv("OnlineNewsPopularity.csv")

Since there are so many articles, I used multithreading to cut down on the scraping time.

In [39]:
def _record_result(url, channel, date, n):
    '''Store one article's (channel, date) pair in the global new_channels dictionary
    under url, then redraw the single-line progress readout.'''
    new_channels[url] = (channel, date)
    print('\r%.3f%%' % (len(new_channels)/n * 100),end="")


def fetch(session,url,n,timeout=30):
    '''Scrape the data channel and publish date of a single article.

    The result is saved in the global new_channels dictionary as
    new_channels[url] = (channel, date). The key is always the url that was
    passed in, even when the page had to be fetched from the fallback
    '/article/' address, so the results can be matched back to the dataset.

    Parameters:
        session: accepted for interface compatibility; currently unused.
            NOTE(review): every request goes through requests.get, because sharing
            one requests.Session across worker threads is not a documented
            guarantee of the library -- confirm before switching to session.get.
        url: address of the article to scrape.
        n: total number of urls being scraped; only used for the progress readout.
        timeout: seconds to wait for each HTTP request before giving up.

    Fallback values:
        'Unlabeled' is stored as the channel and np.nan as the date whenever the
        page cannot be downloaded or parsed, has no <h1>, or lacks the element
        that carries the respective value.
    '''
    try:
        soup = BeautifulSoup(requests.get(url, timeout=timeout).text, 'html.parser')

        # At some point Mashable changed the style of their article urls, replacing
        # the date with the word 'article'. If the original page does not exist
        # (Mashable serves a "The Bad News" page), make the replacement and retry.
        # url[:20] is 'http://mashable.com/' and url[20:30] is the 'YYYY/MM/DD' part.
        if str(soup.find_all('h1')[0]) == '<h1>The Bad News</h1>':
            retry_url = url[:20]+'article'+url[30:]
            soup = BeautifulSoup(requests.get(retry_url, timeout=timeout).text, 'html.parser')
    except Exception:
        # Best effort: a network failure, a timeout or a page without any <h1>
        # must not abort the whole scrape, so record placeholders and move on.
        _record_result(url, 'Unlabeled', np.nan, n)
        return

    # Find the labeled data channel. If it can't be found, use the entry "Unlabeled"
    article = soup.find('article', {'class':'full post story'})
    channel = article.get('data-channel', 'Unlabeled') if article is not None else 'Unlabeled'

    # Find the publish date. If it can't be found, use np.nan instead.
    date = np.nan
    info = soup.find('div', {'class':'article-info'})
    time_tag = info.find('time') if info is not None else None
    if time_tag is not None and time_tag.has_attr('datetime'):
        # Keep only the leading 'Ddd, DD Mon YYYY ' part of the timestamp (17 characters)
        date = time_tag['datetime'][:17]

    # Save the result and update the progress readout
    _record_result(url, channel, date, n)

async def get_data_asynchronus(url_list, max_workers=40):
    '''Run fetch once per url on a pool of worker threads and wait for all of them.

    Parameters:
        url_list: sized iterable of article urls; its length is passed to fetch
            as the total used for the progress readout.
        max_workers: number of threads in the pool (defaults to 40).

    Nothing is returned: fetch stores its results itself. An exception raised by
    any fetch call is re-raised here by asyncio.gather.
    '''
    total = len(url_list)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            # One executor job per url; each job is wrapped in an awaitable future
            tasks = [
                loop.run_in_executor(executor, fetch, session, url, total)
                for url in url_list
            ]
            await asyncio.gather(*tasks)

def main(url_list):
    '''Scrape every url in url_list, blocking until the whole batch has finished,
    then print a completion marker after the progress readout.'''
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(get_data_asynchronus(url_list))
    )
    print("   [Process Complete]")

In [40]:
# Start from an empty results dictionary (fetch fills it in), then launch the scrape
new_channels = {}
main(raw['url'])

100.000%   [Process Complete]


Turn the results dictionary into a pandas DataFrame so that it can be saved as a CSV file.

In [52]:
# One row per scraped url: the dictionary keys become the 'url' column and the
# (channel, date) tuples become the other two columns
updated = pd.DataFrame(new_channels).T.reset_index()
updated.columns = ['url', 'data_channel','date']

# Separate the time stamp at its first comma: weekday on the left, date on the right
stamp_parts = updated['date'].str.split(",", n = 1, expand = True)
updated['weekday'] = stamp_parts[0]
updated['date'] = pd.to_datetime(stamp_parts[1])

# Save the result as a csv
updated.to_csv('Updated_Data.csv',index=False)

In [53]:
# Preview the first rows of the scrape results
updated.head()

Unnamed: 0,url,data_channel,date,weekday
0,http://mashable.com/2013/01/07/astronaut-notre...,Entertainment,2013-01-07,Mon
1,http://mashable.com/2013/01/07/amazon-instant-...,Entertainment,2013-01-07,Mon
2,http://mashable.com/2013/01/07/crayon-creatures/,Entertainment,2013-01-07,Mon
3,http://mashable.com/2013/01/07/beewi-smart-toys/,Tech,2013-01-07,Mon
4,http://mashable.com/2013/01/07/att-u-verse-apps/,Tech,2013-01-07,Mon
