# Web Scrape
The original dataset was missing some features, so I decided to scrape some of the article features myself. The process took a long time, so I added multithreading to speed it up.

In [1]:
# imports
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Multithreading
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor

import nest_asyncio

nest_asyncio.apply()

In [2]:
# Guards the shared `updated` frame. fetch runs on many worker threads at once,
# and an unguarded "read, append, reassign" on a global can silently drop rows
# when two threads interleave.
_UPDATED_LOCK = threading.Lock()


def _record(url, channel, date, title, keywords):
    """Append one scraped row to the global ``updated`` frame and print progress."""
    global updated

    row = pd.DataFrame({col_names[0]: [url],
                        col_names[1]: [channel],
                        col_names[2]: [date],
                        col_names[3]: [title],
                        col_names[4]: [keywords]}).set_index('url')
    with _UPDATED_LOCK:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        updated = pd.concat([updated, row])
        done = len(updated)

    # progress tracker
    print('\r%.3f%%' % (done/n * 100), end="")


def fetch(session, url):
    """Scrape one article page and record its features in the global ``updated``.

    Takes a url and finds the data channel, publish date, article title, and
    article keywords. Any feature that cannot be located falls back to a
    placeholder ('Unlabeled', NaN, 'Untitled', NaN respectively).

    Args:
        session: Object with a ``get(url)`` method (the caller passes a
            ``requests.Session``) used for the HTTP requests. ``None`` falls
            back to module-level ``requests.get``.
        url: Article URL to scrape; it also becomes the row's index value.

    Returns:
        None. The result is appended to the module-level ``updated`` frame,
        which relies on the module-level ``col_names`` and ``n``.
    """
    # Use the session handed in by the caller so connections can be reused.
    http = requests if session is None else session

    html = http.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    try:
        # At some point mashable changed the style of their article urls; this
        # checks whether an error page comes up and if so makes the required
        # replacement (the 10 characters after the first 20 are swapped for
        # 'article' -- presumably the date segment of the old url style).
        if str(soup.findAll('h1')[0]) == '<h1>The Bad News</h1>':
            url_sub = url[:20] + 'article' + url[30:]
            html = http.get(url_sub).text
            soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        # No <h1> on the page, or the retry failed: record a placeholder row so
        # the url still counts as processed.
        _record(url, 'Unlabeled', np.nan, np.nan, np.nan)
        return

    # Find the labeled data channel.
    try:
        channel = soup.findAll('article', {'class': 'full post story'})[0]['data-channel']
    except (IndexError, KeyError):
        channel = 'Unlabeled'

    # Find the publish date.
    try:
        time_tag = soup.findAll('div', {'class': 'article-info'})[0].findAll('time')[0]
        # Fixed-width slice of the tag's markup; presumably the start of the
        # datetime attribute value -- verify against the page markup.
        date = str(time_tag)[16:33]
    except IndexError:
        date = np.nan

    # Find the article title (kept as the parsed <title> element, not plain text).
    try:
        title = soup.findAll('title')[0]
    except IndexError:
        title = 'Untitled'

    # Find article keywords (the <a> elements inside the topics footer).
    try:
        keywords = soup.findAll('footer', {'class': 'article-topics'})[0].findAll('a')
    except IndexError:
        keywords = np.nan

    _record(url, channel, date, title, keywords)


async def get_data_asynchronus(url_list):
    """Run ``fetch`` for every url in ``url_list`` on a 40-worker thread pool.

    One ``requests.Session`` is opened for the whole batch and passed to each
    ``fetch`` call. The coroutine finishes once every call has completed; the
    values returned by ``fetch`` are not used.
    """
    with ThreadPoolExecutor(max_workers=40) as pool, requests.Session() as session:
        event_loop = asyncio.get_event_loop()

        # Schedule one blocking fetch per url on the pool.
        pending = []
        for url in url_list:
            pending.append(event_loop.run_in_executor(pool, fetch, session, url))

        # Wait for all of them; the gathered results are discarded.
        await asyncio.gather(*pending)

def main(url_list):
    """Scrape every url in ``url_list``, blocking until the whole batch is done."""
    event_loop = asyncio.get_event_loop()
    # Wrap the coroutine in a future and drive the loop until it resolves.
    scrape_job = asyncio.ensure_future(get_data_asynchronus(url_list))
    event_loop.run_until_complete(scrape_job)
    print("   [Process Complete]")

In [3]:
# Load the original dataset; its row count is the denominator of the progress tracker.
raw = pd.read_csv("../OnlineNewsPopularity.csv")
n = len(raw)

if 'Updates.csv' in os.listdir():
    # Resume: reuse the rows (and column names) saved by an earlier run.
    updated = pd.read_csv('Updates.csv')
    col_names = list(updated.columns)
    updated = updated.set_index('url')
else:
    # Fresh start: an empty frame indexed by url.
    col_names = ['url','channel','date','title','keywords']
    updated = pd.DataFrame(columns=col_names).set_index('url')

# Urls from the original dataset that do not have a row in `updated` yet.
legend = raw.set_index('url').drop(index=updated.index).reset_index()['url']

In [4]:
# Scrape every url still listed in `legend`; rows accumulate in the global `updated` frame.
main(legend)

   [Process Complete]


In [5]:
# Save the scraped rows; the setup cell reads this file back on a later run to skip urls already done.
updated.to_csv('Updates.csv')