## Add imports

In [19]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed

## Specify article urls

In [20]:
urls = {
    'Towards Data Science': 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'
}

## Create image folder

In [21]:
img_dir = 'images'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

In [None]:
def is_leap(year):
    if year % 4 != 0:
        return False
    elif year % 100 != 0:
        return True
    elif year % 400 != 0:
        return False
    else:
        return True

## Create WebScraper class

In [22]:
class WebScraper:
    # initial values to specify
    def __init__(self, selected_days, urls, year):
        self.selected_days = selected_days
        self.urls = urls
        self.year = year
        self.data = []
        self.article_id = 0

    @retry(stop=stop_after_attempt(5), wait=wait_fixed(10)) # retry if request fails, wait for 10s, try 5 times
    # make request to url to get response
    def make_request(self, url):
        response = requests.get(url, allow_redirects=True)
        response.raise_for_status()
        return response

    # convert 'day and year' to 'month and day'
    def convert_day(self, day, year):
        month_days = [31, 29 if is_leap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
        m = 0
        d = 0
        while day > 0:
            d = day
            day -= month_days[m]
            m += 1
        return (m, d)

    # get the image and save it to the destination folder
    def get_img(self, img_url, dest_folder, dest_filename):
        ext = img_url.split('.')[-1]
        if len(ext) > 4:
            ext = 'jpg'
        dest_filename = f'{dest_filename}.{ext}'
        with open(f'{dest_folder}/{dest_filename}', 'wb') as f:
            f.write(requests.get(img_url, allow_redirects=False).content)
        return dest_filename

    # get the number of claps for the article
    def get_claps(self, claps_str):
        if (claps_str is None) or (claps_str == '') or (claps_str.split is None):
            return 0
        split = claps_str.split('K')
        claps = float(split[0])
        claps = int(claps*1000) if len(split) == 2 else int(claps)
        return claps

    # main function to do the web scraping, looping through each article on each day and saving to self.data
    def scrape(self):
        i = 0
        n = len(self.selected_days)

        for d in self.selected_days: # loop through days
            i += 1
            month, day = self.convert_day(d, self.year)
            date = '{0}-{1:02d}-{2:02d}'.format(self.year, month, day)
            print(f'{i} / {n} ; {date}')

            for publication, url in self.urls.items():
                try:
                    response = self.make_request(url.format(self.year, month, day))
                except requests.exceptions.HTTPError as errh:
                    print("HTTP Error:", errh)
                    continue
                except requests.exceptions.ConnectionError as errc:
                    print("Connection Error:", errc)
                    continue
                except requests.exceptions.Timeout as errt:
                    print("Timeout Error:", errt)
                    continue
                except requests.exceptions.RequestException as err:
                    print("Request Exception:", err)
                    continue

                page = response.content
                soup = BeautifulSoup(page, 'html.parser')
                articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

                for article in articles: # loop through articles on that day
                    title = article.find("h3", class_="graf--title")
                    if title is None:
                        continue
                    title = title.contents[0]
                    self.article_id += 1
                    subtitle = article.find("h4", class_="graf--subtitle")
                    subtitle = subtitle.contents[0] if subtitle is not None else ''
                    image = article.find("img", class_="graf-image")
                    image = '' if image is None else self.get_img(image['src'], 'images', f'{self.article_id}')
                    article_url = article.find_all("a")[3]['href'].split('?')[0]
                    if len(article.find_all("button")) == 0:
                        claps = self.get_claps('99999')
                    else:
                        claps = self.get_claps(article.find_all("button")[1].contents[0])
                    reading_time = article.find("span", class_="readingTime")
                    reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
                    responses = article.find_all("a")
                    if len(responses) == 7:
                        responses = responses[6].contents[0].split(' ')
                        if len(responses) == 0:
                            responses = 0
                        else:
                            responses = responses[0]

                    # append data to self.data
                    self.data.append([self.article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])



In [23]:
year = 2022 # specify year
selected_days = [i for i in range(1, 367 if is_leap(year) else 366)] # specify number of days (e.g. first and second day of the year would => [1,2])

In [24]:
WebScraperClass = WebScraper(selected_days, urls, year) # create WebScraper class

WebScraperClass.scrape() # scrape articles

1 / 1 ; 2022-01-01


In [25]:
data = WebScraperClass.data # save data into a 'data'

In [26]:
# only keep columns needed
medium_df = pd.DataFrame(data, columns=[
    'id', 'url', 'title', 'subtitle',
    'image', 'claps', 'responses',
    'reading_time', 'publication', 'date'])

In [27]:
# check shape of data
medium_df.shape

(10, 10)

In [28]:
# check data head
medium_df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/5-ways-to-deal-...,3 ways to deal with large datasets in Python,,1.jpeg,284,2,3,Towards Data Science,2022-01-01
1,2,https://towardsdatascience.com/from-supervised...,From Supervised To Unsupervised Learning: A Pa...,Slowly removing the…,2.png,261,1,6,Towards Data Science,2022-01-01
2,3,https://towardsdatascience.com/top-python-libr...,Top Python Libraries for Visualization: A Star...,"[The guide to plotting scatter plots, heat map...",,102,1,8,Towards Data Science,2022-01-01
3,4,https://towardsdatascience.com/simple-method-o...,Simple method of targeted TF-IDF topic modelin...,Using a targeted TF-IDF Topic…,4.jpg,21,"[[[]], [Kenneth Hua], [Towards Data Science], ...",5,Towards Data Science,2022-01-01
4,5,https://towardsdatascience.com/optimizing-pati...,Optimizing Patient Scheduling,Efficient Clinic Flow and Reduced…,5.png,99999,"[[[]], [Gabe Verzino], [Towards Data Science],...",10,Towards Data Science,2022-01-01


In [29]:
# write data to csv file
medium_df.to_csv('data/medium_data.csv', index=False)