In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os
import time

In [2]:
urls = {
    'Towards Data Science': 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}'
    # 'UX Collective': 'https://uxdesign.cc/archive/{0}/{1:02d}/{2:02d}',
    # 'The Startup': 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}',
    # 'The Writing Cooperative': 'https://writingcooperative.com/archive/{0}/{1:02d}/{2:02d}',
    # 'Data Driven Investor': 'https://medium.com/datadriveninvestor/archive/{0}/{1:02d}/{2:02d}',
    # 'Better Humans': 'https://medium.com/better-humans/archive/{0}/{1:02d}/{2:02d}',
    # 'Better Marketing': 'https://medium.com/better-marketing/archive/{0}/{1:02d}/{2:02d}',
}

In [3]:
def is_leap(year):
    if year % 4 != 0:
        return False
    elif year % 100 != 0:
        return True
    elif year % 400 != 0:
        return False
    else:
        return True
    
def convert_day(day, year):
    month_days = [31, 29 if is_leap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    m = 0
    d = 0
    while day > 0:
        d = day
        day -= month_days[m]
        m += 1
    return (m, d)

def get_claps(claps_str):
    if (claps_str is None) or (claps_str == '') or (claps_str.split is None):
        return 0
    split = claps_str.split('K')
    claps = float(split[0])
    claps = int(claps*1000) if len(split) == 2 else int(claps)
    return claps

def get_img(img_url, dest_folder, dest_filename):
    ext = img_url.split('.')[-1]
    if len(ext) > 4:
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'
    with open(f'{dest_folder}/{dest_filename}', 'wb') as f:
        f.write(requests.get(img_url, allow_redirects=False).content)
    return dest_filename

In [4]:
year = 2022
selected_days = [i for i in range(1, 367 if is_leap(year) else 366)]

In [5]:
img_dir = 'images'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

In [6]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed

data = []
article_id = 0
i = 0
n = len(selected_days)

@retry(stop=stop_after_attempt(5), wait=wait_fixed(10))
def make_request(url):
    response = requests.get(url, allow_redirects=True)
    response.raise_for_status()
    return response

for d in selected_days:
    i += 1
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        try:
            response = make_request(url.format(year, month, day))
        except requests.exceptions.HTTPError as errh:
            print("HTTP Error:", errh)
            continue
        except requests.exceptions.ConnectionError as errc:
            print("Connection Error:", errc)
            continue
        except requests.exceptions.Timeout as errt:
            print("Timeout Error:", errt)
            continue
        except requests.exceptions.RequestException as err:
            print("Request Exception:", err)
            continue
        page = response.content
        soup = BeautifulSoup(page, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        # print(articles)
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            title = title.contents[0]
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.contents[0] if subtitle is not None else ''
            image = article.find("img", class_="graf-image")
            image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')
            article_url = article.find_all("a")[3]['href'].split('?')[0]
            if len(article.find_all("button")) == 0:
                claps = get_claps('99999')
            else:
                claps = get_claps(article.find_all("button")[1].contents[0])
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
            responses = article.find_all("a")
            if len(responses) == 7:
                responses = responses[6].contents[0].split(' ')
                if len(responses) == 0:
                    responses = 0
                else:
                    responses = responses[0]
            else:
                responses = 0

            data.append([article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])

1 / 365 ; 2022-01-01
2 / 365 ; 2022-01-02
3 / 365 ; 2022-01-03
4 / 365 ; 2022-01-04
5 / 365 ; 2022-01-05
6 / 365 ; 2022-01-06
7 / 365 ; 2022-01-07
8 / 365 ; 2022-01-08
9 / 365 ; 2022-01-09
10 / 365 ; 2022-01-10
11 / 365 ; 2022-01-11
12 / 365 ; 2022-01-12
13 / 365 ; 2022-01-13
14 / 365 ; 2022-01-14
15 / 365 ; 2022-01-15
16 / 365 ; 2022-01-16
17 / 365 ; 2022-01-17
18 / 365 ; 2022-01-18
19 / 365 ; 2022-01-19
20 / 365 ; 2022-01-20
21 / 365 ; 2022-01-21
22 / 365 ; 2022-01-22
23 / 365 ; 2022-01-23
24 / 365 ; 2022-01-24
25 / 365 ; 2022-01-25
26 / 365 ; 2022-01-26
27 / 365 ; 2022-01-27
28 / 365 ; 2022-01-28
29 / 365 ; 2022-01-29
30 / 365 ; 2022-01-30
31 / 365 ; 2022-01-31
32 / 365 ; 2022-02-01
33 / 365 ; 2022-02-02
34 / 365 ; 2022-02-03
35 / 365 ; 2022-02-04
36 / 365 ; 2022-02-05
37 / 365 ; 2022-02-06
38 / 365 ; 2022-02-07
39 / 365 ; 2022-02-08
40 / 365 ; 2022-02-09
41 / 365 ; 2022-02-10
42 / 365 ; 2022-02-11
43 / 365 ; 2022-02-12
44 / 365 ; 2022-02-13
45 / 365 ; 2022-02-14
46 / 365 ; 2022-02-

RetryError: RetryError[<Future at 0x7fe537834220 state=finished raised HTTPError>]

In [7]:
medium_df = pd.DataFrame(data, columns=[
    'id', 'url', 'title', 'subtitle',
    'image', 'claps', 'responses',
    'reading_time', 'publication', 'date'])

In [8]:
medium_df.shape

(5840, 10)

In [9]:
medium_df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/5-ways-to-deal-...,3 ways to deal with large datasets in Python,,1.jpeg,284,2,3,Towards Data Science,2022-01-01
1,2,https://towardsdatascience.com/from-supervised...,From Supervised To Unsupervised Learning: A Pa...,Slowly removing the…,2.png,261,1,6,Towards Data Science,2022-01-01
2,3,https://towardsdatascience.com/top-python-libr...,Top Python Libraries for Visualization: A Star...,"[The guide to plotting scatter plots, heat map...",,102,1,8,Towards Data Science,2022-01-01
3,4,https://towardsdatascience.com/simple-method-o...,Simple method of targeted TF-IDF topic modelin...,Using a targeted TF-IDF Topic…,4.jpg,21,0,5,Towards Data Science,2022-01-01
4,5,https://towardsdatascience.com/optimizing-pati...,Optimizing Patient Scheduling,Efficient Clinic Flow and Reduced…,5.png,99999,0,10,Towards Data Science,2022-01-01


In [11]:
print(medium_df[medium_df["date"] == '2022-10-30'])

        id                                                url  \
5760  5761  https://towardsdatascience.com/how-to-ingest-a...   
5761  5762  https://towardsdatascience.com/6-looker-tips-t...   
5762  5763  https://towardsdatascience.com/the-essence-of-...   
5763  5764  https://towardsdatascience.com/a-quick-guide-o...   

                                                  title  \
5760  How to Ingest and Consume Data from Azure Data...   
5761  6 Looker Tips That Will Power Up Your Next Dat...   
5762                 The essence of drawing Sankey Plot   
5763  A Quick Guide on How to Safely Store and Retri...   

                                               subtitle     image  claps  \
5760        Analysis on ingestion/consumption patterns…  5761.png     51   
5761           Explore six tips from basic to advanced…  5762.jpg    101   
5762  I would like to share two approaches to genera...  5763.png     90   
5763                                                     5764.jpg    103  

In [12]:
medium_df.to_csv('data/medium_data.csv', index=False)

In [19]:
medium_df.iloc[:300]

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/5-ways-to-deal-...,3 ways to deal with large datasets in Python,,1.jpeg,284,2,3,Towards Data Science,2022-01-01
1,2,https://towardsdatascience.com/from-supervised...,From Supervised To Unsupervised Learning: A Pa...,Slowly removing the…,2.png,261,1,6,Towards Data Science,2022-01-01
2,3,https://towardsdatascience.com/top-python-libr...,Top Python Libraries for Visualization: A Star...,"[The guide to plotting scatter plots, heat map...",,102,1,8,Towards Data Science,2022-01-01
3,4,https://towardsdatascience.com/simple-method-o...,Simple method of targeted TF-IDF topic modelin...,Using a targeted TF-IDF Topic…,4.jpg,21,0,5,Towards Data Science,2022-01-01
4,5,https://towardsdatascience.com/optimizing-pati...,Optimizing Patient Scheduling,Efficient Clinic Flow and Reduced…,5.png,99999,0,10,Towards Data Science,2022-01-01
...,...,...,...,...,...,...,...,...,...,...
295,296,https://towardsdatascience.com/dive-into-moder...,Dive into Modern Web Deployment: Quickstart Gu...,,296.jpg,151,0,5,Towards Data Science,2022-01-10
296,297,https://towardsdatascience.com/the-best-data-i...,"The Best Data is Free Data, Of Course",Access a bounty of open data with the Socrata ...,297.jpg,34,0,5,Towards Data Science,2022-01-10
297,298,https://towardsdatascience.com/the-future-of-t...,The Future of the Modern Data Stack in 2022,Featuring the 6 big ideas you should know from...,298.jpeg,1300,9,13,Towards Data Science,2022-01-11
298,299,https://towardsdatascience.com/comparing-robus...,"Comparing Robustness of MAE, MSE and RMSE",How the main regression metrics behave in the ...,299.jpeg,138,1,6,Towards Data Science,2022-01-11
