In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

# All publications below share the same archive path suffix; the three
# placeholders are filled with (year, month, day) via str.format.
_ARCHIVE_PATH = '/archive/{0}/{1:02d}/{2:02d}'

# Publication display name -> root URL of that publication.
_publication_roots = {
    'Towards Data Science': 'https://towardsdatascience.com',
    'UX Collective': 'https://uxdesign.cc',
    'The Startup': 'https://medium.com/swlh',
    'The Writing Cooperative': 'https://writingcooperative.com',
    'Data Driven Investor': 'https://medium.com/datadriveninvestor',
    'Better Humans': 'https://medium.com/better-humans',
    'Better Marketing': 'https://medium.com/better-marketing',
}

# Publication display name -> daily-archive URL template.
urls = {name: root + _ARCHIVE_PATH for name, root in _publication_roots.items()}

In [3]:
def convert_day(day, year=None):
    """Convert a 1-based day of the year into a ``(month, day_of_month)`` tuple.

    Args:
        day: Day of the year, 1..365 (1..366 when ``year`` is a leap year).
        year: Optional calendar year. When given, February has 29 days in
            leap years; when omitted, a 365-day year is assumed.

    Returns:
        Tuple ``(month, day_of_month)``, both 1-based.

    Raises:
        ValueError: If ``day`` is outside the valid range for the year.
    """
    month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    # Gregorian leap-year rule: divisible by 4, except centuries not divisible by 400.
    if year is not None and year % 4 == 0 and (year % 100 != 0 or year % 400 == 0):
        month_days[1] = 29

    days_in_year = sum(month_days)
    # Fail loudly instead of returning (0, 0) or running off the end of the table.
    if not 1 <= day <= days_in_year:
        raise ValueError(f'day must be between 1 and {days_in_year}, got {day}')

    remaining = day
    for month, length in enumerate(month_days, start=1):
        if remaining <= length:
            return (month, remaining)
        remaining -= length

In [4]:
def get_claps(claps_str):
    """Convert a clap-count label such as ``'176'`` or ``'1.2K'`` to an integer.

    Args:
        claps_str: Text of the clap counter. ``None``, empty/blank strings and
            non-string objects are treated as zero claps.

    Returns:
        The clap count as an ``int``. A trailing ``K`` multiplies by 1,000 and
        a trailing ``M`` by 1,000,000 (case-insensitive).

    Raises:
        ValueError: If the text is not a number with an optional K/M suffix.
    """
    # Anything that is not text counts as "no claps". The original guarded this
    # with `claps_str.split is None`, presumably to catch bs4 Tag objects.
    if not isinstance(claps_str, str):
        return 0
    text = claps_str.strip()
    if not text:
        return 0

    multipliers = {'K': 1_000, 'M': 1_000_000}
    factor = multipliers.get(text[-1].upper(), 1)
    if factor == 1:
        # Plain counts keep the original truncating conversion.
        return int(float(text))

    # Round rather than truncate: float('1.005') * 1000 is 1004.999..., and
    # int() would silently drop a clap.
    return round(float(text[:-1]) * factor)

In [5]:
# Sample 10 of the 365 days of the year. A dedicated, seeded generator makes
# the sample identical on every "Restart & Run All" without touching the
# global `random` state; change the seed to draw a different set of days.
selected_days = random.Random(42).sample(range(1, 366), 10)

When we access a URL of the form "https://mediumpublication/archive/year/month/day", there may be no article published on that day in that publication. In that case we are redirected to "https://mediumpublication/archive/year/month", which lists the top 10 most popular articles of that month — and that's not what we want. So whenever this happens, we simply skip the page.

In [6]:
# Scrape the daily archive page of every entry in `urls` for each sampled day
# and collect one row per article in `data`:
#   [id, url, title, subtitle, claps, responses, reading_time, publication, date]
# NOTE: this loop is repeated verbatim for other URL sets in this notebook;
# extracting it into a function would remove the duplication.
data = []
article_id = 0
year = 2021
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(archive_url, allow_redirects=True, timeout=30)
        except requests.RequestException as exc:
            # Report the failed page and keep the rows collected so far.
            print(f'  skipped {archive_url}: {exc}')
            continue
        # A day without articles redirects to the monthly archive; skip it.
        if not response.url.startswith(archive_url):
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all(
            "div",
            class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            # get_text() flattens nested markup into a plain string; the first
            # child alone may be a Tag rather than text.
            title = title.get_text()
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.get_text() if subtitle is not None else ''

            # Look the links and buttons up once and guard the positional
            # lookups so an unexpected card layout cannot abort the scrape.
            links = article.find_all("a")
            buttons = article.find_all("button")
            article_url = links[3].get('href', '').split('?')[0] if len(links) > 3 else ''
            clap_parts = buttons[1].contents if len(buttons) > 1 else []
            claps = get_claps(clap_parts[0]) if clap_parts else 0

            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

            # Presumably the 7th link reads like "3 responses" when present;
            # keep the leading count as an int so the column has a single type.
            responses = 0
            if len(links) == 7:
                words = links[6].get_text().split()
                if words and words[0].isdigit():
                    responses = int(words[0])

            data.append([article_id, article_url, title,
                         subtitle, claps, responses,
                         reading_time, publication, date])

1 / 10 ; 2021-04-13
2 / 10 ; 2021-08-09
3 / 10 ; 2021-03-08
4 / 10 ; 2021-03-29
5 / 10 ; 2021-01-13
6 / 10 ; 2021-12-16
7 / 10 ; 2021-12-14
8 / 10 ; 2021-12-08
9 / 10 ; 2021-11-07
10 / 10 ; 2021-10-08


In [8]:
# Turn the scraped rows into a table; the column order follows the layout of
# each row appended to `data`.
medium_df = pd.DataFrame(
    data=data,
    columns=[
        'id',
        'url',
        'title',
        'subtitle',
        'claps',
        'responses',
        'reading_time',
        'publication',
        'date',
    ],
)

In [10]:
# Dimensions of the scraped table as (rows, columns).
medium_df.shape

(343, 9)

In [11]:
# Preview the first rows to check the scraped fields.
medium_df.head()

Unnamed: 0,id,url,title,subtitle,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/nine-emerging-p...,[Nine Emerging Python Libraries You Should Add...,,176,1,6,Towards Data Science,2021-04-13
1,2,https://towardsdatascience.com/numpy-basics-ch...,"NumPy Basics Cheat Sheet (2021), Python for Da...",The absolute basics for beginners learning…,77,0,6,Towards Data Science,2021-04-13
2,3,https://towardsdatascience.com/5-data-science-...,5 Data Science Open-source Projects You to Con...,Enhance your skills and up…,32,0,5,Towards Data Science,2021-04-13
3,4,https://towardsdatascience.com/exploratory-dat...,"Exploratory Data Analysis, Visualization, and ...","Using Pandas, Matplotlib…",75,1,11,Towards Data Science,2021-04-13
4,5,https://towardsdatascience.com/26-datasets-for...,26 Datasets For Your Data Science Projects,A compilation of numerous task-based datasets ...,49,0,4,Towards Data Science,2021-04-13


In [None]:
# Write the scraped articles to a CSV file, leaving out the row index.
medium_df.to_csv(path_or_buf='medium_data.csv', index=False)

## User input: Publication name

In [12]:
# Slug of the publication as it appears in its medium.com URL.
pub_name = "analytics-vidhya"
# Daily-archive URL template; the placeholders are filled with (year, month, day).
pub_link = "".join(["https://medium.com/", pub_name, "/archive/{0}/{1:02d}/{2:02d}"])

print(pub_link)

https://medium.com/analytics-vidhya/archive/{0}/{1:02d}/{2:02d}


In [13]:
# Rebind `urls` to this single publication; the scraping loop reads whatever
# mapping `urls` currently holds.
urls = {'Analytics Vidhya': pub_link}

In [14]:
# Scrape the daily archive page of every entry in `urls` for each sampled day
# and collect one row per article in `data`:
#   [id, url, title, subtitle, claps, responses, reading_time, publication, date]
# NOTE: this loop is repeated verbatim for other URL sets in this notebook;
# extracting it into a function would remove the duplication.
data = []
article_id = 0
year = 2021
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(archive_url, allow_redirects=True, timeout=30)
        except requests.RequestException as exc:
            # Report the failed page and keep the rows collected so far.
            print(f'  skipped {archive_url}: {exc}')
            continue
        # A day without articles redirects to the monthly archive; skip it.
        if not response.url.startswith(archive_url):
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all(
            "div",
            class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            # get_text() flattens nested markup into a plain string; the first
            # child alone may be a Tag rather than text.
            title = title.get_text()
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.get_text() if subtitle is not None else ''

            # Look the links and buttons up once and guard the positional
            # lookups so an unexpected card layout cannot abort the scrape.
            links = article.find_all("a")
            buttons = article.find_all("button")
            article_url = links[3].get('href', '').split('?')[0] if len(links) > 3 else ''
            clap_parts = buttons[1].contents if len(buttons) > 1 else []
            claps = get_claps(clap_parts[0]) if clap_parts else 0

            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

            # Presumably the 7th link reads like "3 responses" when present;
            # keep the leading count as an int so the column has a single type.
            responses = 0
            if len(links) == 7:
                words = links[6].get_text().split()
                if words and words[0].isdigit():
                    responses = int(words[0])

            data.append([article_id, article_url, title,
                         subtitle, claps, responses,
                         reading_time, publication, date])

1 / 10 ; 2021-04-13
2 / 10 ; 2021-08-09
3 / 10 ; 2021-03-08
4 / 10 ; 2021-03-29
5 / 10 ; 2021-01-13
6 / 10 ; 2021-12-16
7 / 10 ; 2021-12-14
8 / 10 ; 2021-12-08
9 / 10 ; 2021-11-07
10 / 10 ; 2021-10-08


In [15]:
# Turn the scraped rows into a table; the column order follows the layout of
# each row appended to `data`.
av_df = pd.DataFrame(
    data=data,
    columns=[
        'id',
        'url',
        'title',
        'subtitle',
        'claps',
        'responses',
        'reading_time',
        'publication',
        'date',
    ],
)

In [16]:
# Dimensions of the scraped table as (rows, columns).
av_df.shape

(73, 9)

In [19]:
# Display the articles scraped for the user-chosen publication.
av_df

Unnamed: 0,id,url,title,subtitle,claps,responses,reading_time,publication,date
0,1,https://medium.com/analytics-vidhya/bengaluru-...,[Bengaluru House Price Prediction],Data Science Regression Project: Predicting Ho...,16,1,6,Analytics Vidhya,2021-04-13
1,2,https://medium.com/analytics-vidhya/copying-in...,Copying in Python (with examples),Understanding shallow and deep copy in python,3,0,3,Analytics Vidhya,2021-04-13
2,3,https://medium.com/analytics-vidhya/improving-...,Improving Customer Retention using Machine Lea...,,3,0,4,Analytics Vidhya,2021-04-13
3,4,https://medium.com/analytics-vidhya/3-steps-to...,3 Steps to get you started in Stock Market Ana...,,0,0,4,Analytics Vidhya,2021-04-13
4,5,https://medium.com/analytics-vidhya/5-websites...,5 Websites Where You Can Find Free Datasets fo...,,11,0,2,Analytics Vidhya,2021-04-13
...,...,...,...,...,...,...,...,...,...
68,69,https://medium.com/analytics-vidhya/predicting...,Predicting the future using Machine Learning p...,K-means and Elbow Method + implementation in…,5,0,4,Analytics Vidhya,2021-01-13
69,70,https://medium.com/analytics-vidhya/what-is-ma...,What is Machine Learning?,,0,0,2,Analytics Vidhya,2021-01-13
70,71,https://medium.com/analytics-vidhya/emotion-cl...,Emotion classification on Twitter Data Using T...,,1,0,5,Analytics Vidhya,2021-01-13
71,72,https://medium.com/analytics-vidhya/working-on...,Working on different projects with Python3 in ...,,0,0,2,Analytics Vidhya,2021-01-13


## User input: Tag

In [21]:
# Slug of the tag as it appears in its medium.com URL.
tag_name = "deep-learning"
# Daily-archive URL template; the placeholders are filled with (year, month, day).
tag_link = "".join(["https://medium.com/tag/", tag_name, "/archive/{0}/{1:02d}/{2:02d}"])

print(tag_link)

https://medium.com/tag/deep-learning/archive/{0}/{1:02d}/{2:02d}


In [22]:
# Rebind `urls` to this single tag archive; the scraping loop reads whatever
# mapping `urls` currently holds.
urls = {'Tag-1': tag_link}

In [23]:
# Scrape the daily archive page of every entry in `urls` for each sampled day
# and collect one row per article in `data`:
#   [id, url, title, subtitle, claps, responses, reading_time, publication, date]
# NOTE: this loop is repeated verbatim for other URL sets in this notebook;
# extracting it into a function would remove the duplication.
data = []
article_id = 0
year = 2021
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(archive_url, allow_redirects=True, timeout=30)
        except requests.RequestException as exc:
            # Report the failed page and keep the rows collected so far.
            print(f'  skipped {archive_url}: {exc}')
            continue
        # A day without articles redirects to the monthly archive; skip it.
        if not response.url.startswith(archive_url):
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all(
            "div",
            class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            # get_text() flattens nested markup into a plain string; the first
            # child alone may be a Tag rather than text.
            title = title.get_text()
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.get_text() if subtitle is not None else ''

            # Look the links and buttons up once and guard the positional
            # lookups so an unexpected card layout cannot abort the scrape.
            links = article.find_all("a")
            buttons = article.find_all("button")
            article_url = links[3].get('href', '').split('?')[0] if len(links) > 3 else ''
            clap_parts = buttons[1].contents if len(buttons) > 1 else []
            claps = get_claps(clap_parts[0]) if clap_parts else 0

            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

            # Presumably the 7th link reads like "3 responses" when present;
            # keep the leading count as an int so the column has a single type.
            responses = 0
            if len(links) == 7:
                words = links[6].get_text().split()
                if words and words[0].isdigit():
                    responses = int(words[0])

            data.append([article_id, article_url, title,
                         subtitle, claps, responses,
                         reading_time, publication, date])

1 / 10 ; 2021-04-13
2 / 10 ; 2021-08-09
3 / 10 ; 2021-03-08
4 / 10 ; 2021-03-29
5 / 10 ; 2021-01-13
6 / 10 ; 2021-12-16
7 / 10 ; 2021-12-14
8 / 10 ; 2021-12-08
9 / 10 ; 2021-11-07
10 / 10 ; 2021-10-08


In [24]:
# Turn the scraped rows into a table; the column order follows the layout of
# each row appended to `data`.
DL_df = pd.DataFrame(
    data=data,
    columns=[
        'id',
        'url',
        'title',
        'subtitle',
        'claps',
        'responses',
        'reading_time',
        'publication',
        'date',
    ],
)

In [25]:
# Dimensions of the scraped table as (rows, columns).
DL_df.shape

(134, 9)

In [26]:
# Display the articles scraped for the user-chosen tag.
DL_df

Unnamed: 0,id,url,title,subtitle,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/mastering-the-s...,[Mastering the shifts with variational autoenc...,How variational autoencoders can be used to an...,14,0,10,Tag-1,2021-04-13
1,2,https://medium.com/@hmix13/forecasting-stock-p...,Forecasting Stock Prices Using Stocker,,3,0,3,Tag-1,2021-04-13
2,3,https://towardsdatascience.com/putting-your-mo...,Putting Your Models Into Production,A guide to getting your deep learning model in...,10,0,4,Tag-1,2021-04-13
3,4,https://towardsdatascience.com/stratified-norm...,Stratified normalization: Using additional inf...,,13,1,5,Tag-1,2021-04-13
4,5,https://medium.com/offnote-labs/build-a-model-...,Build a model which can translate multiple Ind...,,50,3,10,Tag-1,2021-04-13
...,...,...,...,...,...,...,...,...,...
129,130,https://medium.com/@jean-charles-nigretto/mach...,Machine Learning in healthcare may not need as...,,0,0,4,Tag-1,2021-01-13
130,131,https://medium.com/@conanmoon/%EB%8D%B0%EC%9D%...,데이터과학 유망주의 매일 글쓰기 — 쉬어가는 시간 3–1,,0,0,11,Tag-1,2021-01-13
131,132,https://medium.com/@venturescanner/artificial-...,Artificial Intelligence 2020 Summary,,1,0,1,Tag-1,2021-01-13
132,133,https://medium.com/@eminmammadov/son-g%C3%BCnl...,[Şirkətlərin gizlilik siyasətinin sirri.],,0,0,2,Tag-1,2021-01-13
