In [1]:
import time
import sys
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
webdriver_path = r'C:\Users\chromedriver.exe'

In [2]:
# Selenium

def get_profile_urls(pub_url):
    
    ''' 
        :Params:
         pub_url - URL link of a medium publication i.e. https://medium.com/search/users?q=towards%20data%20science

        :Description:
         Scrapes links related to user profiles from a publication page. i.e. https://medium.com/@kozyrkov
         
        :Returns:
         Returns a list of user names and user profile urls
         
    '''

    # Store search results
    user_names = []
    user_urls = []

    # Path to webdriver
    browser = webdriver.Chrome(webdriver_path)

    # URL to scrape
    browser.get(pub_url)
    time.sleep(1)

    # Get body
    elem = browser.find_element_by_tag_name("body")

    # No. of times to scroll
    no_of_pagedowns = 100

    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.5)
        no_of_pagedowns-=1

    # Get tag
    a_tag = browser.find_elements_by_class_name("link.link--primary.u-accentColor--hoverTextNormal")

    for a in a_tag:
        user_names.append(a.text)
        user_urls.append(a.get_attribute("href"))

    browser.quit()

    print(f'No. of usernames: {len(user_names)}')
    print(user_names[-5:])
    print(f'No. of urls: {len(user_urls)}')
    print(user_urls[-5:])
    
    return user_names, user_urls

In [3]:
# Selenium

def get_writer_profile(browser,writer_profile_df,writer_profiles_col):
    
    ''' 
        :Params:
         browser - Selenium's browser session
         writer_profile_df - A pandas dataframe where new post entries are appended to
         writer_profiles_col - List of columns names in "writer_profile_df"

        :Description:
         "get_writer_profile" extracts information from each post, creates a new entry for "writer_profile_df" and appends it.
         Any posts that fails to be extracted is ignored and an error counter is kept.
         
        :Returns:
         Returns "writer_profile_df" with new appended entries and an error count
         
    '''
    
    # Initialize List
    user_name, user_profile_desc, user_followers, top_writer_flag = [],[],[],[]
        
    # Get user_name 
    match_tag = browser.find_element_by_tag_name("h1")
    user_name.append(match_tag.text)

    # Get user_profile_desc
    match_tag = browser.find_element_by_tag_name("p")
    user_profile_desc.append(match_tag.text)

    # Get user_followers
    try:
        match_tag = browser.find_element_by_partial_link_text("Followers")
        if match_tag.text.split()[0][-1]=="K":
            user_followers.append(float(match_tag.text.split()[0][:-1]) * 1000)
        else:
            user_followers.append(float(match_tag.text.split()[0]))
    except:
        user_followers.append(float(0))
        pass

    # Get top_writer_flag
    try:
        match_tag = browser.find_elements_by_tag_name("span")
        count=1
        for match in match_tag:
            if "Top writer" in match.text:
                top_writer_flag.append(float(1))
                break
            else:
                if count==5:
                    top_writer_flag.append(float(0))
                    break
                count+=1
    except:
        pass
    
    writer_profile ={
                     'user_name': user_name,
                     'user_profile_desc': user_profile_desc,
                     'user_followers': user_followers,
                     'top_writer_flag':top_writer_flag
                    }
    
    df_mismatch = 0
    try:
        # Create new entry
        create_new_entry = pd.DataFrame(writer_profile, columns = writer_profiles_col)

        # Appends new entry to posts_df
        writer_profile_df = writer_profile_df.append(create_new_entry, ignore_index=True)
    except:
        df_mismatch+=1
        pass
    
    return writer_profile_df, df_mismatch


In [4]:
# Beautifulsoup

def get_posts(browser,posts_df,post_details_col):
        
    ''' 
        :Params:
         browser - Selenium's browser session
         posts_df - A pandas dataframe where new post entries are appended to
         post_details_col - List of columns names in "posts_df"

        :Description:
         "get_posts" extracts information from each post, creates a new entry for "posts_df" and appends it.
         Any posts that fails to be extracted is ignored and an error counter is kept.
         
        :Returns:
         Returns "posts_df" with new appended entries and an error count
         
    '''
    
    # Switch to beautifulsoup for bulk of extraction
    page_content = BeautifulSoup(browser.page_source,"html.parser")

    # Loops through each post
    df_mismatch = 0
    ends_with_yc = re.compile(r'(..\s){5}y c')
    for row in page_content.find_all('div', class_=ends_with_yc):

        # Reset list
        user_name, title, publisher, claps, date_posted, read_time= [],[],[],[],[],[]

        # Search for title
        try:
            title_tag = row.find_all('h1')[0]
            title.append(title_tag.text)
        except:
            title.append("")
            pass

        # Search for user_name and publisher_name
        try:
            pub_tag = row.find_all('span')[0].find('div')
            publisher.append(' '.join(pub_tag.text.split()[pub_tag.text.split().index("in")+1:]))
            user_name.append(' '.join(pub_tag.text.split()[:pub_tag.text.split().index("in")]))
        except:
            publisher.append(pub_tag)
            user_name.append(pub_tag)
            pass

        # Search for claps
        try:
            claps_tag = row.find_all('h4')[0].text
            if claps_tag[-1]=="K":
                claps.append(float(claps_tag[:-1]) * 1000)
            else:
                claps.append(float(claps_tag))
        except:
            # Post with no claps do not have H4 tag
            claps.append(float(0))
            pass

        # Search for date_posted and read_time
        try:
            dp_rt_tag = row.find_all('span')[3].find('div')
            dp_tag = dp_rt_tag.text.split('·')[0]
            rt_tag = float(dp_rt_tag.text.split('·')[1].split()[0])
            date_posted.append(dp_tag.strip())
            read_time.append(rt_tag)
        except:
            pass

        # Post details
        post_details = {
                        'user_name': user_name,
                        'title': title,
                        'publisher': publisher,
                        'claps': claps,
                        'date_posted': date_posted,
                        'read_time':read_time
                        }

        try:
            # Create new entry
            create_new_entry = pd.DataFrame(post_details, columns = post_details_col)

            # Appends new entry to posts_df
            posts_df = posts_df.append(create_new_entry, ignore_index=True)
        except:
            df_mismatch+=1
            pass
    
    return posts_df, df_mismatch

In [5]:
# Selenium

def extract_information(url,posts_df,writer_profile_df,post_details_col,writer_profiles_col):
    
    ''' 
        :Params:
         url - A user's profile link i.e. https://medium.com/@kozyrkov
         posts_df - A pandas dataframe where new post entries are appended to
         writer_profile_df - A pandas dataframe where new profile entries are appended to
         post_details_col - List of column names in "posts_df"
         writer_profiles_col - List of column names in "writer_profile_df"

        :Description:
         Initilizes a Selenium browser for each URL recieved to being extraction process
         
        :Returns:
         Returns "posts_df" and "writer_profile_df" with new appended entries and a consolidated error count
         
    '''
    
    # Path to webdriver
    browser = webdriver.Chrome(webdriver_path)
    
    # URL to scrape
    browser.get(url)
    time.sleep(1)

    # Get body
    elem = browser.find_element_by_tag_name("body")

    # No. of times to scroll
    no_of_pagedowns = 100

    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns-=1
        
    # Get posts
    posts_df, error_count_post = get_posts(browser,posts_df,post_details_col)

    # Get profiles
    writer_profile_df, error_count_profiles = get_writer_profile(browser,writer_profile_df,writer_profiles_col)
    
    error_count = error_count_post + error_count_profiles;
    
    browser.quit()
    
    return posts_df, writer_profile_df, error_count

In [6]:
# url_list = ['https://medium.com/@kozyrkov','https://medium.com/@ssrosa','https://medium.com/@neha_mangal','https://medium.com/@parulnith']

# Get profile urls
_,url_list = get_profile_urls("https://medium.com/search/users?q=towards%20data%20science")

# Set column names
writer_profiles_col = ["user_name", "user_profile_desc", "user_followers", "top_writer_flag"]
post_details_col = ["user_name", "title", "publisher", "claps", "date_posted", "read_time"]

# Initalize empty dfs
writer_profile_df = pd.DataFrame(None, columns = writer_profiles_col)
posts_df = pd.DataFrame(None, columns = post_details_col)

# Loop through URL list
t0 = datetime.now()
time_counter = 0
error_count = 0
save_state = 0
for url in url_list:
    
    time_counter += 1
    sys.stdout.write("Processed: %s / %s \r" % (time_counter, len(url_list)))
    sys.stdout.flush()
    
    posts_df, writer_profile_df, error_retrieved = extract_information(url,posts_df,writer_profile_df,post_details_col,writer_profiles_col)
    
    error_count += error_retrieved
    
    # save to csv every 50 urls
    if save_state%50==1:
        # Write to CSVs
        writer_profile_df.to_csv(r'C:\Users\tds_scrape\writer_profile_df.csv')
        posts_df.to_csv(r'C:\Users\tds_scrape\posts_df.csv')
    
    save_state+=1
    
    print("Processed: %s / %s -- Elapse Time: %s" % (time_counter,len(url_list),datetime.now()-t0))
    print(f"Errors due to input mismatch: {error_count}")
    
# Write to CSVs
writer_profile_df.to_csv(r'C:\Users\writer_profile_df.csv')
posts_df.to_csv(r'C:\Users\posts_df.csv')

No. of usernames: 319
['Hanh Duyen (Katie Meo)', 'Jeferson Machado Santos', 'Srimal Ashish', 'Walter Wiggins', 'Kristoffer Hebert']
No. of urls: 319
['https://medium.com/@hanhduyen', 'https://medium.com/@jefersonmsantos', 'https://medium.com/@srimalashish', 'https://medium.com/@walterfwiggins', 'https://medium.com/@khbrt']
Processed: 1 / 319 -- Elapse Time: 0:01:05.018769
Errors due to input mismatch: 0
Processed: 2 / 319 -- Elapse Time: 0:02:09.160244
Errors due to input mismatch: 0
Processed: 3 / 319 -- Elapse Time: 0:03:07.962388
Errors due to input mismatch: 2
Processed: 4 / 319 -- Elapse Time: 0:04:17.537913
Errors due to input mismatch: 4
Processed: 5 / 319 -- Elapse Time: 0:05:21.948715
Errors due to input mismatch: 4
Processed: 6 / 319 -- Elapse Time: 0:06:26.515560
Errors due to input mismatch: 4
Processed: 7 / 319 -- Elapse Time: 0:07:28.559767
Errors due to input mismatch: 6
Processed: 8 / 319 -- Elapse Time: 0:08:03.062549
Errors due to input mismatch: 8
Processed: 9 / 319 

Processed: 95 / 319 -- Elapse Time: 1:25:38.259483
Errors due to input mismatch: 130
Processed: 96 / 319 -- Elapse Time: 1:26:32.605970
Errors due to input mismatch: 131
Processed: 97 / 319 -- Elapse Time: 1:27:04.531226
Errors due to input mismatch: 132
Processed: 98 / 319 -- Elapse Time: 1:27:36.932995
Errors due to input mismatch: 133
Processed: 99 / 319 -- Elapse Time: 1:28:31.527956
Errors due to input mismatch: 134
Processed: 100 / 319 -- Elapse Time: 1:29:27.024595
Errors due to input mismatch: 135
Processed: 101 / 319 -- Elapse Time: 1:30:02.262436
Errors due to input mismatch: 137
Processed: 102 / 319 -- Elapse Time: 1:30:54.428109
Errors due to input mismatch: 138
Processed: 103 / 319 -- Elapse Time: 1:31:27.266435
Errors due to input mismatch: 140
Processed: 104 / 319 -- Elapse Time: 1:32:23.740147
Errors due to input mismatch: 142
Processed: 105 / 319 -- Elapse Time: 1:33:19.115473
Errors due to input mismatch: 144
Processed: 106 / 319 -- Elapse Time: 1:34:09.183783
Errors 

Processed: 191 / 319 -- Elapse Time: 2:48:29.624833
Errors due to input mismatch: 274
Processed: 192 / 319 -- Elapse Time: 2:49:24.541068
Errors due to input mismatch: 275
Processed: 193 / 319 -- Elapse Time: 2:50:21.267294
Errors due to input mismatch: 275
Processed: 194 / 319 -- Elapse Time: 2:51:13.457708
Errors due to input mismatch: 277
Processed: 195 / 319 -- Elapse Time: 2:52:06.383687
Errors due to input mismatch: 278
Processed: 196 / 319 -- Elapse Time: 2:53:01.428892
Errors due to input mismatch: 279
Processed: 197 / 319 -- Elapse Time: 2:53:56.891644
Errors due to input mismatch: 281
Processed: 198 / 319 -- Elapse Time: 2:54:48.300434
Errors due to input mismatch: 282
Processed: 199 / 319 -- Elapse Time: 2:55:43.437994
Errors due to input mismatch: 284
Processed: 200 / 319 -- Elapse Time: 2:56:40.620937
Errors due to input mismatch: 284
Processed: 201 / 319 -- Elapse Time: 2:57:35.157076
Errors due to input mismatch: 285
Processed: 202 / 319 -- Elapse Time: 2:58:27.828613
Er

Processed: 287 / 319 -- Elapse Time: 4:10:45.776108
Errors due to input mismatch: 395
Processed: 288 / 319 -- Elapse Time: 4:11:36.896672
Errors due to input mismatch: 396
Processed: 289 / 319 -- Elapse Time: 4:12:28.043997
Errors due to input mismatch: 397
Processed: 290 / 319 -- Elapse Time: 4:13:03.971755
Errors due to input mismatch: 398
Processed: 291 / 319 -- Elapse Time: 4:13:57.976771
Errors due to input mismatch: 398
Processed: 292 / 319 -- Elapse Time: 4:14:53.447452
Errors due to input mismatch: 400
Processed: 293 / 319 -- Elapse Time: 4:15:46.017188
Errors due to input mismatch: 402
Processed: 294 / 319 -- Elapse Time: 4:16:42.312380
Errors due to input mismatch: 404
Processed: 295 / 319 -- Elapse Time: 4:17:38.342091
Errors due to input mismatch: 406
Processed: 296 / 319 -- Elapse Time: 4:18:34.419457
Errors due to input mismatch: 406
Processed: 297 / 319 -- Elapse Time: 4:19:08.669580
Errors due to input mismatch: 407
Processed: 298 / 319 -- Elapse Time: 4:20:03.320846
Er

In [7]:
writer_profile_df

Unnamed: 0,user_name,user_profile_desc,user_followers,top_writer_flag
0,Will Koehrsen,"Data Scientist at Cortex Intel, Data Science C...",30000.0,0.0
1,Cassie Kozyrkov,"Head of Decision Intelligence, Google. ❤️ Stat...",29000.0,1.0
2,Jeff Hale,I write about data science. Join my Data Aweso...,13100.0,1.0
3,Parul Pandey,Data Science+Community+Evangelism @H2O.ai,12700.0,0.0
4,TDS Team,"Sharing concepts, ideas, and codes. Learn more...",11000.0,0.0
...,...,...,...,...
286,Hanh Duyen (Katie Meo),Toward Data Science for Social Goods. Founder ...,3.0,0.0
287,Jeferson Machado Santos,Currently making a career shift toward data sc...,1.0,0.0
288,Srimal Ashish,Toward the world of Data Science. Machine Lear...,42.0,0.0
289,Walter Wiggins,"BWH/Harvard radiology resident, MGH-BWH Center...",116.0,0.0


In [8]:
posts_df

Unnamed: 0,user_name,title,publisher,claps,date_posted,read_time
0,Will Koehrsen,The Poisson Distribution and Poisson Process E...,Towards Data Science,6800.0,"Jan 21, 2019",14.0
1,Will Koehrsen,A Data Science Conversation,Towards Data Science,172.0,Mar 10,3.0
2,[[[Will Koehrsen]]],"12 Lessons from 55,000 pages of books",[[[Will Koehrsen]]],1100.0,Jan 3,14.0
3,[[[Will Koehrsen]]],Books of 2019,[[[Will Koehrsen]]],413.0,Jan 1,58.0
4,[[[Will Koehrsen]]],“Just Do It” Won’t Get You to Your Goals,[[[Will Koehrsen]]],338.0,"Dec 27, 2019",12.0
...,...,...,...,...,...,...
5519,[[[Kristoffer Hebert]]],Intro to React — Part 1,[[[Kristoffer Hebert]]],0.0,"Aug 30, 2015",2.0
5520,[[[Kristoffer Hebert]]],What’s new in ES6 Javascript,[[[Kristoffer Hebert]]],2.0,"Aug 17, 2015",2.0
5521,[[[Kristoffer Hebert]]],Jasmine BDD with Karma,[[[Kristoffer Hebert]]],3.0,"Jul 19, 2015",3.0
5522,[[[Kristoffer Hebert]]],Intro to AngularJS,[[[Kristoffer Hebert]]],0.0,"Jun 28, 2015",2.0
