<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scraping-Individual-Channel-Pages" data-toc-modified-id="Scraping-Individual-Channel-Pages-1">Scraping Individual Channel Pages</a></span><ul class="toc-item"><li><span><a href="#Develop-the-scraping-function-using-podcasts-listed-on-the-homepage-directory." data-toc-modified-id="Develop-the-scraping-function-using-podcasts-listed-on-the-homepage-directory.-1.1">Develop the scraping function using podcasts listed on the homepage directory.</a></span></li></ul></li></ul></div>

## Scraping Individual Channel Pages

Each channel will become a row in the dataframe used in the linear regression.
Each channel will have a popularity-related target column, as well as various features.

In [53]:
from bs4 import BeautifulSoup
import requests
from collections import defaultdict
import json
import random
import pandas as pd

### Develop the scraping function using podcasts listed on the homepage directory.

In [31]:
# Read json of channel listings
path = '../scraped/category/cb_home_dir.json'


def load_chan_json(path):
    with open(path, 'r') as file:
        chan_list = json.load(file)
    return chan_list

chan_list = load_chan_json(path)

# load sample
test_k = (random.sample(chan_list.keys(), 1))[0]
test_v = chan_list[test_k]
print(test_k, test_v)
print('number of channels: ', len(chan_list.keys()))
print(type(chan_list))

The Cardone Zone {'chan_url': 'https://castbox.fm/channel/The-Cardone-Zone-id2215630', 'author': 'Grant Cardone', 'category': 'Business'}
number of channels:  70
<class 'dict'>


In [40]:
# chosen samples for testing channel scrape
t_k_1 = 'Code Switch'
t_v_1 = {'chan_url': 'https://castbox.fm/channel/Code-Switch-id431983',
         'author': 'NPR', 'category': 'News'}

t_k_2 = 'democracy-ish'
t_v_2 = {'chan_url': 'https://castbox.fm/channel/democracy-ish-id2210997',
         'author': 'DCP Entertainment', 'category': 'News'}

t_k_3 = 'Your Undivided Attention'
t_v_3 = {'chan_url': 'https://castbox.fm/channel/Your-Undivided-Attention-id2093688',
         'author': 'Center for Humane Technology', 'category': 'Technology'}

t_k_4 = 'The Cardone Zone'
t_v_4 = {'chan_url': 'https://castbox.fm/channel/The-Cardone-Zone-id2215630',
         'author': 'Grant Cardone', 'category': 'Business'}

test_sites = [[t_k_1, t_v_1], [t_k_2, t_v_2], [t_k_3, t_v_3], [t_k_4, t_v_4]]

In [None]:
def scrape_chan_soup(chan_url):
    '''
    Request page from site.
    
    TODO: Perhaps upgrade to a Selenium-based scrape to capture
    podcast episode listings that only scroll into view when 
    
    '''
    
    return requests.get(chan_url).text

chan_html = {}
for site in test_sites:
    chan_name = site[0]
    chan_dict = site[1]
    chan_url = chan_dict['chan_url']
    chan_dict['html'] = scrape_chan_soup(chan_url)
    chan_html[chan_name] = chan_dict
# with open()

In [54]:
# save test scrape as json
with open("../scraped/channel/four_sample_pages.json", "w") as file:
    json.dump(chan_html, file)
    
# Test that channel file is saved and read properly
with open("../scraped/channel/four_sample_pages.json", "r") as file:
    sample_chans = json.load(file)

print('channels loaded: ', len(sample_chans))

channels loaded:  4


In [111]:
# Scrape a single channel page.
def process_channel_soup(chan_url, html):
    '''
    
    Build features from scraped html.
    
    Return a dictionary with features.

    '''
    
    f = {}
    
    soup = BeautifulSoup(html, 'lxml')
    
    ##### Individual Channel Features #####
    
    # channel title for validation
    f['title'] = soup.find(class_='ch_feed_info_title').find('span')
    
    f['chan_url'] = chan_url
    
    f['num_comments'] = (soup.find(class_='commentList-title')
                         .find('span').text.split('\xa0')[-1].replace('(',')',2))
    
    # channel author
    f['author'] = soup.find(class_='author').text.split(':')[-1].strip().replace(',','')
    
    # if the channel has the isExplicit class (I believe this global label
    # is applied if any of podcasts are marked as 'E')
    f['isExplicit'] = int(bool(soup.find_all('h1', {'class': 'isExplicit'})))

    # subscriber count
    f['sub_count'] = int(soup.find(class_='sub_count').text.split(':')[-1].strip().replace(',',''))
    
    # total channel plays for all episodes
    f['play_count'] = int(soup.find(class_='play_count').text.split(':')[-1].strip().replace(',',''))
    
    # all listed social feeds, including channel website
    f['ch_feed-socials'] = [a.get('href') for a in soup.find(class_='ch_feed-socials').find_all('a')]
    
    # episode count
    f['ep_total'] = int(soup.find(class_='trackListCon_title').text.split('\xa0')[0].strip().replace(',',''))
    
    # grab all (visible) episode rows
    visible_eps = soup.find_all(class_='ep-item')
    recent_eps = []
    
    # iterate through all visible episodes and grab basic info
    for ep in visible_eps:
        ep_name = ep.find('span', class_='ellipsis').text
        ep_date = ep.find('span', class_='date').text
        ep_len = ep.find('span', class_='time').text
        favs = ep.find_all(class_='heart')
#         print(favs[0].parent)
        if len(favs) > 0:
            ep_favs = int(favs[0].parent.text)
        else:
            ep_favs = 0
        recent_eps += [[ep_date, ep_len, ep_favs]]
        
    f['recent_eps'] = recent_eps
    
    #### TEXT BASED FEATURES ####
    
    # grab all of the hover text for all episodes: ep-item-desmodal-con
    f['hover_text_concat'] = ' | '.join([s.text for s in soup.find_all(class_='ep-item-desmodal-con')])
    
    # channel description
    f['chan_desc'] = soup.find(class_='des-con').text
    
    
    f['cover_img_url'] = soup.find(class_='coverImgContainer').find('img').get('src')
    
    return f
    
    
# test
records = {}

for chan_name, chan_sub_dict in sample_chans.items():
    html = chan_sub_dict['html']
    chan_url = chan_sub_dict['chan_url']
#     print(html)
    features_dict = process_channel_soup(chan_url, html)
    records[chan_name] = features_dict

records
# pd.DataFrame.from_dict(features_dict,orient='index').reset_index()

{'Code Switch': {'title': <span class="ellipsis" style="display:inline-block">Code Switch</span>,
  'chan_url': 'https://castbox.fm/channel/Code-Switch-id431983',
  'num_comments': ')51)',
  'author': 'NPR',
  'isExplicit': 0,
  'sub_count': 29615,
  'play_count': 429283,
  'ch_feed-socials': ['https://twitter.com/nprcodeswitch',
   'https://www.facebook.com/NPRCodeSwitch',
   'http://www.npr.org/sections/codeswitch/'],
  'ep_total': 184,
  'recent_eps': [['2019-09-25', '00:36:06', 6],
   ['2019-09-18', '01:01:01', 9],
   ['2019-09-11', '00:30:19', 10],
   ['2019-09-04', '00:25:26', 6],
   ['2019-08-28', '00:36:22', 8],
   ['2019-08-21', '00:32:45', 5],
   ['2019-08-14', '00:38:32', 6],
   ['2019-08-07', '00:27:53', 7],
   ['2019-07-31', '00:25:42', 7],
   ['2019-07-24', '00:19:32', 10]],
  'hover_text_concat': 'How is it that the party of Lincoln became anathema to black voters? It\'s a messy story, exemplified in the doomed friendship between Richard Nixon and his fellow Republican, 

In [None]:
# Scrape an entire dictionary    
def scrape_chan_pages(chan_dict):
    '''
    
    Iterate through a dictionary of podcast listings,
    and for each podcast in that listing, scrape its channel page.
    
    Return a dictionary with podcasts as keys,
    and scraped content as values.
    
    '''
    
    for key, val in chan_dict.items():
        
        

In [60]:
# dataframe from dict test
pd.DataFrame.from_dict(chan_list,orient='index').reset_index()

Unnamed: 0,index,chan_url,author,category
0,A Conversation With...,https://castbox.fm/channel/A-Conversation-With...,Philip DeFranco,Technology
1,A Slob Comes Clean,https://castbox.fm/channel/A-Slob-Comes-Clean-...,Dana K. White,Kids & Family
2,Adventure of The Month - Kids and Children,https://castbox.fm/channel/Adventure-of-The-Mo...,"Amr Al-Hariri, MD",Kids & Family
3,Armchair Expert with Dax Shepard,https://castbox.fm/channel/Armchair-Expert-wit...,Armchair Umbrella,Comedy
4,Around the NFL,https://castbox.fm/channel/Around-the-NFL-id10...,NFL,NFL 2019
5,Bad In The Boondocks,https://castbox.fm/channel/Bad-In-The-Boondock...,Stan and Jeru,Editors' Picks
6,Bardstown,https://castbox.fm/channel/Bardstown-id2239842,VAULT Studios,Top Shows
7,Blank Check with Griffin & David,https://castbox.fm/channel/Blank-Check-with-Gr...,Audioboom,TV & Film
8,Business Casual,https://castbox.fm/channel/Business-Casual-id2...,Morning Brew,Business
9,Business Wars,https://castbox.fm/channel/Business-Wars-id115...,Wondery,Business


In [70]:
int(True)

1