A garbage scratchpad for scraping the contents of podcasts.joerogan.net before it gets taken down.

Want to scrape:
* audio file
* episode number
* publish date
* guest name
* guest description
* related links
* youtube slug

This stuff would only come from YouTube, maybe the API, maybe the pages themselves:
* length of youtube video (would help in calculating timestamp offset vs. mp3's)
* like/dislike count
* like/dislike ratio
* number of comments
* view count
* youtube timestamps, if present (sometimes in top comments)
* list of JRE clip excerpts


Important URL formats for the podcast site (from watching debugger->network->XHR)

curl http://podcasts.joerogan.net/podcasts/page/3?load

curl -s 'http://podcasts.joerogan.net/wp-admin/admin-ajax.php?action=loadPermalink&slug=bob-saget'



In [1]:
import lxml.html
import datetime
import requests
import os.path
import random
import glob
import json
import time

import pandas as pd

In [2]:
# Never truncate dataframes when they get displayed: lift every display limit.
for _option in ('display.max_rows', 'display.max_columns',
                'display.width', 'display.max_colwidth'):
    pd.set_option(_option, None)

In [3]:
# Today's scrape goes into its own dated folder, e.g. indexhtml/20200831.
directory = "indexhtml/" + datetime.datetime.now().strftime("%Y%m%d")
# Same effect as `mkdir -p`, but without shelling out: parent directories are
# created as needed and an already-existing directory is not an error.
os.makedirs(directory, exist_ok=True)

In [4]:
# Scrape the website index.
# Just save off the HTML for now. The random sleep between requests means
# we get ~6 req/min, so about 20 minutes total for the full thing.

# Hacky switch for not scraping the full thing on each update:
# set `scrape` to FULL for every index page, or UPDATE for just the first few.
FULL = 170
UPDATE = 3
scrape = UPDATE

base = "http://podcasts.joerogan.net/podcasts/page/%d?load"

for x in range(1, scrape+1):
    fn = directory + "/%03d.html" % x
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(base % x, timeout=30)
        r.raise_for_status()
        # Pin the encoding so the saved page doesn't depend on the locale.
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(r.text)
    except Exception as err:
        # Best effort: report which page failed (and why), then carry on.
        print(x, err)

    # Throttle after every request -- failed ones included -- and only once
    # the output file has been closed.
    time.sleep(random.randint(2,5))

In [5]:
def _first_as_str(matches):
    """Return the first xpath match as a plain string."""
    return str(matches[0])


def _parse_date(matches):
    """Parse the first match, formatted month.day.two-digit-year, into a date."""
    return datetime.datetime.strptime(matches[0], "%m.%d.%y").date()


def _join_text(matches):
    """Concatenate every matched text node into a single string."""
    return ''.join(matches)


# Names of the per-episode fields, in column order.
fields = ["type", "episode", "date", "guests", "description", "slug", "mp3", "yt", "ytid"]

# Field name -> (xpath evaluated relative to one episode element,
#                function that normalizes the list of matches).
paths = {
    'episode': ('./ul[@class="podcast-icons"]//li/@data-episode-num[1]', _first_as_str),
    'date': ('./div[@class="podcast-date"]/h3/text()', _parse_date),
    'guests': ('./div[@class="podcast-details"]/a[@class="ajax-permalink"]/h3/text()', _first_as_str),
    'description': ('./div[@class="podcast-details"]/div[@class="podcast-content"]//text()', _join_text),
    'slug': ('(./div[@class="podcast-details"]/a[@class="ajax-permalink"])/@data-slug', _first_as_str),
}


In [6]:
# Pull the fields out of every saved index page. Each episode becomes a plain
# dict for now (they go into a dataframe later), and the dicts are collected in
# `rows` keyed by slug, so re-scraped pages can't introduce duplicates.

# Note that about 1-2% of the time, whichever page you request, their website
# returns page #1 instead. Very strange, but keying on the slug makes a
# repeated page harmless.

rows = {}

# Around the time Miley Cyrus's episode was published, they managed to break
# the HTML on page 1, nesting all the other episodes in the list under the
# first one. That case is not handled for now.
for page_file in sorted(glob.glob("indexhtml/202008*/*.html"), reverse=True):

    page = lxml.html.parse(page_file)
    container = page.find('//div[@id="podcasts"]//div[@class="main"]')

    for node in container.xpath('./div[contains(@class, "episode")]'):
        # Start with every field empty, then fill in what the xpaths yield.
        record = dict.fromkeys(fields)
        for name, (query, normalize) in paths.items():
            record[name] = normalize(node.xpath(query))

        # Most episodes carry a download link in the index page, but its
        # placement is all over the place. So collect every href under the
        # episode and keep the ones that point at libsyn and mention the
        # episode number. Episodes without one get their link from the
        # individual pages in a later pass (just means more scraping).
        hrefs = node.xpath('.//@href')
        candidates = [
            href for href in hrefs
            if 'libsyn.com' in href and record['episode'] in href
        ]

        # Only trust the match when it is unambiguous.
        if len(candidates) == 1:
            record['mp3'] = candidates[0]

        rows[record['slug']] = record

In [7]:
# One row per collected episode dict, columns in `fields` order.
df = pd.DataFrame(data=rows.values(), columns=fields)

# Sorting this way puts things in the right order. Episode numbers increase
# with release date, though sometimes several come out on one day, and
# sometimes a single episode is split into pt1 and pt2.
sort_keys = ['date', 'episode', 'slug']
df = df.sort_values(by=sort_keys).reset_index(drop=True)

In [8]:
# Turns out the show numbers aren't always numbers.
# There are a few oddballs, gaps, etc. and the MMA
# shows mess it up too.

# Try to convert every episode tag into a real number,
# possibly just a placeholder 0.
# The pattern is a raw string so that `\d` reaches the regex engine instead of
# being an invalid string escape. `(\d*)` captures the digits at the very start
# of the tag, which is the empty string when the tag doesn't start with one.
df['normalized'] = df['episode'].str.extract(r'(\d*)', expand=False)
# Prefixing '0' turns an empty capture into "0" and leaves real numbers
# unchanged once converted ("0123" -> 123).
df['normalized'] = ('0' + df['normalized']).astype(int)

# Filled in row by row later on.
df['standard'] = None

In [9]:
# Walk the rows in order, looking for a monotonically increasing episode count.
# A row whose number fits into that series is counted as one of the standard
# podcasts; anything else is flagged as non-standard.

skipped = []
ep = 1   # the next episode number we expect to see

for i in range(len(df)):
    x = df.loc[i, 'normalized']

    # A number at or beyond the expected one continues the series.
    in_series = x >= ep
    df.loc[i, 'standard'] = in_series

    if not in_series:
        continue

    # Remember any numbers that were jumped over (none when x == ep),
    # then start looking for the next one in the series.
    skipped.extend(range(ep, x))
    ep = x + 1


print("skipped:", skipped)

skipped: [97, 108, 128, 172, 213, 677, 1036, 1093, 1117, 1366, 1423, 1440, 1477]


In [10]:
# Sanity check: the standard episodes we found plus the numbers we skipped
# should add up to the number of regular podcasts released to date.

len(skipped) + len(df.loc[df['standard']])

1527

In [11]:
# These seem to be missing entirely. Not on the website or YouTube: 97, 108, 128, 213
# These ones are only on YouTube: 1440.
# These are in the data, but the number was left blank: 1093, 1423, 1477
# 677 was labeled as 77.
# 172 was labeled as 173.
# 1036 was labeled as 1037.
# 1117 is also labeled as MMA show 27, even though there is another MMA show #27.
# 1366 was labeled as 1336

# I've seen other cases (eg. 1524) where it was initially only on YouTube, but added to
# the podcast site a few days later. I'm kinda surprised they don't do them at the same time.

# Also, these ones are clearly two part shows: 515, 701
# 706 also comes in two parts, and is titled as a fight companion(?), literally with a question mark
# so I'm putting that one in the regular list

# FC9 was labeled as a second 8.

# There's no FC14 on the website, but there is a July 11th, 2015 episode on YouTube. v=mgumau1KiRw

# There are two Fight Companions labeled as #35, throwing the later counts off by one.
# But shit, they seem to have really stuck to their guns on the bad numbering, so we'll do the same.

# FC38 was labeled as another 36, *after* 37 had been done. Wow.


# ---------- Other random shit
# 1423 (andrew-doyle) and 1477 (tony-hawk) don't have any download link on the website either.
# Doyle has the YouTube link, Hawk has neither, though his episode is on YouTube.


In [12]:
# The above corrections, plus some others.
# Each entry pairs a podcast's unique slug with the number it *should* have;
# entries for shows that are not standard podcasts carry their type as a
# third element.

corrections = [
    ('podcast-172', 172),
    ('podcast-173-peter-joseph-brian-redban', 173),
    ('ari-shaffir-5', 515),
    ('josh-zepps-2', 677),
    ('honey-honey-part-2', 701),
    ('ari-shaffir-bert-kreischer-tom-segura-2', 1036),
    ('chris-kresser-2', 1037),
    ('owen-benjamin-kurt-metzger', 1093),
    ('jre-mma-show-27-with-tim-kennedy', 1117),
    ('richard-dawkins', 1366),
    ('andrew-doyle', 1423),
    ('tony-hawk', 1477),
    ('fight-companion-feb-14-2015', 9, "fc"),
    ('fight-companion-january-14-2018', 38, "fc"),
    ('brendan-schaub-fight-companion-part-2', 706),
]

for slug, number, *kind in corrections:
    target = df['slug'] == slug
    # Episode tags are still strings at this point, so store the number as one.
    df.loc[target, 'episode'] = str(number)
    # Two-element entries are regular podcasts; otherwise use the given type.
    df.loc[target, 'type'] = kind[0] if kind else 'podcast'
        

In [13]:
# Start sorting rows into categories: everything flagged as standard
# is a regular podcast.
is_standard = df['standard']
df.loc[is_standard, 'type'] = "podcast"

In [14]:
# Classify whatever has no type yet. The rules run top to bottom and each one
# only touches rows that are still untyped, so earlier rules take precedence.
df.loc[df.type.isnull() & df.episode.str.match('JRQE'), 'type'] = 'jrqe'
df.loc[df.type.isnull() & df.episode.str.contains('FC'), 'type'] = 'fc'
df.loc[df.type.isnull() & df.slug.str.contains('companion'), 'type'] = 'fc'
df.loc[df.type.isnull() & df.guests.str.contains('MMA'), 'type'] = 'mma'
# Any remaining tag that starts with a non-digit is a miscellaneous show.
# (Raw string: `\D` must reach the regex engine, not be read as a string escape.)
df.loc[df.type.isnull() & df.episode.str.match(r'\D'), 'type'] = 'misc'

In [15]:
# Which rows still have no type?
df[df['type'].isnull()]

Unnamed: 0,type,episode,date,guests,description,slug,mp3,yt,ytid,normalized,standard
763,,,2015-12-22,UFC Recap – Brendan Schaub & Eddie Bravo,"Joe discusses some of the fights from the UFConFOX fight card from December 19, 2015, and other topics, with Brendan Schaub & Eddie Bravo.",ufc-recap-brendan-schaub-eddie-bravo,http://traffic.libsyn.com/joeroganexp/p122115.mp3,,,0,False
1001,,,2017-05-15,Podcast On A Plane UFC 211 Recap,"Joe sits down with Tony Hinchcliffe on a plane to discuss the weekend's events at UFC 211 in Dallas, TX.",podcast-on-a-plane-ufc-211-recap,,,,0,False


In [16]:
# The only rows left untyped are a couple of "UFC recap" shows. They don't
# seem to follow any standard numbering, so they go into misc as well.
df.loc[df['type'].isnull(), 'type'] = 'misc'

# Give every misc show a sequential number (1, 2, 3, ... in row order),
# replacing whatever text-based tag it had.
is_misc = df['type'] == 'misc'
misc = len(df[is_misc])
df.loc[is_misc, "episode"] = list(range(1, misc + 1))

In [17]:
# Strip all the text from the fight companion episode tags, and likewise from
# the jrqe ones, keeping only the first run of digits in each tag.
for kind in ("fc", "jrqe"):
    of_kind = df['type'] == kind
    # Raw string so `\d` is a regex class rather than an invalid string escape.
    # `.values` assigns the extracted digits positionally to the selected rows.
    df.loc[of_kind, 'episode'] = (
        df.loc[of_kind, 'episode'].str.extract(r'(\d+)', expand=False).values
    )

In [18]:
# Make the episode numbers real integers, index the table by (type, episode),
# and get rid of the helper columns that were only needed for classification.
df['episode'] = df['episode'].astype(int)

df.set_index(['type', 'episode'], inplace=True)
df.drop(columns=['normalized', 'standard'], inplace=True)

In [19]:
# What do we have so far?
#df

In [20]:
# Folder for the individual episode pages. Same effect as `mkdir -p slugs`,
# but without shelling out; an existing folder is not an error.
os.makedirs("slugs", exist_ok=True)

In [21]:
# Scrape every individual episode page, even when the index already gave us
# the MP3 link -- the extra detail might be useful later.

base = "http://podcasts.joerogan.net/wp-admin/admin-ajax.php?action=loadPermalink&slug=%s"

for x in df.slug.values:
    fn = "slugs/%s.html" % x

    # Slugs are unique, so an existing file means this page was already
    # fetched; skipping it lets re-runs update without repetition.
    if os.path.exists(fn):
        continue

    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(base % x, timeout=30)
        r.raise_for_status()
        # Pin the encoding so the saved response doesn't depend on the locale.
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(r.text)
    except Exception as err:
        # Best effort: report which slug failed (and why), then carry on.
        print(x, err)

    # Throttle after every request -- failed ones included -- and only once
    # the output file has been closed.
    time.sleep(random.randint(2,5))

In [22]:
# What to pull out of each individual episode page: field -> (xpath, normalizer).
# Both normalizers return the first match, or None when nothing matched.
paths = {
    'ytid' : ('.//a[@data-video-provider="youtube"]/@data-video-id', lambda x: x[0] if x else None),
    'mp3' : ('.//a[@class="download-episode"]/@href', lambda x: x[0] if x else None)
}

# change index for performance reasons here: rows are looked up by slug below.
df = df.reset_index()
df = df.set_index('slug', drop=False)

for fn in glob.glob('slugs/*html'):

    # The file name without directory and extension is the slug.
    slug = os.path.splitext(os.path.basename(fn))[0]

    # Each saved response is JSON with the page markup under response -> html.
    # The `with` block makes sure the file handle is closed again.
    with open(fn, encoding='utf-8') as fh:
        html = json.load(fh)['response']['html']
    tree = lxml.html.fromstring(html)

    for f, (x,t) in paths.items():
        found = t(tree.xpath(x))

        # For everything we extract from the individual page, see if it lines up
        # with the index listings, and report any disagreement.
        fromindex = df.loc[slug,f]
        if fromindex and fromindex != found:
            print(df.loc[slug, ['type','episode']])
            print(fromindex, found)
            print()

        # In either case, save what we found on the individual pages.
        df.loc[slug, f] = found


# Fix index up again.
df = df.set_index(['type','episode'])

# Looks like exactly one podcast had a broken "download this podcast" link.
# Whereas the link to play the podcast is correct. Not sure if that was a fatfingered typo
# or if the podcast was edited later for some reason and the link was only updated once.
# Anyway, we saved the correct one.

type       podcast
episode        468
Name: duncan-trussell-christopher-ryan-2, dtype: object
http://traffic.libsyn.com/joeroganexp/p468.mp3 http://traffic.libsyn.com/joeroganexp/p468a.mp3



In [23]:
# Quite a few episodes have a broken YouTube link -- not only the old ones
# (those were on Vimeo), but new ones as well. Count them.
missing_ytid = df['ytid'].isnull()
print(len(df[missing_ytid]))
#df[missing_ytid]

165


In [24]:
# Show the episodes whose mp3 link is empty. Two come up where the link to
# play the MP3 on the site is busted, seemingly missing. Luckily both are on
# YouTube, even if Tony Hawk's YT link isn't on the podcast site either.
has_mp3 = df['mp3'].astype(bool)
df.loc[~has_mp3]

Unnamed: 0_level_0,Unnamed: 1_level_0,date,guests,description,slug,mp3,yt,ytid
type,episode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
podcast,1423,2020-02-05,Andrew Doyle,"#1423. Andrew Doyle is a British comedian, playwright, journalist, political satirist and is creator of the fictitious character Titania McGrath. The new book ""Woke: A Guide to Social Justice"" by Titania McGrath is now available: https://amzn.to/36X2GoG",andrew-doyle,,,NIxhH85cQMY
podcast,1477,2020-05-20,Tony Hawk,"#1477. Tony Hawk is a professional skateboarder, actor, stuntman, and the owner of the skateboard company Birdhouse.",tony-hawk,,,


In [25]:
# Write the table to disk so it can be reloaded later.
df.to_csv(path_or_buf='rogan-scrape.csv')

In [26]:

doubles = [515, 701, 706] # Handle these manually, because software engineering is digital garbage collection


i=0
f = open('dl-jre.sh','w')
for (t, ep), row in df.iterrows():
    fn = "%s%04d.mp3" % (t, ep)
    
    if ep in doubles or not row.mp3:
        continue
    
    if not os.path.exists('/media/sshfs/jre/podcast/%s' % fn):
        i+=1
        print('wget "%s" -O %s' %(row.mp3, fn), file=f)
        
f.close()
!mv dl-jre.sh /media/sshfs/jre/podcast

In [27]:
# Downloading these manually to get them in the right spots:
# every row, of any type, whose episode number is one of the two-parters.
df.loc[pd.IndexSlice[:, doubles], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,date,guests,description,slug,mp3,yt,ytid
type,episode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
podcast,515,2014-06-27,Ari Shaffir,"#515. (Part 1) Ari Shaffir is a stand-up comedian and also hosts his own podcasts ""Ari Shaffir's Skeptic Tank"" and ""Punch Drunk Sports""",ari-shaffir-4,http://traffic.libsyn.com/joeroganexp/p515.mp3,,eVEBupwo51I
podcast,515,2014-06-27,Ari Shaffir,"#515. (Part 2) Ari Shaffir is a stand-up comedian and also hosts his own podcasts ""Ari Shaffir's Skeptic Tank"" and ""Punch Drunk Sports""",ari-shaffir-5,http://traffic.libsyn.com/joeroganexp/p515a.mp3,,AkZwLQCBVeo
podcast,701,2015-09-28,Honey Honey (Part 1),"#701. Honey Honey is a band, featuring members Suzanne Santo and Ben Jaffe, from Los Angeles, CA. They released a new album this summer called ""3"" and are currently touring all over -- http://honeyhoneyband.com/events",honey-honey-part-1,http://traffic.libsyn.com/joeroganexp/p701.mp3,,BGBN_97XzkY
podcast,701,2015-09-28,Honey Honey (Part 2),"#701. Honey Honey is a band, featuring members Suzanne Santo and Ben Jaffe, from Los Angeles, CA. They released a new album this summer called ""3"" and are currently touring all over -- http://honeyhoneyband.com/events",honey-honey-part-2,http://traffic.libsyn.com/joeroganexp/p701b.mp3,,yfYYhJaQTmU
podcast,706,2015-10-09,Brendan Schaub & Fight Companion ? (Part 1),"#706. Brendan Schaub is a mixed martial artist and also a former college & pro football player. He also hosts a podcast with Bryan Callen called ""The Fighter & The Kid"" available on iTunes.\r\n\r\nJoe & Brendan also watch fights that take place on October 9, 2015.",brendan-schaub-fight-companion-part-1,http://traffic.libsyn.com/joeroganexp/p706a.mp3,,YfnQWBExTYg
podcast,706,2015-10-09,Brendan Schaub & Fight Companion ? (Part 2),"#706. Brendan Schaub is a mixed martial artist and also a former college & pro football player. He also hosts a podcast with Bryan Callen called ""The Fighter & The Kid"" available on iTunes.\r\n\r\nJoe & Brendan also watch fights that take place on October 9, 2015.",brendan-schaub-fight-companion-part-2,http://traffic.libsyn.com/joeroganexp/p706b.mp3,,vA892WDF4xk


In [28]:
# mma0039.mp3:       empty
# mma0043.mp3:       empty
# mma0094.mp3:       empty

# These ones all return 404's. However, they're all on YouTube.
# Strangely, I can see the original video ID for mma43 is now private,
# but it was reposted by the official channel as OYSk66YiFwQ

In [29]:
# Look up MMA shows 39, 43 and 94.
df.loc[pd.IndexSlice['mma', [39, 43, 94]], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,date,guests,description,slug,mp3,yt,ytid
type,episode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mma,39,2018-08-29,JRE MMA Show #39 with Donald “Cowboy” Cerrone,"#039. Joe sits down with UFC fighter Donald ""Cowboy"" Cerrone.",jre-mma-show-39-with-donald-cowboy-cerrone,http://traffic.libsyn.com/joeroganexp/mmashow39a.mp3,,Yvi7y10yNyg
mma,43,2018-10-02,JRE MMA Show #43 with Brendan Schaub,#43. Joe is joined by Brendan Schaub to preview UFC #229 and some more future fights.,jre-mma-show-43-with-brendan-schaub,http://traffic.libsyn.com/joeroganexp/mmashow43a.mp3,,BoVXS7e0NRw
mma,94,2020-04-10,JRE MMA Show #94 with Brendan Schaub,"#094. Joe sits down with with Brendan Schaub to discuss the ""upcoming"" fights.",jre-mma-show-94-with-brendan-schaub,http://traffic.libsyn.com/joeroganexp/mmashow94.mp3,,cF7_AaY9tCg


In [30]:
# Display the resulting dataframe.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,guests,description,slug,mp3,yt,ytid
type,episode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
podcast,1,2009-12-24,Brian Redban,,podcast-1,http://traffic.libsyn.com/joeroganexp/joecast1.mp3,,ZWBCnvOuXK8
podcast,2,2009-12-29,Brian Redban,,podcast-2-2,http://traffic.libsyn.com/joeroganexp/joecast2.mp3,,Psu4JK_M7V8
podcast,3,2010-01-06,Ari Shaffir,,podcast-3,http://traffic.libsyn.com/joeroganexp/joecast3.mp3,,tJyzTzXBM8I
podcast,4,2010-01-13,Brian Redban,,podcast-4,http://traffic.libsyn.com/joeroganexp/joecast4.mp3,,eYVq53f1aZI
podcast,5,2010-01-21,"Ari Shaffir, John Heffron",,podcast-5,http://traffic.libsyn.com/joeroganexp/joecast5.mp3,,r8bPaqxVj94
podcast,6,2010-01-28,Brian Redban,,podcast-6,http://traffic.libsyn.com/joeroganexp/joecast6.mp3,,8udgf54TftQ
podcast,7,2010-02-03,Brian Redban,,podcast-7,http://traffic.libsyn.com/joeroganexp/joecast7.mp3,,M-CYCpjn_vE
podcast,8,2010-02-10,Tom Segura,,podcast-8,http://traffic.libsyn.com/joeroganexp/joecast8.mp3,,
podcast,9,2010-02-24,Brian Redban,,podcast-9,http://traffic.libsyn.com/joeroganexp/p3.mp3,,blZn38CF_ow
podcast,10,2010-03-03,Joe Rogan,,podcast-10,http://traffic.libsyn.com/joeroganexp/joecast10.mp3,,


In [None]:
# Reading it back in later.
# NOTE(review): to_csv wrote the (type, episode) index out as ordinary
# columns, so this read returns them as columns under a default integer index.
# Pass index_col=['type', 'episode'] if the original index is wanted --
# confirm what the code using this expects.
import pandas as pd
df = pd.read_csv('./rogan-scrape.csv')