> library dependencies

+ itunes API searcher
    + ```$ pip install pyitunes```

In [1]:
import itunes
import pandas as pd 
import matplotlib.pyplot as plt
import bs4 as bs
import urllib.request

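### this is a wrapper for the [iTunes Search API](https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/iTuneSearchAPI/index.html)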

In [2]:
# Quick demo: a plain query with no media filter comes back with mixed result types.
item = itunes.search(query="beatles")
print(item)

[<Song>: Let It Be, <Song>: Miss You, <Song>: Hey Jude, <Song>: Here Comes the Sun, <Song>: In My Life, <Song>: Come Together, <Song>: I Want to Hold Your Hand, <Song>: Blackbird, <Song>: The Long and Winding Road, <Song>: Ob-La-Di, Ob-La-Da, <Song>: Can't Buy Me Love, <Song>: A Hard Day's Night, <Audiobook>: George Harrison, <Song>: Yesterday, <Song>: Love Me Do, <Song>: With a Little Help From My Friends, <Song>: When I'm Sixty-Four, <Song>: All You Need Is Love, <Song>: Yesterday, <Song>: Get Back, <Song>: Let It Be, <Song>: Hey Jude, <Song>: While My Guitar Gently Weeps, <Song>: Lucy In the Sky with Diamonds, <Song>: Yellow Submarine, <Audiobook>: The Meaning of Life in 5 Easy Lessons (To the Best of Our Knowledge Series), <Audiobook>: John Lennon, <Song>: She Loves You, <Song>: Eleanor Rigby, <Song>: Help!, <Song>: Come Together, <Song>: Eight Days a Week, <Song>: Penny Lane, <Song>: We Can Work It Out, <Song>: Something, <Audiobook>: Paul McCartney, <Song>: Hello, Goodbye, <Song>

### but searches for podcasts instead of music albums

### Set the query term and run the notebook
### ⇩


In [22]:
# Search term for the podcast lookup. To explore another topic, swap in one
# of the alternatives listed below.
queryterm = "cystic fibrosis"
# Alternatives:
#   queryterm = 'infectious disease'
#   queryterm = 'genetic'
#   queryterm = 'covid19'
#   queryterm = 'foreclosure'

# Restrict the iTunes search to podcast results.
podsearch = itunes.search(media="podcast", query=queryterm)

In [23]:
# Show each search hit on its own line.
for hit in podsearch:
    print(hit)

<Podcast>: Breathe In: A Cystic Fibrosis Podcast
<Podcast>: Breaking Through
<Podcast>: Cystic Fibrosis Review
<Podcast>: Cystic Fibrosis Podcast
<Podcast>: Cystic Fibrosis
<Podcast>: Cystic Fibrosis in Focus
<Podcast>: Living With Cystic Fibrosis
<Podcast>: My Journey With Cystic Fibrosis
<Podcast>: AHEAD OF THE CURVE: Cystic Fibrosis
<Podcast>: Just Livin Life with Cystic Fibrosis
<Podcast>: Jerry Unplugged: A Cystic Fibrosis Podcast
<Podcast>: Gene Therapy with Cystic Fibrosis


In [24]:
# Dump the attributes of the first Podcast hit to see what each search result carries.
first_result = podsearch[0]
print(vars(first_result))

{'id': 1299822342, 'name': 'Breathe In: A Cystic Fibrosis Podcast', 'url': 'https://podcasts.apple.com/us/podcast/breathe-in-a-cystic-fibrosis-podcast/id1299822342?uo=4', '_release_date': None, 'artwork': {'30': 'https://is3-ssl.mzstatic.com/image/thumb/Podcasts113/v4/76/fc/f9/76fcf922-0fb2-9465-17ae-a69c52d05280/mza_8694591701698332440.jpg/30x30bb.jpg', '60': 'https://is3-ssl.mzstatic.com/image/thumb/Podcasts113/v4/76/fc/f9/76fcf922-0fb2-9465-17ae-a69c52d05280/mza_8694591701698332440.jpg/60x60bb.jpg', '600': 'https://is3-ssl.mzstatic.com/image/thumb/Podcasts113/v4/76/fc/f9/76fcf922-0fb2-9465-17ae-a69c52d05280/mza_8694591701698332440.jpg/600x600bb.jpg'}, 'json': {'wrapperType': 'track', 'kind': 'podcast', 'collectionId': 1299822342, 'trackId': 1299822342, 'artistName': 'Gunnar Esiason and the Salty Cysters', 'collectionName': 'Breathe In: A Cystic Fibrosis Podcast', 'trackName': 'Breathe In: A Cystic Fibrosis Podcast', 'collectionCensoredName': 'Breathe In: A Cystic Fibrosis Podcast', 

### ⇪ A little weird: it looks like most of what we want is packed into
### a "JSON" attribute, so...there it is.

- - -

# So let's lookup some stuff about the podcast in question

+ no. of individual posts / episodes per year since publishing
+ length of posts (minutes)
+ rating per channel +/or track, if available

### first for # posts / episodes / media URL

In [47]:
# Per-channel summary: episode count, channel id, latest release date and
# the iTunes page of the channel.
print(f'{len(podsearch)} channels\n')
for channel in podsearch:
    meta = channel.json  # the search payload for this channel
    print(f'{meta["trackCount"]} podcasts in "{meta["collectionName"]}" (with id = {channel.id})')
    print(f'The last post in this channel was {meta["releaseDate"]}, and iTunes page lising all podcast episodes:')
    print(f'{meta["collectionViewUrl"]}\n')

12 channels

99 podcasts in "Breathe In: A Cystic Fibrosis Podcast" (with id = 1299822342)
The last post in this channel was 2019-12-05T11:00:00Z, and iTunes page lising all podcast episodes:
https://podcasts.apple.com/us/podcast/breathe-in-a-cystic-fibrosis-podcast/id1299822342?uo=4

22 podcasts in "Breaking Through" (with id = 1404167309)
The last post in this channel was 2020-03-25T19:04:00Z, and iTunes page lising all podcast episodes:
https://podcasts.apple.com/us/podcast/breaking-through/id1404167309?uo=4

47 podcasts in "Cystic Fibrosis Review" (with id = 300355725)
The last post in this channel was 2017-10-26T13:00:00Z, and iTunes page lising all podcast episodes:
https://podcasts.apple.com/us/podcast/cystic-fibrosis-review/id300355725?uo=4

208 podcasts in "Cystic Fibrosis Podcast" (with id = 1425889980)
The last post in this channel was 2019-07-25T20:37:00Z, and iTunes page lising all podcast episodes:
https://podcasts.apple.com/us/podcast/cystic-fibrosis-podcast/id1425889980

In [46]:
# The iTunes page only lists the first few episodes, so episode details are
# taken from the podcast publisher's own feed (not Apple's).

def checkKeys(podcast):
    """Return the publisher feed URL of a search result, if it has one.

    Parameters
    ----------
    podcast : object
        A search result exposing a dict-like ``json`` attribute.

    Returns
    -------
    The value stored under ``"feedUrl"`` in ``podcast.json``, or ``None``
    when the key is missing (not every result lists a feed).
    """
    # Read from the argument itself. The previous version read the global
    # loop variable `channel`, which only worked when the caller's loop
    # variable happened to have that exact name.
    if 'feedUrl' in podcast.json:
        return podcast.json["feedUrl"]
    return None

In [27]:
# Collect the publisher feed URL of every channel that lists one;
# channels for which checkKeys returns None are skipped.
feedarray = []
for channel in podsearch:
    feed_url = checkKeys(channel)
    if feed_url is None:
        continue
    feedarray.append(feed_url)
print(feedarray)

['http://feeds.soundcloud.com/users/soundcloud:users:339623866/sounds.rss', 'https://feeds.blubrry.com/feeds/breaking_through.xml', 'http://feeds.feedburner.com/eCysticFibrosis', 'http://feeds.soundcloud.com/users/soundcloud:users:492929361/sounds.rss', 'https://anchor.fm/s/efa8fa8/podcast/rss', 'https://reachmd.com/rss/itunes/517/', 'https://feeds.simplecast.com/9p8q_0Xo', 'http://feeds.feedburner.com/MyJourneyWithCF', 'http://feeds.feedburner.com/AheadOfTheCurveCysticFibrosis', 'http://feeds.soundcloud.com/users/soundcloud:users:420099012/sounds.rss', 'http://feeds.soundcloud.com/users/soundcloud:users:465418551/sounds.rss', 'https://anchor.fm/s/1e17f228/podcast/rss']


In [33]:
# Report how many channels survived the feed filter and how many were dropped.
feed_count = len(feedarray)
dropped_count = len(podsearch) - feed_count
print(f'{feed_count} channels from this search list their feed')
print(f'⇪ of them lost {dropped_count} by filtering for publisher feeds')

12 channels from this search list their feed
⇪ of them lost 0 by filtering for publisher feeds


In [60]:
# Fetch the first publisher feed to see which fields are available per episode.
# The request is sent with a browser-like User-Agent header.
request = urllib.request.Request(feedarray[0],headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'})

# This is an XML feed; inspection reveals that each podcast episode is
# encapsulated in <item> tags.
# The response is opened as a context manager so the connection is closed
# as soon as the document has been parsed (it was previously left open).
with urllib.request.urlopen(request) as source:
    soup = bs.BeautifulSoup(source,'lxml')
rows = soup.find_all('item')
print (f'This podcast channel has {len(rows)} episodes, and here is one episode from that feed:\n')
print (rows[0])

This podcast channel has 99 episodes, and here is one episode from that feed:

<item>
<guid ispermalink="false">tag:soundcloud,2010:tracks/717054508</guid>
<title>Breathe In #98 - The Legal Marriage Question</title>
<pubdate>Thu, 05 Dec 2019 11:00:03 +0000</pubdate>
<link/>https://soundcloud.com/user-996750830/breathe-in-98-the-legal-marriage-question
      <itunes:duration>00:34:01</itunes:duration>
<itunes:author>Gunnar Esiason and the Salty Cysters</itunes:author>
<itunes:explicit>no</itunes:explicit>
<itunes:summary>This week on Breathe In, Tiffany is on her own again while Gunnar battles his way through final exams. Tiffany is joined by a new voice on the podcast, Holly Seay, 24, from Georgia, a newlywed with CF. Holly takes us through her life with CF and how she met her now husband. Prior to their wedding, Holly and her now husband never lived together. Holly talks about the adjustment period and how her husband has acclimated to the CF life. She gives advice to those that are w

In [67]:
def checkEpisodes(url):
    """Download a podcast feed and return its ``<item>`` elements.

    Parameters
    ----------
    url : str
        Address of the publisher's feed.

    Returns
    -------
    The result of ``soup.find_all('item')`` for the downloaded document,
    or ``None`` when the feed could not be fetched or parsed.
    """
    request = urllib.request.Request(url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'})
    try:
        # The context manager closes the response even if reading fails.
        with urllib.request.urlopen(request) as source:
            payload = source.read()
    except urllib.error.URLError as e:
        # URLError is the base class of HTTPError, so this covers HTTP status
        # errors as well as DNS / connection failures.
        print (f'Failed {e}: {url}')
        return None
    try:
        soup = bs.BeautifulSoup(payload,'lxml')
        episodes = soup.find_all('item')
    except AttributeError:
        return None
    return episodes

In [84]:
# Count the episodes published in every feed.
for feed_url in feedarray:
    print (f'Working on {feed_url}:')
    episodes = checkEpisodes(feed_url)
    if episodes is None:
        # checkEpisodes signals a failed fetch/parse with None; len(None)
        # would raise TypeError and abort the remaining feeds.
        print ('no episodes retrieved\n')
        continue
    print (f'{len(episodes)}\n')

Working on http://feeds.soundcloud.com/users/soundcloud:users:339623866/sounds.rss:
99

Working on https://feeds.blubrry.com/feeds/breaking_through.xml:
22

Working on http://feeds.feedburner.com/eCysticFibrosis:
47

Working on http://feeds.soundcloud.com/users/soundcloud:users:492929361/sounds.rss:
208

Working on https://anchor.fm/s/efa8fa8/podcast/rss:
1

Working on https://reachmd.com/rss/itunes/517/:
4

Working on https://feeds.simplecast.com/9p8q_0Xo:
14

Working on http://feeds.feedburner.com/MyJourneyWithCF:
15

Working on http://feeds.feedburner.com/AheadOfTheCurveCysticFibrosis:
27

Working on http://feeds.soundcloud.com/users/soundcloud:users:420099012/sounds.rss:
4

Working on http://feeds.soundcloud.com/users/soundcloud:users:465418551/sounds.rss:
2

Working on https://anchor.fm/s/1e17f228/podcast/rss:
1



In [110]:
# NOTE(review): everything below is disabled scratch code kept inside a bare
# string literal, so none of it runs; the cell merely evaluates to the string.
# It sketches pulling title / pubdate / summary / duration out of each <item>
# in `rows`. TODO: either turn it into a real cell or delete it.
"""
        if (len(rows) > 0):
            print (f'This podcast channel has {len(rows)} episodes')
            for r in rows:
                title = r.find('itunes:title')
                print (title.text)
                pubdate = r.find('pubdate')
                print (pubdate.text)
                duration = r.find('itunes:duration')
                print (duration.text+'\n')
                
title = []
pubdate = []
description = []
duration = []
for r in rows:
    title = r.find('title')
    print (title.text)
    pubdate = r.find('pubdate')
    print (pubdate.text+ '\n')
   description = r.find('itunes:summary')
   print (description.text+ '\n')
    duration = r.find('itunes:duration')
    print (duration.text)
"""

"\ntitle = []\npubdate = []\ndescription = []\nduration = []\nfor r in rows:\n    title = r.find('title')\n    print (title.text)\n    pubdate = r.find('pubdate')\n    print (pubdate.text+ '\n')\n   description = r.find('itunes:summary')\n   print (description.text+ '\n')\n    duration = r.find('itunes:duration')\n    print (duration.text)\n"