In [None]:
# Can you tell between podcast
# Bring in multiple podcasts -> label -> train -> test
# Logistic Regression may be best
# Also check out binary models
# Python library to take in audio and convert to text
# Can you classify ads in audio
# Check out LSTM for NLP
# https://www.kaggle.com/competitions/goodreads-books-reviews-290312/data

# Look for breaks in the data (datetime)
# See if duration is correlated to the time diffs
# Look for ratings
# Principal Component analysis
# Principal Coordinate analysis
# Factor analysis
# Try different similarity matrix
# Do something other than fuzzywuzzy
# Try NLTK name finder
# Concat dfs with diff column names
# Dendrograms
# Other cluster vizs
# Possibly learn/use time series analysis

In [3]:
from IPython.display import clear_output

In [4]:
!pip3 install -r requirements.txt
clear_output()

In [5]:
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import statsmodels

In [39]:
import multiprocessing
num_cores = multiprocessing.cpu_count()
from joblib import Parallel, delayed

In [143]:
class Parser:
    """ Parses out rss feeds"""

    def __init__(self, start_date='2022-10-01', end_date='2022-11-01'):
        
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = datetime.strptime(end_date, '%Y-%m-%d')

    def retrieve_rss_feed(self, url):
        xml_data = feedparser.parse(url)
        return pd.DataFrame(xml_data.entries)
            

In [245]:
class AdvancedParser:
    """ Parses out rss feeds"""

    def __init__(self, start_date='2022-10-01', end_date='2022-11-01', topic_urls_dict = {}, col_names=[]):
        
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = datetime.strptime(end_date, '%Y-%m-%d')
        self.topic_urls_dict = topic_urls_dict
        self.all_dfs = []
        self.col_names = col_names
        
        if not self.col_names:
        # These are the common columns I've seen in podcast rss feeds
            self.col_names = ['published_parsed',
                                'id',
                                'links',
                                'summary',
                                'title',
                                'summary_detail',
                                'title_detail',
                                'published',
                                'guidislink']

    def _retrieve_rss_feed(self, url, topic=''):
        xml_data = feedparser.parse(url)
        df = pd.DataFrame(xml_data.entries)
        df = self._ensure_columns(df, self.col_names)
        df['topic'] = topic
        self.all_dfs.append(df)

    @staticmethod
    def _ensure_columns(df, col_names):
        return df[col_names]

    def gather_data(self):
        assert len(self.topic_urls_dict)

        for topic, urls in self.topic_urls_dict.items():
            for url in urls:
                self._retrieve_rss_feed(url, topic)

        final_df = pd.concat(self.all_dfs)
        final_df.reset_index(drop=True, inplace=True)
        return final_df

In [246]:
topic_urls_dict = {'news' : ["http://rss.cnn.com/rss/cnn_topstories.rss",
                                "https://www.huffpost.com/section/front-page/feed?x=1",
                                "https://feeds.simplecast.com/54nAGcIl",
                                "https://feeds.feedburner.com/Monocle24TheGlobalist",
                                "https://www.theguardian.com/news/series/todayinfocus/podcast.xml"],
                'crime' : ["https://feeds.simplecast.com/qm_9xx0g",
                                "https://rss.art19.com/morbid-a-true-crime-podcast",
                                "https://rss.art19.com/erm-mfm",
                                "https://feeds.megaphone.fm/VMP7924981569",
                                "https://www.omnycontent.com/d/playlist/d83f52e4-2455-47f4-982e-ab790120b954/82e70870-d45e-4b4c-8e17-ab8600091b59/e20bfe1d-24c8-4809-b7b0-ab8600091b62/podcast.rss",
                                "https://feeds.megaphone.fm/darknetdiaries",
                                "https://feeds.simplecast.com/GdzgJRQH",
                                "https://feeds.simplecast.com/xl36XBC2",
                                "https://rss.art19.com/dr-death",
                                "https://podcastfeeds.nbcnews.com/HL4TzgYC"],
                'science' : ["https://feeds.simplecast.com/FO6kxYGj",
                                "http://feeds.feedburner.com/radiolab",
                                "https://media.rss.com/fm/feed.xml",
                                "https://omnycontent.com/d/playlist/e73c998e-6e60-432f-8610-ae210140c5b1/6EA152C0-9E3A-45DE-8672-AE2F0056B113/D8936746-9E22-4DBA-B762-AE2F0056B126/podcast.rss",
                                "https://www.thenakedscientists.com/naked_scientists_podcast.xml"],
                'sport' : ["https://media.rss.com/progressivepodcast/feed.xml",
                                "https://media.rss.com/wholeninesports/feed.xml",
                                "https://media.rss.com/moodysportswithdanandzach/feed.xml",
                                "https://media.rss.com/thejosephvorepodcast/feed.xml",
                                "https://media.rss.com/wholeshabang/feed.xml",
                                "https://mcsorleys.barstoolsports.com/feed/pardon-my-take",
                                "https://feeds.megaphone.fm/ESP2298543312",
                                "https://feeds.megaphone.fm/ESP3500611186",
                                "https://feeds.megaphone.fm/ESP3025643506"]}

a_ps = AdvancedParser('2022-10-01', '2022-11-01', topic_urls_dict)

ps = Parser('2022-10-01', '2022-11-01')

In [247]:
all_pc_df = a_ps.gather_data()

In [250]:
# This is saving off the data
# all_pc_df.to_pickle('all_podcast_info.pkl')

# Ingesting the pkl file
all_pc_df = pd.read_pickle('all_podcast_info.pkl')

In [251]:
all_pc_df.shape

(10531, 10)

In [262]:
# Getting rid of rows where summary is not populated
all_pc_df.dropna(inplace=True)
all_pc_df.reset_index(inplace=True, drop=True)

In [263]:
all_pc_df.shape

(10497, 10)

In [264]:
all_pc_df.drop_duplicates(subset='summary', inplace=True)

In [265]:
#Checking to see if my data is evenly spread
all_pc_df.groupby('topic').count()

Unnamed: 0_level_0,published_parsed,id,links,summary,title,summary_detail,title_detail,published,guidislink
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
crime,2417,2417,2417,2417,2417,2417,2417,2417,2417
news,2162,2162,2162,2162,2162,2162,2162,2162,2162
science,3126,3126,3126,3126,3126,3126,3126,3126,3126
sport,2546,2546,2546,2546,2546,2546,2546,2546,2546


In [266]:
# Since the amount for each topic is not the same, I want to balance this to not influence the model. 
all_topics_list = []
for topic, count in all_pc_df.topic.value_counts().to_dict().items():
    all_topics_list.append(all_pc_df[all_pc_df['topic'] == topic])

# Grabbing the small amount to use as our sample amount for underfitting
smallest_topic_amount = min(all_pc_df.topic.value_counts().to_dict().values())

# This is where we will underfit our data
temp_dfs = []
for temp_df in all_topics_list:
    temp_dfs.append(temp_df.sample(smallest_topic_amount))

fitted_df = pd.concat(temp_dfs)

In [267]:
#Checking to see if my data is evenly spread
fitted_df.groupby('topic').count()

Unnamed: 0_level_0,published_parsed,id,links,summary,title,summary_detail,title_detail,published,guidislink
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
crime,2162,2162,2162,2162,2162,2162,2162,2162,2162
news,2162,2162,2162,2162,2162,2162,2162,2162,2162
science,2162,2162,2162,2162,2162,2162,2162,2162,2162
sport,2162,2162,2162,2162,2162,2162,2162,2162,2162


In [196]:
# CNN rss
cnn_df = ps.retrieve_rss_feed("http://rss.cnn.com/rss/cnn_topstories.rss")

#Huffington Post rss
hf_df = ps.retrieve_rss_feed("https://www.huffpost.com/section/front-page/feed?x=1")

In [142]:
cnn_df

In [72]:
list(set(hf_df.columns) & set(cj_df.columns) & set(cnn_df.columns) & set(morbid_df.columns))

['published_parsed',
 'id',
 'links',
 'summary',
 'title',
 'summary_detail',
 'title_detail',
 'published',
 'guidislink']

In [9]:
#Crime Junkie rss
cj_df = ps.retrieve_rss_feed('https://feeds.simplecast.com/qm_9xx0g')

In [64]:
cj_df.head()

Unnamed: 0,id,guidislink,title,title_detail,summary,summary_detail,published,published_parsed,authors,author,...,link,content,itunes_title,itunes_duration,subtitle,subtitle_detail,itunes_explicit,itunes_episodetype,image,CJ_text_to_compare
0,ba6640bb-9ec4-40b8-828e-f2d16cdaf2e6,False,MURDERED: The Short Family,"{'type': 'text/plain', 'language': None, 'base...",When Michael and Mary Short are found murdered...,"{'type': 'text/plain', 'language': None, 'base...","Mon, 14 Nov 2022 08:05:00 +0000","(2022, 11, 14, 8, 5, 0, 0, 318, 0)","[{'name': 'audiochuck', 'email': 'content@audi...",audiochuck,...,https://crimejunkiepodcast.com/,"[{'type': 'text/html', 'language': None, 'base...",MURDERED: The Short Family,00:26:52,When Michael and Mary Short are found murdered...,"{'type': 'text/plain', 'language': None, 'base...",,full,,The Short Family
1,75e27b81-7e10-49e6-ba5b-beff377bfbb7,False,MISSING: Arianna Fitts,"{'type': 'text/plain', 'language': None, 'base...",When Nicole Fitts is found murdered in a park ...,"{'type': 'text/plain', 'language': None, 'base...","Mon, 14 Nov 2022 08:00:00 +0000","(2022, 11, 14, 8, 0, 0, 0, 318, 0)","[{'name': 'audiochuck', 'email': 'content@audi...",audiochuck,...,https://crimejunkiepodcast.com/,"[{'type': 'text/html', 'language': None, 'base...",MISSING: Arianna Fitts,00:21:50,When Nicole Fitts is found murdered in a park ...,"{'type': 'text/plain', 'language': None, 'base...",,full,,Arianna Fitts
2,8cae95f2-3d82-461b-aaff-106b916ebce7,False,WANTED: Justice for Hassani Campbell and Tiann...,"{'type': 'text/plain', 'language': None, 'base...",Five-year-old Hassani Campbell disappears with...,"{'type': 'text/plain', 'language': None, 'base...","Mon, 7 Nov 2022 08:00:00 +0000","(2022, 11, 7, 8, 0, 0, 0, 311, 0)","[{'name': 'audiochuck', 'email': 'content@audi...",audiochuck,...,https://crimejunkiepodcast.com/,"[{'type': 'text/html', 'language': None, 'base...",WANTED: Justice for Hassani Campbell and Tiann...,00:34:23,Five-year-old Hassani Campbell disappears with...,"{'type': 'text/plain', 'language': None, 'base...",,full,,Justice for Hassani Campbell and Tianna Kirchner
3,a549f4a1-f363-4d8c-8df9-0730fa8a46fc,False,CAPTURED: Killer on the High Bridge,"{'type': 'text/plain', 'language': None, 'base...","Today, October 31, 2022, the Indiana State Pol...","{'type': 'text/plain', 'language': None, 'base...","Mon, 31 Oct 2022 19:55:00 +0000","(2022, 10, 31, 19, 55, 0, 0, 304, 0)","[{'name': 'audiochuck', 'email': 'content@audi...",audiochuck,...,https://crimejunkiepodcast.com/,"[{'type': 'text/html', 'language': None, 'base...",CAPTURED: Killer on the High Bridge,00:08:13,"Today, October 31, 2022, the Indiana State Pol...","{'type': 'text/plain', 'language': None, 'base...",,full,,Killer on the High Bridge
4,feec70d7-f20d-46a9-a223-d9cc497530fd,False,MISSING: Celina Mays,"{'type': 'text/plain', 'language': None, 'base...",Celina Mays was 12 years old when she vanished...,"{'type': 'text/plain', 'language': None, 'base...","Mon, 31 Oct 2022 07:00:00 +0000","(2022, 10, 31, 7, 0, 0, 0, 304, 0)","[{'name': 'audiochuck', 'email': 'content@audi...",audiochuck,...,https://crimejunkiepodcast.com/,"[{'type': 'text/html', 'language': None, 'base...",MISSING: Celina Mays,00:51:30,Celina Mays was 12 years old when she vanished...,"{'type': 'text/plain', 'language': None, 'base...",,full,,Celina Mays


In [12]:
# Morbid rss
morbid_df = ps.retrieve_rss_feed('https://rss.art19.com/morbid-a-true-crime-podcast')

In [63]:
morbid_df.tags[0]

[{'term': 'True Crime', 'scheme': 'http://www.itunes.com/', 'label': None}]

In [14]:
# Cleaning up the title to obtain only a person's name
cj_df['CJ_text_to_compare'] = cj_df.title.str.split(':')
cj_df['CJ_text_to_compare'] = cj_df['CJ_text_to_compare'].apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cj_df['CJ_text_to_compare'] = cj_df['CJ_text_to_compare'].str.replace('[0-9]:;",-', '', regex=True)
cj_df['CJ_text_to_compare'] = cj_df['CJ_text_to_compare'].str.replace('Part [0-9]', '', regex=True)

morbid_df['Morbid_text_to_compare'] = morbid_df.title.str.split(':')
morbid_df['Morbid_text_to_compare'] = morbid_df['Morbid_text_to_compare'].apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
morbid_df['Morbid_text_to_compare'] = morbid_df['Morbid_text_to_compare'].str.replace('[0-9]:;",-', '', regex=True)
morbid_df['Morbid_text_to_compare'] = morbid_df['Morbid_text_to_compare'].str.replace('Part [0-9]', '', regex=True)

In [15]:
# Getting the text to compare into lists
cj_titles = cj_df.CJ_text_to_compare.tolist()
morbid_titles = morbid_df.Morbid_text_to_compare.tolist()

In [41]:
def title_comparison(title1, titles_to_compare):
    return (title1, process.extract(title1, titles_to_compare, limit=3, scorer=fuzz.WRatio))

In [None]:
# Using joblib
potential_matches = Parallel(n_jobs = num_cores)(delayed(title_comparison)(title1, morbid_titles) for title1 in cj_titles)

# using for loop
# potential_matches = []
# for i in cj_titles:
#     potential_matches.append((i, process.extract(i, morbid_titles, limit=3, scorer=fuzz.WRatio)))

# First time comparing 285 x 449 titles took 42 seconds

In [17]:
#Going through the potential matches to set a threshold for which matches to keep
good_matches = []
threshold = 88
for pm in potential_matches:
    for t_matches in pm[1]:

        if t_matches[1] > threshold:
            good_matches.append({'CJ_Title' : pm[0],
                                'Morbid_Title' : t_matches[0],
                                'Score' : t_matches[1]
                                }   
            )
matches_df = pd.DataFrame(good_matches)     

In [18]:
matches_df.head()

Unnamed: 0,CJ_Title,Morbid_Title,Score
0,Polly Klaas,The Abduction and Murder of Polly Klaas,90
1,Tara Calico,Tara Calico,100
2,Phoebe Handsjuk,The Mysterious Death of Phoebe Handsjuk 2,90
3,Phoebe Handsjuk,The Mysterious Death Of Phoebe Handsjuk 1,90
4,Brittanee Drexel,**AD FREE** The Mysterious Disappearance of Br...,90


In [45]:
#merging the morbid and cj dataframes with the matches_df
joined_df = matches_df.merge(cj_df[['CJ_text_to_compare', 'published']], how='left', left_on='CJ_Title', right_on='CJ_text_to_compare')
joined_df = joined_df.merge(morbid_df[['Morbid_text_to_compare', 'published']], how='left', left_on='Morbid_Title', right_on='Morbid_text_to_compare')

In [44]:
joined_df.head()

Unnamed: 0,CJ_Title,Morbid_Title,Score,CJ_text_to_compare,CJ_Date,Morbid_text_to_compare,Morbid_Date,CJ_First,Time_Diff,Abs_Time_Diff,First_Date
0,Polly Klaas,The Abduction and Murder of Polly Klaas,90,Polly Klaas,2022-05-09 07:00:00+00:00,The Abduction and Murder of Polly Klaas,2020-01-20 05:23:36+00:00,False,840 days 01:36:24,840,2020-01-20 05:23:36+00:00
1,Tara Calico,Tara Calico,100,Tara Calico,2020-09-28 07:00:00+00:00,Tara Calico,2021-07-29 20:18:00+00:00,True,-305 days +10:42:00,305,2020-09-28 07:00:00+00:00
2,Phoebe Handsjuk,The Mysterious Death of Phoebe Handsjuk 2,90,Phoebe Handsjuk,2020-07-27 07:00:00+00:00,The Mysterious Death of Phoebe Handsjuk 2,2019-06-05 03:55:38+00:00,False,418 days 03:04:22,418,2019-06-05 03:55:38+00:00
3,Phoebe Handsjuk,The Mysterious Death Of Phoebe Handsjuk 1,90,Phoebe Handsjuk,2020-07-27 07:00:00+00:00,The Mysterious Death Of Phoebe Handsjuk 1,2019-06-05 03:55:38+00:00,False,418 days 03:04:22,418,2019-06-05 03:55:38+00:00
4,Brittanee Drexel,**AD FREE** The Mysterious Disappearance of Br...,90,Brittanee Drexel,2020-01-13 08:00:00+00:00,**AD FREE** The Mysterious Disappearance of Br...,2021-01-23 16:00:01+00:00,True,-377 days +15:59:59,377,2020-01-13 08:00:00+00:00


In [46]:
# Had to rename the date columns
joined_df.rename(columns={
    'published_x' : 'CJ_Date',
    'published_y' : 'Morbid_Date'
    }, inplace=True)

In [47]:
joined_df['CJ_Date'] = pd.to_datetime(joined_df['CJ_Date'])
joined_df['Morbid_Date'] = pd.to_datetime(joined_df['Morbid_Date'])

In [48]:
# Building the scatterplot
# Need to research a better way perhaps
fig = go.Figure()
fig.add_trace(go.Scatter(mode='markers',
            x=joined_df["CJ_Date"], 
            y=joined_df["CJ_Title"], 
            marker=dict(
            color='Blue'
            ),
            name='Crime Junkie'

    )
)
fig.add_trace(go.Scatter(mode='markers',
            x=joined_df["Morbid_Date"], 
            y=joined_df["CJ_Title"], 
            marker=dict(
            color='Red'
            ),
            name='Morbid'
    )
)
fig.show()

In [54]:
# Had to overwrite the dates to set the first date for each topic as there are multiple episodes per title
joined_df[['CJ_Date', 'Morbid_Date']] = joined_df.groupby('CJ_Title')[['CJ_Date', 'Morbid_Date']].transform('min')

In [55]:
def get_first_date(row):
    if row['CJ_Date'] > row['Morbid_Date']:
        return row['Morbid_Date']
    else:
        return row['CJ_Date']

In [56]:
# Setting a flag for which podcast released first and the time difference
# If you do a comparison without doing a group by this will not work
joined_df['CJ_First'] = joined_df['CJ_Date'] < joined_df['Morbid_Date']
joined_df['First_Date'] = joined_df.apply(get_first_date, axis=1)
joined_df['Time_Diff'] = joined_df['CJ_Date'] - joined_df['Morbid_Date']
joined_df['Abs_Time_Diff'] = joined_df['Time_Diff'].apply(lambda x: abs(x.days))

In [57]:
# Showing how many times CJ posted the topic first
fig1 = px.bar(joined_df, x='CJ_First')
fig1.show()

In [58]:
# Showing the time difference per episode between the first post and the second post
time_chart = px.bar(joined_df, 
                    x='CJ_Title',
                    y='Abs_Time_Diff',
                    color='CJ_First')
time_chart.show()

In [59]:
# realized from the above chart that some episodes have multiple parts
# Wanted to only see the shortes window between posts
# Sorted by Abs_Time_Diff with ascending = True to keep the smallest values when the drop_dups is applied
time_chart = px.bar(joined_df.sort_values(by='Abs_Time_Diff').drop_duplicates(subset=['CJ_Title']), 
                    x='CJ_Title',
                    y='Abs_Time_Diff',
                    color='CJ_First')
time_chart.show()

In [53]:
# Here I realized my methodolgy was off for the initial time comparison
# I'm going back to do a group_by to change the comparison
joined_df[joined_df['CJ_Title']=='Israel Keyes']

Unnamed: 0,CJ_Title,Morbid_Title,Score,CJ_text_to_compare,CJ_Date,Morbid_text_to_compare,Morbid_Date,CJ_First,Time_Diff,Abs_Time_Diff
10,Israel Keyes,Israel Keyes 2,95,Israel Keyes,2019-05-06 07:00:00+00:00,Israel Keyes 2,2019-05-07 04:04:11+00:00,True,-1 days +02:55:49,1
11,Israel Keyes,Israel Keyes 1,95,Israel Keyes,2019-05-06 07:00:00+00:00,Israel Keyes 1,2019-04-27 03:30:39+00:00,False,9 days 03:29:21,9


In [60]:
# trying to see the trend over time for each podcast
time_chart = px.scatter(joined_df.sort_values(by='Abs_Time_Diff').drop_duplicates(subset=['CJ_Title']), 
                    x='First_Date',
                    y='Abs_Time_Diff',
                    color='CJ_First',
                    trendline='ols')
time_chart.show()