In [1]:
from IPython.display import clear_output

In [2]:
!pip3 install -r requirements.txt
clear_output()

In [3]:
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
import statsmodels

In [4]:
import multiprocessing
# Number of CPUs reported by the operating system.
# NOTE(review): presumably meant as the worker count for joblib.Parallel below —
# confirm against the cells that use it.
num_cores = multiprocessing.cpu_count()
from joblib import Parallel, delayed

In [5]:
class AdvancedParser:
    """Collect the entries of topic-grouped RSS feeds into one pandas DataFrame.

    Parameters
    ----------
    start_date, end_date : str
        Dates in ``YYYY-MM-DD`` format. They are parsed into ``datetime``
        objects and stored on the instance; no method of this class filters
        on them.
    topic_urls_dict : dict, optional
        Mapping of topic label -> iterable of feed URLs. Defaults to an empty
        dict (a fresh one per instance).
    col_names : list, optional
        Columns to keep from every feed. When empty or omitted, a copy of
        ``DEFAULT_COL_NAMES`` is used.

    Attributes
    ----------
    all_dfs : list
        One DataFrame per feed fetched by the most recent ``gather_data`` call.
    """

    # These are the common columns I've seen in podcast rss feeds
    DEFAULT_COL_NAMES = ('published_parsed',
                         'id',
                         'links',
                         'summary',
                         'title',
                         'summary_detail',
                         'title_detail',
                         'published',
                         'guidislink')

    def __init__(self, start_date='2022-10-01', end_date='2022-11-01', topic_urls_dict=None, col_names=None):
        self.start_date = datetime.strptime(start_date, '%Y-%m-%d')
        self.end_date = datetime.strptime(end_date, '%Y-%m-%d')
        # `None` sentinels instead of `{}` / `[]` defaults: a mutable default is
        # created once and would be shared by every instance.
        self.topic_urls_dict = {} if topic_urls_dict is None else topic_urls_dict
        self.all_dfs = []
        self.col_names = col_names if col_names else list(self.DEFAULT_COL_NAMES)

    def _retrieve_rss_feed(self, url, topic=''):
        """Fetch one feed, normalise its columns, tag it with `topic`, and
        append the resulting frame to ``self.all_dfs``."""
        xml_data = feedparser.parse(url)
        df = pd.DataFrame(xml_data.entries)
        df = self._ensure_columns(df, self.col_names)
        df['topic'] = topic
        self.all_dfs.append(df)

    @staticmethod
    def _ensure_columns(df, col_names):
        """Return a new frame holding exactly `col_names`, in that order.

        Columns missing from `df` (including the case of a feed with no
        entries at all) are created and filled with NaN instead of raising
        ``KeyError``; columns not listed are dropped.
        """
        return df.reindex(columns=col_names)

    def gather_data(self):
        """Fetch every configured feed and return all entries as one DataFrame.

        Returns
        -------
        pandas.DataFrame
            The per-feed frames concatenated, with a fresh 0..n-1 index.

        Raises
        ------
        AssertionError
            If ``topic_urls_dict`` is empty.
        """
        assert len(self.topic_urls_dict), 'topic_urls_dict is empty; nothing to fetch'

        # Start from a clean slate so that calling this method again (e.g. when
        # the notebook cell is re-run) does not duplicate previously fetched feeds.
        self.all_dfs = []

        for topic, urls in self.topic_urls_dict.items():
            for url in urls:
                self._retrieve_rss_feed(url, topic)

        return pd.concat(self.all_dfs, ignore_index=True)

In [None]:
# Feed URLs to scrape, grouped by the topic label each feed's entries will carry.
topic_urls_dict = {
    'news': [
        "http://rss.cnn.com/rss/cnn_topstories.rss",
        "https://www.huffpost.com/section/front-page/feed?x=1",
        "https://feeds.simplecast.com/54nAGcIl",
        "https://feeds.feedburner.com/Monocle24TheGlobalist",
        "https://www.theguardian.com/news/series/todayinfocus/podcast.xml",
    ],
    'crime': [
        "https://feeds.simplecast.com/qm_9xx0g",
        "https://rss.art19.com/morbid-a-true-crime-podcast",
        "https://rss.art19.com/erm-mfm",
        "https://feeds.megaphone.fm/VMP7924981569",
        "https://www.omnycontent.com/d/playlist/d83f52e4-2455-47f4-982e-ab790120b954/82e70870-d45e-4b4c-8e17-ab8600091b59/e20bfe1d-24c8-4809-b7b0-ab8600091b62/podcast.rss",
        "https://feeds.megaphone.fm/darknetdiaries",
        "https://feeds.simplecast.com/GdzgJRQH",
        "https://feeds.simplecast.com/xl36XBC2",
        "https://rss.art19.com/dr-death",
        "https://podcastfeeds.nbcnews.com/HL4TzgYC",
    ],
    'science': [
        "https://feeds.simplecast.com/FO6kxYGj",
        "http://feeds.feedburner.com/radiolab",
        "https://media.rss.com/fm/feed.xml",
        "https://omnycontent.com/d/playlist/e73c998e-6e60-432f-8610-ae210140c5b1/6EA152C0-9E3A-45DE-8672-AE2F0056B113/D8936746-9E22-4DBA-B762-AE2F0056B126/podcast.rss",
        "https://www.thenakedscientists.com/naked_scientists_podcast.xml",
    ],
    'sport': [
        "https://media.rss.com/progressivepodcast/feed.xml",
        "https://media.rss.com/wholeninesports/feed.xml",
        "https://media.rss.com/moodysportswithdanandzach/feed.xml",
        "https://media.rss.com/thejosephvorepodcast/feed.xml",
        "https://media.rss.com/wholeshabang/feed.xml",
        "https://mcsorleys.barstoolsports.com/feed/pardon-my-take",
        "https://feeds.megaphone.fm/ESP2298543312",
        "https://feeds.megaphone.fm/ESP3500611186",
        "https://feeds.megaphone.fm/ESP3025643506",
    ],
}

# Parser configured with the feed map above and an explicit date window.
a_ps = AdvancedParser(
    start_date='2022-10-01',
    end_date='2022-11-01',
    topic_urls_dict=topic_urls_dict,
)

In [6]:
# Load the cached scrape when it is available; otherwise fetch every feed once
# and save the result so later runs do not hit the network again.
# NOTE(review): unpickling can execute arbitrary code — only load a pickle file
# that you created yourself.
try:
    all_pc_raw_df = pd.read_pickle('all_podcast_info.pkl')
except FileNotFoundError:
    all_pc_raw_df = a_ps.gather_data()
    all_pc_raw_df.to_pickle('all_podcast_info.pkl')

# dropna() without arguments discards every row that has a missing value in ANY
# column (not only `summary`). Building a new frame instead of mutating in place
# keeps this cell safe to re-run.
all_pc_df = all_pc_raw_df.dropna().reset_index(drop=True)

In [7]:
# Per-topic count of non-null values in each column, to see how evenly the topics are represented
all_pc_df.groupby(by='topic').count()

Unnamed: 0_level_0,published_parsed,id,links,summary,title,summary_detail,title_detail,published,guidislink
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
crime,2424,2424,2424,2424,2424,2424,2424,2424,2424
news,2169,2169,2169,2169,2169,2169,2169,2169,2169
science,3357,3357,3357,3357,3357,3357,3357,3357,3357
sport,2553,2553,2553,2553,2553,2553,2553,2553,2553


In [8]:
# The topics do not have the same number of rows, so balance them to keep the
# larger classes from biasing the model.

# Fixed seed so the undersampled set is identical on every run of the notebook.
SAMPLE_SEED = 42

# Row count per topic, computed once.
topic_counts = all_pc_df['topic'].value_counts()

# One frame per topic, in the same order as `topic_counts`.
all_topics_list = [all_pc_df[all_pc_df['topic'] == topic] for topic in topic_counts.index]

# The size of the smallest topic is the per-topic sample size for undersampling.
smallest_topic_amount = int(topic_counts.min())

# Undersample: draw the same number of rows (without replacement) from every topic.
temp_dfs = [
    topic_df.sample(smallest_topic_amount, random_state=SAMPLE_SEED)
    for topic_df in all_topics_list
]

fitted_df = pd.concat(temp_dfs)

In [9]:
# Per-topic count of non-null values in each column, to confirm the topics are now the same size
fitted_df.groupby(by='topic').count()

Unnamed: 0_level_0,published_parsed,id,links,summary,title,summary_detail,title_detail,published,guidislink
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
crime,2169,2169,2169,2169,2169,2169,2169,2169,2169
news,2169,2169,2169,2169,2169,2169,2169,2169,2169
science,2169,2169,2169,2169,2169,2169,2169,2169,2169
sport,2169,2169,2169,2169,2169,2169,2169,2169,2169
