In [7]:
import pandas as pd

# Load every per-epoch export plus the leftover epoch-3 file and stack them
# into one frame with a fresh 0..n-1 index.
epoch_files = [f'episodes_data_epoch_{i}.csv' for i in range(1, 6)]
epoch_files.append('episodes_data_epoch_3_remain.csv')
episodes_data = pd.concat(
    [pd.read_csv(path) for path in epoch_files],
    ignore_index=True,
)

# drop_duplicates returns a NEW frame unless inplace=True is passed; the result
# has to be assigned, otherwise repeated episode_ids silently survive.
# Order is kept as written: de-duplicate first, then drop rows with any NaN,
# then renumber the index so positional and label lookups agree.
episodes_data = (
    episodes_data
    .drop_duplicates(subset='episode_id', keep='first')
    .dropna()
    .reset_index(drop=True)
)

In [9]:
# Persist the combined, cleaned frame; index=False keeps the row index out of the CSV.
episodes_data.to_csv('episodes_data.csv', index=False)

In [8]:
# Display the combined frame (the notebook rendering truncates the middle rows).
episodes_data

Unnamed: 0,episode_id,episode_name,episode_description,show_id,show_category
0,0347KaI75ADlF0HLYGKMhR,"Fashion Tier List (Supreme, Rick Owens, etc.)",Max Bratter and Allen Underwood rank some of t...,3ozYIoFrKrBkn6XzlaZZe8,Arts & Entertainment
1,4n9xXUvxepubHp9OuvFmX5,Is Ken Carson the true successor to Playboi Ca...,Max Bratter and Allen Underwood spend this epi...,3ozYIoFrKrBkn6XzlaZZe8,Arts & Entertainment
2,7LLAkFrGLPGM4hpwYOeGq3,A&E is For All the Dogs,Max Bratter and Allen Underwood dedicate this ...,3ozYIoFrKrBkn6XzlaZZe8,Arts & Entertainment
3,4uBmFBXNFQmo8eG5OqIwhD,Is Lil Tecca falling off? Which new songs and ...,In the first episode of The Spectator’s Arts &...,3ozYIoFrKrBkn6XzlaZZe8,Arts & Entertainment
4,4sopLohQdirkxwGoQuQm0T,Is Dax Shepard Bert's New Best Friend? | 2 Be...,Don’t forget to check out Dax Shepard’s own po...,1PgDUTgeyu3FOzK1FcBoqa,Arts & Entertainment
...,...,...,...,...,...
514585,39Ayo7uW568hhEM03QQGWe,Episode 45: Tom Brown Part 3,"Tom Brown was a popular, well liked senior at ...",7EvBmo358g2gFkvJKqY7KB,True Crime
514586,20HU5OYUYGjGQU1qkECUf1,Episode 46: Alex Van Dalsen,Alex Van Dalsen left his house on foot at 11:4...,7EvBmo358g2gFkvJKqY7KB,True Crime
514587,5XwDzbYPp3G2ZJsU1qxmgE,Episode 47: Brian Deneke,"Brian Deneke was an artist, musician and leade...",7EvBmo358g2gFkvJKqY7KB,True Crime
514588,6Gszyqw77BOPkyvfZOiXDJ,Episode 48: Theodore Robert Wright III,"Theodore Robert Wright III, aka, T.R. was a we...",7EvBmo358g2gFkvJKqY7KB,True Crime


In [10]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
import nltk

# Fetch the tokenizer model and the stop-word corpus used below.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words plus domain terms that show up in almost every
# podcast description and therefore carry no signal.
stop_words = stopwords.words('english') + [
    'show', 'shows',
    'podcast', 'podcasts',
    'episode', 'episodes',
]

def preprocess_text(text):
    """Tokenize a description into lower-cased, alphanumeric, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    list of str
        Tokens in their original order, after URLs are stripped, the text is
        lower-cased and tokenized with ``word_tokenize``, and every token that
        is not purely alphanumeric or that appears in the module-level
        ``stop_words`` is removed.
    """
    # Strip URLs first so their fragments do not leak into the token list.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    tokens = word_tokenize(text.lower())
    # Build the set once per call: membership is then O(1) per token instead of
    # a linear scan over the stop-word list, and it always reflects the current
    # contents of ``stop_words``.
    excluded = set(stop_words)
    meaningful_words = [word for word in tokens if word.isalnum() and word not in excluded]
    return meaningful_words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zxr01\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zxr01\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
import numpy as np
from scipy.sparse import csr_matrix, save_npz

# One Counter of token frequencies per episode description, in row order.
word_counts = [
    Counter(preprocess_text(description))
    for description in episodes_data['episode_description']
]

In [12]:
# Read the vocabulary file: one entry per line, surrounding whitespace removed.
with open("filtered_words.txt", "r") as vocab_file:
    filtered_words = list(map(str.strip, vocab_file))

In [16]:
# Build the (row, col, value) triplets for the episode-by-vocabulary count matrix.
rows = []
cols = []
data = []

# Map each vocabulary word to its column once. The original code ran
# `word in filtered_words` and `filtered_words.index(word)` for every word of
# every episode, which is a linear scan of the whole vocabulary each time.
# setdefault keeps the FIRST position of a repeated word, matching list.index().
word_to_col = {}
for col, vocab_word in enumerate(filtered_words):
    word_to_col.setdefault(vocab_word, col)

for i, word_count in enumerate(word_counts):
    for word, count in word_count.items():
        col = word_to_col.get(word)
        if col is not None:  # skip words outside the filtered vocabulary
            rows.append(i)
            cols.append(col)
            data.append(count)

In [17]:
# Assemble the count matrix: one row per entry of word_counts, one column per
# vocabulary word, then persist it in SciPy's .npz format.
n_rows = len(word_counts)
n_cols = len(filtered_words)
sparse_matrix = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
save_npz('sparse_matrix.npz', sparse_matrix)

In [18]:
from sklearn.decomposition import TruncatedSVD
import joblib

# Load the previously fitted decomposition model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'pca_model.pkl' if it comes from a trusted source.
# NOTE(review): the file is named "pca" but TruncatedSVD is what is imported
# above; presumably the pickled object is a TruncatedSVD -- confirm.
svd = joblib.load('pca_model.pkl')

In [24]:
# Sanity check: (number of word_counts rows, number of filtered vocabulary words).
sparse_matrix.shape

(514590, 34223)

In [25]:
# Project the count matrix through the fitted model in row chunks and keep
# only the leading components of each projection.
CHUNK_SIZE = 10000    # rows transformed per call
N_COMPONENTS = 87     # leading components retained from each projection

# Derive the chunk boundaries from the matrix itself instead of hard-coding
# them to one particular row count, and stack once at the end rather than
# re-copying a growing array on every iteration.
component_chunks = []
for start in range(0, sparse_matrix.shape[0], CHUNK_SIZE):
    chunk = sparse_matrix[start:start + CHUNK_SIZE, :]
    component_chunks.append(svd.transform(chunk)[:, :N_COMPONENTS])

principal_components = np.vstack(component_chunks)
np.save('principal_components.npy', principal_components)

In [26]:
# Sanity check: one row per row of sparse_matrix, one column per retained component.
principal_components.shape

(514590, 87)

In [47]:
# Pull the first row whose episode_id matches the episode of interest.
is_target_episode = episodes_data['episode_id'] == '2z9c5FLNmTs9WiJckkkN8a'
ep = episodes_data.loc[is_target_episode].iloc[0]
ep

episode_id                                        2z9c5FLNmTs9WiJckkkN8a
episode_name               Can't Sleep? Catch Some ZZZs With These Songs
episode_description    There are songs you want to groove to, and son...
show_id                                           25auwWceZnlOEK6SwtO1y1
show_category                                       Arts & Entertainment
Name: 69, dtype: object

In [50]:
# Full description text for the row labelled 69 (the episode selected above).
episodes_data.loc[69, 'episode_description']

"There are songs you want to groove to, and songs you want to pump up to. But sometimes, you just want something for the come down at the end of a long day. If you're having trouble sleeping in silence, or are just looking for some new songs to throw into your bedtime rotation, we've got you covered. In this encore episode, we are recommending three great songs that will help you fall asleep.Learn more about sponsor message choices: podcastchoices.com/adchoicesNPR Privacy Policy"

In [51]:
# Component vector for row 69.
principal_components[69]

array([ 0.92447498,  0.11087223,  0.00964374, -0.43205882,  0.00540736,
       -0.20857631, -0.13287682,  0.07703931, -0.61127006,  0.49465513,
       -0.10794985, -0.22574975,  0.04410486, -0.14016755,  0.14687815,
       -0.15791277, -0.26080876,  0.44781955, -0.29916051,  0.04655094,
        0.10013883,  0.35290107,  0.30506445, -0.36628398, -0.00452934,
        0.0044764 ,  0.31202182,  0.26034715, -0.18818519, -0.40737128,
       -0.06068705, -0.52062875, -0.23136405,  0.08102843, -0.11196589,
       -0.03433976,  0.30221783,  0.1147293 ,  0.2371454 ,  0.54735898,
       -0.06110954,  0.1025821 , -0.07493729, -0.26182426,  0.16824977,
        0.25802496, -0.07735308,  0.53406352, -0.1392262 ,  0.31243761,
       -0.37457669,  0.0061083 ,  0.27153899,  0.07609294,  0.52573327,
        0.06143376,  0.26388408, -0.02501677,  0.70305915,  0.1710601 ,
       -0.47585088, -0.2777896 , -0.35731282, -0.1387121 ,  0.12622612,
       -0.55617482, -0.35518972,  0.02920096, -0.64451409,  0.16

In [38]:
# Per-component mean across all rows.
principal_components.mean(axis=0)

array([ 1.85090474e+00, -1.26929851e-01, -1.41556679e-02, -2.57917788e-01,
        1.86453341e-01, -1.70983163e-01,  6.44366572e-02, -1.15451558e-02,
       -5.48560436e-02, -1.09189907e-02, -6.47983883e-02, -3.31990473e-02,
        2.75181181e-02, -3.61555441e-02,  4.45438590e-02, -2.36750163e-02,
       -2.50230263e-02, -4.35175640e-02,  2.91503061e-02, -1.52768880e-02,
        1.54456403e-02, -2.41083124e-02, -3.48035747e-02,  2.73956493e-02,
       -7.74651463e-03,  2.58101074e-02, -3.62432233e-03,  6.60668813e-03,
        4.01567497e-02,  6.39757118e-03,  9.17174527e-03, -3.21031836e-02,
        3.94028214e-03, -2.09225994e-03, -1.11201345e-02, -2.81598239e-02,
        2.17205119e-02, -5.44656417e-02,  1.66167588e-02,  1.54294048e-02,
       -6.38646157e-02,  5.44461140e-03,  4.05712198e-03, -5.91578738e-04,
       -7.09833448e-03,  2.87582739e-03, -1.04515682e-03, -2.99701184e-02,
        4.09867103e-03, -1.53019876e-03,  1.48631019e-02, -1.22699566e-02,
        9.01931410e-03, -