In [1]:
# Imports. Nothing to see here.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string

from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
import random

import sqlite3

In [2]:
# databases within ['categories', 'podcasts', 'reviews', 'runs']

cnx = sqlite3.connect('D:\Flatiron\Capstone\database.sqlite')

podcast = pd.read_sql_query("SELECT * FROM podcasts", cnx)

categories = pd.read_sql_query("SELECT * FROM categories", cnx)

runs = pd.read_sql_query("SELECT * FROM runs", cnx)

reviews = pd.read_sql_query("SELECT * FROM reviews", cnx)

In [3]:
explore = pd.read_sql_query('''SELECT p.podcast_id, AVG(rating) 
                            FROM podcasts p 
                            JOIN categories c 
                            ON p.podcast_id = c.podcast_id 
                            JOIN reviews r 
                            ON p.podcast_id = r.podcast_id 
                            GROUP BY p.podcast_id
                            ORDER BY rating
                            ''', cnx)
explore


Unnamed: 0,podcast_id,AVG(rating)
0,a0004b1ef445af9dc84dad1e7821b1e3,1.000000
1,a00c300fce2e20fe832f5f5e6148987c,4.444444
2,a00cb09b7a0e02c88a2c26ab08236296,4.200000
3,a00d083d9ad46e4a014a9903c046d544,4.925170
4,a00e38f15aac723b046dcbffc21aca27,4.710526
...,...,...
46660,fffdfb5b49d0d47943e09f6213a346e5,5.000000
46661,fffe308414050768d3ce3782aa503b7d,5.000000
46662,fffe3f208a56dfecfaf6d0a7f8399d63,5.000000
46663,ffff66f98c1adfc8d0d6c41bb8facfd0,5.000000


In [4]:
data = pd.read_sql_query("SELECT * FROM podcasts p JOIN categories c ON p.podcast_id = c.podcast_id JOIN reviews r ON p.podcast_id = r.podcast_id", cnx)

In [5]:
data.drop(['itunes_url','itunes_id', 'created_at', 'title', 'podcast_id'], axis=1, inplace=True)

In [6]:
print(data.isna().sum())

slug        0
category    0
content     0
rating      0
dtype: int64


In [7]:
# Check for duplicated rows and preserve unique entries.

a = len(data)
data = data.drop_duplicates()
b = len(data)
print('# Number of duplicate rows dropped: {}'.format(a-b))

# Number of duplicate rows dropped: 2968


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1422197 entries, 0 to 1425164
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   slug      1422197 non-null  object
 1   category  1422197 non-null  object
 2   content   1422197 non-null  object
 3   rating    1422197 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 54.3+ MB


In [9]:
data.shape

(1422197, 4)

In [10]:
data['category'].unique()

array(['arts', 'arts-performing-arts', 'music', 'arts-design',
       'education', 'society-culture', 'arts-visual-arts', 'technology',
       'arts-food', 'society-culture-personal-journals', 'comedy',
       'arts-fashion-beauty', 'tv-film', 'society-culture-places-travel',
       'kids-family', 'religion-spirituality', 'business',
       'society-culture-philosophy', 'spirituality', 'business-careers',
       'christianity', 'hinduism', 'business-investing', 'judaism',
       'islam', 'buddhism'], dtype=object)

In [11]:
tokenizer = RegexpTokenizer(r'\w+')

stops = stopwords.words('english')
stops += list(string.punctuation)

In [12]:
data

Unnamed: 0,slug,category,content,rating
0,backstage-at-tilles-center,arts,Thanks for providing these insights. Really e...,5
1,backstage-at-tilles-center,arts-performing-arts,Thanks for providing these insights. Really e...,5
2,backstage-at-tilles-center,music,Thanks for providing these insights. Really e...,5
3,backstage-at-tilles-center,arts,Super excited to see this podcast grow. So man...,5
4,backstage-at-tilles-center,arts-performing-arts,Super excited to see this podcast grow. So man...,5
...,...,...,...,...
1425160,what-if-world-stories-for-kids,kids-family,I made this so all you guys that listen to thi...,5
1425161,inbox-besties-w-kate-doster-email-marketing-po...,business,I just discovered Kate Doster and the hype is ...,5
1425162,the-trypod,comedy,This podcast as well as the “you can sit with ...,1
1425163,the-trypod,comedy,Really crappy ep 2/4. I guess they can’t all b...,1


In [13]:
data ['slug'] = data['slug'].apply(lambda x: x.replace('-',' '))
data ['category'] = data['category'].apply(lambda x: x.replace('-',' '))
data ['category'] = data['category'].apply(lambda x: x.split()[0])
data

Unnamed: 0,slug,category,content,rating
0,backstage at tilles center,arts,Thanks for providing these insights. Really e...,5
1,backstage at tilles center,arts,Thanks for providing these insights. Really e...,5
2,backstage at tilles center,music,Thanks for providing these insights. Really e...,5
3,backstage at tilles center,arts,Super excited to see this podcast grow. So man...,5
4,backstage at tilles center,arts,Super excited to see this podcast grow. So man...,5
...,...,...,...,...
1425160,what if world stories for kids,kids,I made this so all you guys that listen to thi...,5
1425161,inbox besties w kate doster email marketing po...,business,I just discovered Kate Doster and the hype is ...,5
1425162,the trypod,comedy,This podcast as well as the “you can sit with ...,1
1425163,the trypod,comedy,Really crappy ep 2/4. I guess they can’t all b...,1


In [14]:
data['all'] = data['slug'] + ' ' + data['content']

In [15]:
data['all'] = data['all'].apply(tokenizer.tokenize)

data['all'] = data['all'].apply(lambda x: [word.lower() for word in x if word.lower() not in stops])

In [16]:
data.drop(['slug', 'content'], axis = 1, inplace=True)

In [17]:
data

Unnamed: 0,category,rating,all
0,arts,5,"[backstage, tilles, center, thanks, providing,..."
1,arts,5,"[backstage, tilles, center, thanks, providing,..."
2,music,5,"[backstage, tilles, center, thanks, providing,..."
3,arts,5,"[backstage, tilles, center, super, excited, se..."
4,arts,5,"[backstage, tilles, center, super, excited, se..."
...,...,...,...
1425160,kids,5,"[world, stories, kids, made, guys, listen, pod..."
1425161,business,5,"[inbox, besties, w, kate, doster, email, marke..."
1425162,comedy,1,"[trypod, podcast, well, sit, us, podcast, turn..."
1425163,comedy,1,"[trypod, really, crappy, ep, 2, 4, guess, winn..."


In [18]:
podcast['podcast_id'].nunique()

46665

In [19]:
############################ Visualize this ##############################

data['rating'].value_counts()

5    1265760
1      60381
4      41631
3      29006
2      25419
Name: rating, dtype: int64

# Functions

In [20]:
# Count Vectorize or TF-IDF. Our first and most vital choice.
def CV(X_train, X_test):
    count_vectorizer = CountVectorizer()
    X_train_counts = count_vectorizer.fit_transform(X_train)
    X_test_counts = count_vectorizer.transform(X_test)
    return X_train_counts, X_test_counts

def tf_idf(X_train, X_test):
    tfidf = TfidfVectorizer()
    X_train_counts = tfidf.fit_transform(X_train)
    X_test_counts = tfidf.transform(X_test)
    return X_test_counts, X_train_counts


In [21]:
# Lemmatizing will or won't happen. Two functions, one nested inside the other.
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

def lemmatize():
    data['lemm'] = data['tokens'].apply(lemmatize_text)
    data['lemm'] = data['lemm'].apply(lambda x: ' '.join(x))

In [22]:
# SMOTE either will or will not run.
def smote(X_train_counts, y_train):
    smote = SMOTE()
    X_train_counts, y_train = smote.fit_sample(X_train_counts, y_train)
    return X_train_counts, y_train

# Train Test Split. The col variable is important and will be different depending on whether we lemmatize.
def TTS(col):
    X_train, X_test, y_train, y_test = train_test_split(data[col], data['feelings'])
    return X_train, X_test, y_train, y_test

In [23]:
# SMOTE either will or will not run.
def smote(X_train_counts, y_train):
    smote = SMOTE()
    X_train_counts, y_train = smote.fit_sample(X_train_counts, y_train)
    return X_train_counts, y_train

# Train Test Split. The col variable is important and will be different depending on whether we lemmatize.
def TTS(col):
    X_train, X_test, y_train, y_test = train_test_split(data[col], data['feelings'])
    return X_train, X_test, y_train, y_test

# Back to work

In [24]:
podcast

Unnamed: 0,podcast_id,itunes_id,slug,itunes_url,title
0,a00018b54eb342567c94dacfb2a3e504,1313466221,scaling-global,https://podcasts.apple.com/us/podcast/scaling-...,Scaling Global
1,a00043d34e734b09246d17dc5d56f63c,158973461,cornerstone-baptist-church-of-orlando,https://podcasts.apple.com/us/podcast/cornerst...,Cornerstone Baptist Church of Orlando
2,a0004b1ef445af9dc84dad1e7821b1e3,139076942,mystery-dancing-in-the-dark,https://podcasts.apple.com/us/podcast/mystery-...,Mystery: Dancing in the Dark
3,a00071f9aaae9ac725c3a586701abf4d,1332508972,kts-money-matters,https://podcasts.apple.com/us/podcast/kts-mone...,KTs Money Matters
4,a000aa69852b276565c4f5eb9cdd999b,1342447811,speedway-soccer,https://podcasts.apple.com/us/podcast/speedway...,Speedway Soccer
...,...,...,...,...,...
46660,fffe3f208a56dfecfaf6d0a7f8399d63,1420703219,how-travel-writers-self-publish,https://podcasts.apple.com/us/podcast/how-trav...,How Travel Writers Self-Publish
46661,fffeb7d6d05f2b4c600fbebc828ca656,1220681898,teddy-the-empress-cooking-the-queens,https://podcasts.apple.com/us/podcast/teddy-th...,TEDDY & THE EMPRESS: Cooking the Queens
46662,ffff5db4b5db2d860c49749e5de8a36d,384521934,frankenstein-or-the-modern-prometheus,https://podcasts.apple.com/us/podcast/frankens...,"Frankenstein, or the Modern Prometheus"
46663,ffff66f98c1adfc8d0d6c41bb8facfd0,1435731839,whos-bringing-wine,https://podcasts.apple.com/us/podcast/whos-bri...,Who’s Bringing Wine?
