In [1]:
# Imports. Nothing to see here.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string

from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
import random

import sqlite3
from textblob import TextBlob

In [None]:
# Tables within the database: ['categories', 'podcasts', 'reviews', 'runs']

# Raw string literal: the Windows path contains backslash sequences (\F, \C, \d)
# that are not valid escapes and raise a DeprecationWarning/SyntaxWarning in a
# normal string. The resulting path value is unchanged.
# NOTE(review): hardcoded absolute local path — consider a configurable data directory.
cnx = sqlite3.connect(r'D:\Flatiron\Capstone\database.sqlite')

# Load each table in full into its own DataFrame.
podcast = pd.read_sql_query("SELECT * FROM podcasts", cnx)

categories = pd.read_sql_query("SELECT * FROM categories", cnx)

runs = pd.read_sql_query("SELECT * FROM runs", cnx)

reviews = pd.read_sql_query("SELECT * FROM reviews", cnx)

In [None]:
# Average review rating per podcast, sorted from lowest to highest average.
# The query joins podcasts to both categories and reviews on podcast_id and
# groups by podcast_id; no category column is selected.
explore = pd.read_sql_query('''SELECT p.podcast_id, AVG(rating)
                            FROM podcasts p 
                            JOIN categories c 
                            ON p.podcast_id = c.podcast_id 
                            JOIN reviews r 
                            ON p.podcast_id = r.podcast_id 
                            GROUP BY p.podcast_id
                            ORDER BY AVG(rating)
                            ''', cnx)
explore


In [None]:
# Working frame: podcasts joined to their categories and reviews on podcast_id.
# Because both joins are on podcast_id, a podcast listed under several
# categories yields one row per (category, review) pair.
data = pd.read_sql_query("SELECT * FROM podcasts p JOIN categories c USING (podcast_id) JOIN reviews r USING (podcast_id)", cnx)

In [None]:
# Display the joined frame.
data

In [None]:
# plt.hist(data['podcast_id'])
# plt.show()

In [None]:
# Print the (rows, columns) of the joined frame, then show how many distinct
# podcast_id values it contains.
print(data.shape)
data['podcast_id'].nunique()

In [None]:
# Remove the URL, id, timestamp and title columns from `data` in place.
# NOTE: re-running this cell without reloading `data` raises KeyError because
# the columns are already gone.
data.drop(
    columns=['itunes_url', 'itunes_id', 'created_at', 'title', 'podcast_id'],
    inplace=True,
)

In [None]:
# Print the number of missing values in each column.
print(data.isna().sum())

In [None]:
# Drop fully duplicated rows, keeping one copy of each, and report how many
# rows were removed.

a = data.shape[0]  # row count before de-duplication
data = data.drop_duplicates()
b = data.shape[0]  # row count after de-duplication
print('# Number of duplicate rows dropped:', a - b)

In [None]:
# Column names, dtypes and non-null counts after de-duplication.
data.info()

In [None]:
# (rows, columns) of the current frame.
data.shape

In [None]:
# Distinct category labels present in the frame.
data['category'].unique()

In [None]:
# Tokenizer that emits every run of word characters (\w+) as one token.
tokenizer = RegexpTokenizer(r'\w+')

# Stop list: NLTK's English stop words followed by each character of
# string.punctuation.
stops = stopwords.words('english') + list(string.punctuation)

In [None]:
# Display the frame before the text columns are cleaned.
data

In [None]:
# Slug: turn hyphens into spaces.
data['slug'] = data['slug'].apply(lambda slug: slug.replace('-', ' '))
# Category: turn hyphens into spaces, then keep only the first word.
data['category'] = data['category'].apply(lambda cat: cat.replace('-', ' ').split()[0])
data

In [None]:
# data['all'] = data['slug'] + ' ' + data['content']

In [None]:
# Split each review's text into word tokens.
# NOTE: this cell is not re-runnable as-is — a second run would hand lists,
# not strings, to the tokenizer.
data['content'] = data['content'].apply(tokenizer.tokenize)

# Lowercase every token and drop stop words. Membership is tested against a
# frozenset built once from `stops` (constant-time lookups instead of scanning
# the list for every token), and each token is lowercased only once. `stops`
# itself is left as a list for any other code that uses it.
stop_lookup = frozenset(stops)
data['content'] = data['content'].apply(
    lambda tokens: [word for word in map(str.lower, tokens) if word not in stop_lookup]
)

In [None]:
# data.drop(['slug', 'content'], axis = 1, inplace=True)

In [None]:
# Display the frame after tokenizing `content`.
data

In [None]:
# Number of distinct podcast ids in the raw `podcasts` table.
podcast['podcast_id'].nunique()

In [None]:
# TODO: visualize this distribution.

# Number of rows for each rating value.
data['rating'].value_counts()

# Functions

In [None]:
def tf_idf(X_train, X_test):
    """Convert train and test text into TF-IDF matrices.

    A fresh ``TfidfVectorizer`` is fitted on ``X_train`` only and then applied
    to ``X_test``.

    Parameters
    ----------
    X_train : text documents used to fit the vectorizer and to build the
        training matrix.
    X_test : text documents transformed with the already-fitted vectorizer.

    Returns
    -------
    tuple
        ``(test_matrix, train_matrix)``. Note the order: the TEST matrix comes
        first, the reverse of the parameter order.
    """
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    return test_matrix, train_matrix

In [None]:
# Lemmatizing is optional. Two functions: lemmatize_text handles one token
# list, and lemmatize() calls it for every row of the frame.
def lemmatize_text(text):
    """Return a new list holding the WordNet lemma of each token in ``text``.

    ``text`` is iterated token by token; each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, text))

def lemmatize():
    """Add a ``lemm`` column to the global ``data`` frame.

    Each row's ``content`` token list is lemmatized with ``lemmatize_text`` and
    the resulting lemmas are joined into one space-separated string.
    Returns nothing; ``data`` is modified in place.
    """
    data['lemm'] = data['content'].apply(
        lambda tokens: ' '.join(lemmatize_text(tokens))
    )

In [None]:
# SMOTE either will or will not run.
def smote(X_train_counts, y_train, random_state=None):
    """Oversample the training set with SMOTE and return the resampled data.

    Parameters
    ----------
    X_train_counts : training feature matrix.
    y_train : training labels aligned with ``X_train_counts``.
    random_state : optional seed forwarded to ``SMOTE`` so the synthetic
        samples are reproducible. ``None`` (default) keeps the unseeded
        behavior.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.
    """
    # Distinct local name so the sampler does not shadow this function.
    sampler = SMOTE(random_state=random_state)
    # fit_resample is the current API; fit_sample was deprecated and later
    # removed from imbalanced-learn.
    X_train_counts, y_train = sampler.fit_resample(X_train_counts, y_train)
    return X_train_counts, y_train

# Train Test Split. The col variable is important and will be different depending on whether we lemmatize.
def TTS(col, random_state=None):
    """Split the global ``data`` frame into train and test sets.

    Parameters
    ----------
    col : name of the ``data`` column to use as the feature.
    random_state : optional seed forwarded to ``train_test_split`` so the split
        is reproducible. ``None`` (default) keeps the unseeded behavior.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the target is
        ``data['feelings']``.

    NOTE(review): the ``feelings`` column is not created in the cells visible
    above — confirm it exists before calling.
    """
    return train_test_split(data[col], data['feelings'], random_state=random_state)

In [None]:
# Build the `lemm` column on `data` (lemmatized tokens joined into one string per row).
lemmatize()

# TextBlob

In [None]:
# # Initializing the TextBlob sentiment for subj/polar(+/-)
# polarity = (lambda x: TextBlob(x).sentiment.polarity)
# subjectivity = (lambda x: TextBlob(x).sentiment.subjectivity)
# # Applying to dataframe column with cleaned/tokenized text
# data['polarity'] = data['lemm'].apply(polarity)
# data['subjectivity'] = data['lemm'].apply(subjectivity)


In [None]:
# # Check the output
# data[['polarity', 'subjectivity']][0:5]

In [None]:
# plt.bar(data.shape[0],data['polarity'])
# plt.show()

In [None]:
# # Returned as numbers on spectrum - polar is -1 to 1 scale
# # Subjectivity is 0 to 1
# # Bin into positive, negative, and neutral sentiments
# # Neutral is between -0.05 to 0.05 - this can be adjusted
# def get_sentiment(row):
#         if row > 0.05:
#             return 'Positive'
#         elif row < -0.05:
#             return 'Negative'
#         else:
#             return 'Neutral'
# data['polarity_label'] = data['polarity'].apply(get_sentiment)

In [None]:
# data.polarity_label.value_counts().plot(kind='bar', color='gold', alpha=0.6, figsize = (14, 8))
# plt.show()

In [None]:
# Fit a TF-IDF vectorizer on the entire `lemm` column (no train/test split
# here) and keep the resulting document-term matrix in `tf`.
tfidf = TfidfVectorizer()
tf = tfidf.fit_transform(data['lemm'])
tf

In [33]:
# Fit a random forest with default settings on the TF-IDF matrix, using the
# `rating` column as the target.
rf = RandomForestClassifier()
rf.fit(tf, data['rating'])
# NOTE(review): predictions are made on the same matrix the model was fitted
# on, so any metric computed from them reflects training fit, not
# performance on unseen data.
y_predicted_counts = rf.predict(tf)