The notebook is divided into three parts:

**Section 1:** Data set-up. This part builds the cleaned dataframe that is used to train the model. Keeping it as a separate step makes it easy to change the data source.

**Section 2:** Training model (with test report). This part generates the machine learning model that the last section uses to predict the category. Keeping it as a separate step makes it easy to change the model (Logistic Regression, Naive Bayes, ...) and the category that needs to be predicted.

**Section 3:** A small helper function to predict the category. This step enables testing on multiple data entries.

In [None]:
!pip install youtube_transcript_api
import json
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import pandas_gbq
import psycopg2
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn import feature_extraction, model_selection, feature_selection, pipeline, metrics
#from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def build_conn_string(env=None):
  """Assemble the PostgreSQL (libpq) connection string from environment settings.

  Credentials must not be stored in the notebook, so the password is read from
  the LIFEVIEW_DB_PASSWORD environment variable. Host, port, database and user
  can be overridden through LIFEVIEW_DB_HOST, LIFEVIEW_DB_PORT,
  LIFEVIEW_DB_NAME and LIFEVIEW_DB_USER; otherwise the previous notebook
  defaults are used.

  Args:
    env: mapping to read the settings from; defaults to os.environ.

  Returns:
    A "key=value ..." connection string accepted by psycopg2.connect. When no
    password is configured the password keyword is omitted, so libpq falls back
    to its own mechanisms (PGPASSWORD / password file).
  """
  env = os.environ if env is None else env
  parts = [
    "host=" + env.get('LIFEVIEW_DB_HOST', 'lifeview-dev.c7b1kmie3jdi.us-west-2.rds.amazonaws.com'),
    "port=" + env.get('LIFEVIEW_DB_PORT', '5432'),
    "dbname=" + env.get('LIFEVIEW_DB_NAME', 'lifeview_content'),
    "user=" + env.get('LIFEVIEW_DB_USER', 'interns'),
  ]
  password = env.get('LIFEVIEW_DB_PASSWORD')
  if password:
    # libpq requires quoting for values with spaces; backslashes and single
    # quotes inside a quoted value must be backslash-escaped.
    escaped = password.replace('\\', '\\\\').replace("'", "\\'")
    parts.append("password='" + escaped + "'")
  return " ".join(parts)

conn_string = build_conn_string()

In [None]:
# Active configuration: train/predict the audience classifier.
categories = ["Teens", "College Students", "Parents"]
entry_id = '44SXOeFWwh0L8uX32lrNwA'
classifier_type = 'Audience'

# Alternative configuration: topics classifier (uncomment to use instead).
# classifier_type = "Topics"
# categories = ['Mental Health', 'Personal Development', 'Relationships']
# entry_id = '2LQJ7KQATrqaiJGKucgXCZ'

# **Section 1: Helper Functions for Cleaning Dataframe**

Convert JSON format to interpretable data

In [None]:
#"data_convert" method
def data_convert(df):
  """Expand the JSON label column into one row per referenced label id.

  The label column is chosen from the module-level ``classifier_type``:
  'audiencetype' for 'Audience' and 'topics' for 'Topics'. Each cell is either
  None or a JSON string holding a list of link objects, from which the
  ``sys.id`` value of every link is taken.

  Args:
    df: dataframe containing the label column.

  Returns:
    A new dataframe in which the label column holds a single id per row
    (rows are repeated for entries with several ids). Entries without any
    label (None, JSON null or an empty list) get the placeholder string
    'None'. The input dataframe is left unmodified.

  Raises:
    ValueError: if ``classifier_type`` is neither 'Audience' nor 'Topics'.
  """
  column_by_type = {'Audience': 'audiencetype', 'Topics': 'topics'}
  if classifier_type not in column_by_type:
    raise ValueError("classifier_type must be 'Audience' or 'Topics', got %r" % (classifier_type,))
  column = column_by_type[classifier_type]

  id_lists = []
  for raw in df[column].values:
    links = None if raw is None else json.loads(raw)
    if not links:
      # No label at all: keep the row, marked with the 'None' placeholder
      # that clean_merge filters on later.
      id_lists.append(['None'])
    else:
      id_lists.append([link.get('sys').get('id') for link in links])

  # Explode the lists directly instead of joining/splitting on ', '.
  labels = pd.Series(id_lists, index=df.index, dtype=object)
  return df.assign(**{column: labels}).explode(column)

In [None]:
#"desc_convert" helper function
def getVal(data):
    """Concatenate every text ``value`` found in a rich-text node tree.

    Args:
        data: a node dict (with either a 'value' or a nested 'content' list),
            a list of such nodes, or anything else (treated as empty).

    Returns:
        The 'value' strings of all nodes under ``data``, joined in document
        order without a separator. Nodes with no 'value' and no usable
        'content' (missing, None or empty) contribute an empty string
        instead of raising.
    """
    if isinstance(data, dict):
        value = data.get('value')
        if value is not None:
            return value if isinstance(value, str) else str(value)
        # Container node: descend into its children.
        return getVal(data.get('content'))
    if isinstance(data, list):
        # Visit every child (not only the first) so nested structures such
        # as lists of paragraphs keep all of their text.
        return ''.join(getVal(child) for child in data)
    return ''

In [None]:
#"desc_convert" function
def desc_convert(df):
  """Replace the JSON 'description' column with its plain text, in place.

  Each cell is either None or a JSON document whose top-level 'content'
  list is flattened by calling getVal on every item and joining the results
  with single spaces. Missing descriptions become the string 'None'.

  Args:
    df: dataframe with a 'description' column; it is modified in place.

  Returns:
    The same dataframe object that was passed in.
  """
  raw_values = df.loc[:, "description"].values

  # Phase 1: parse every cell, using 'None' as the placeholder for missing ones.
  parsed_docs = ['None' if raw is None else json.loads(raw) for raw in raw_values]

  # Phase 2: flatten each parsed document to text.
  plain_text = []
  for doc in parsed_docs:
    if doc == 'None':
      plain_text.append('None')
      continue
    blocks = doc.get('content')
    plain_text.append(" ".join(getVal(block) for block in blocks))

  df['description'] = plain_text
  return df

In [None]:
# Clean the table to have data necessary for training classification models
# Then merge the topic ids to match with their actual name
def clean_merge(df, classes):
  """Keep accepted, labelled rows and swap label ids for their names.

  Rows whose 'reviewstatus' is not 'Accepted' are dropped, as are rows whose
  label column holds the placeholder 'None'. The remainder is right-merged
  with ``classes`` on the label id column (chosen from the module-level
  ``classifier_type``), the id column is dropped, and the name column coming
  from ``classes`` takes over the label column's name.

  Args:
    df: content dataframe with 'reviewstatus' and the label id column.
    classes: lookup dataframe keyed by the same label id column.

  Returns:
    The merged dataframe with the label column holding names.
  """
  accepted = df[df['reviewstatus'] == 'Accepted'].drop(columns=['reviewstatus'])

  if classifier_type == 'Audience':
    labelled = accepted[accepted['audiencetype'] != 'None']
    joined = classes.merge(labelled, on='audiencetype', how='right').drop(columns='audiencetype')
    # 'name'/'title_y' cover frames with a title column; 'name_x'/'name_y'
    # cover the applications frame, whose own 'name' column collides with
    # the one from `classes`.
    df_clean = joined.rename(columns={
        'name': 'audiencetype', 'title_y': 'title',
        'name_x': 'audiencetype', 'name_y': 'title',
    })

  if classifier_type == 'Topics':
    labelled = accepted[accepted['topics'] != 'None']
    joined = classes.merge(labelled, on='topics', how='right').drop(columns='topics')
    df_clean = joined.rename(columns={'title_x': 'topics', 'title_y': 'title'})

  return df_clean

In [None]:
def preprocess_text(text, lst_stopwords=None):
    """Normalise free text for vectorisation.

    The text is converted to ``str``, lower-cased and stripped; every
    character that is neither a word character nor whitespace is removed,
    then runs of ASCII digits are removed, and the result is split on
    whitespace.

    Args:
        text: value to clean (converted with ``str``).
        lst_stopwords: optional collection of tokens to drop; ``None`` keeps
            all tokens.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = str(text).lower().strip()
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    no_digits = re.sub('[0-9]+', '', no_punct)
    tokens = no_digits.split()

    if lst_stopwords is not None:
        # A set gives constant-time membership tests instead of scanning the
        # whole stop-word list for every token. Strings and sets are used
        # as-is so their existing ``in`` semantics are unchanged.
        if isinstance(lst_stopwords, (str, set, frozenset)):
            stopwords = lst_stopwords
        else:
            stopwords = set(lst_stopwords)
        tokens = [token for token in tokens if token not in stopwords]

    return " ".join(tokens)

# **Section 2: Function for creating model with feature selection (and testing)**

In [None]:
def feature_selection_cm(df, classifier, print_detail, random_state=None,
                         filename='finalized_model.sav'):
  """Train a TF-IDF text classifier with chi-square feature selection.

  The data is split 70/30 into train and test sets. A TF-IDF vectorizer is
  fitted on the training texts, a chi-square test is run for every class
  (one-vs-rest) and only features whose ``1 - p`` exceeds 0.85 for at least
  one class are kept as the vocabulary of the final vectorizer. The final
  vectorizer and ``classifier`` are combined into a pipeline, fitted on the
  training texts, evaluated on the test texts and pickled to ``filename``.

  Args:
    df: dataframe with a 'text' column and the label column selected by the
      module-level ``classifier_type`` ('audiencetype' or 'topics').
    classifier: unfitted scikit-learn classifier used as the last pipeline step.
    print_detail: when truthy, print accuracy and the classification report.
    random_state: optional seed forwarded to ``train_test_split`` so that the
      split (and therefore the report) is reproducible.
    filename: path the fitted pipeline is pickled to.

  Returns:
    Tuple ``(vectorizer, model)``: the fitted vectorizer step and the fitted
    pipeline.

  Raises:
    ValueError: if ``classifier_type`` is neither 'Audience' nor 'Topics'.
  """
  if classifier_type == 'Audience':
    y = df.audiencetype.values
  elif classifier_type == 'Topics':
    y = df.topics.values
  else:
    raise ValueError("classifier_type must be 'Audience' or 'Topics', got %r" % (classifier_type,))
  X = df.text.values

  text_train, text_test, y_train, y_test = model_selection.train_test_split(
      X, y, test_size=0.3, random_state=random_state)

  # First pass: TF-IDF over the full training vocabulary, used only to score
  # candidate features.
  full_vectorizer = feature_extraction.text.TfidfVectorizer()
  X_train = full_vectorizer.fit_transform(text_train)
  # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
  # on versions that predate get_feature_names_out().
  if hasattr(full_vectorizer, 'get_feature_names_out'):
    X_names = full_vectorizer.get_feature_names_out()
  else:
    X_names = full_vectorizer.get_feature_names()

  p_value_limit = 0.85
  per_class_scores = []
  for cat in np.unique(y_train):
    _, p_values = feature_selection.chi2(X_train, y_train == cat)
    per_class_scores.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p_values, "y": cat}))
  # DataFrame.append was removed in pandas 2.0; concatenate once instead.
  dtf_features = pd.concat(per_class_scores)
  dtf_features = dtf_features.sort_values(["y", "score"], ascending=[True, False])
  dtf_features = dtf_features[dtf_features["score"] > p_value_limit]
  selected_features = dtf_features["feature"].unique().tolist()

  # Second pass: the final pipeline only sees the selected vocabulary.
  model = pipeline.Pipeline([
      ("vectorizer", feature_extraction.text.TfidfVectorizer(vocabulary=selected_features)),
      ("classifier", classifier),
  ])
  model.fit(text_train, y_train)

  predicted = model.predict(text_test)
  accuracy = metrics.accuracy_score(y_test, predicted)
  if print_detail:
    print_details(accuracy, y_test, predicted)

  with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

  return model["vectorizer"], model

In [None]:
def print_details(accuracy, y_test, predicted):
  """Print the accuracy (rounded to two decimals) and the classification report.

  Args:
    accuracy: accuracy score to display.
    y_test: true labels, passed to ``metrics.classification_report``.
    predicted: predicted labels, passed to ``metrics.classification_report``.
  """
  rounded_accuracy = round(accuracy, 2)
  print("Accuracy:", rounded_accuracy)
  print("Detail:")
  report = metrics.classification_report(y_test, predicted)
  print(report)

# **Section 3 : Function for loading saved model to predict output (for single entry_id)**

In [None]:
def predict_output(df, id, model_path='finalized_model.sav'):
  """Predict the category of one entry with the previously saved model.

  Args:
    df: dataframe with 'entry_id' and 'text' columns.
    id: entry id whose text should be classified.
    model_path: path of the pickled pipeline to load.

  Returns:
    The predicted label for the first row whose 'entry_id' equals ``id``.

  Raises:
    ValueError: if no row in ``df`` has the requested entry id.
  """
  texts = df.loc[df['entry_id'] == id, 'text'].tolist()
  if not texts:
    raise ValueError("No row with entry_id %r found in the dataframe" % (id,))

  # NOTE: pickle.load can execute arbitrary code; only load model files that
  # were written by this notebook.
  with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

  # The loaded pipeline vectorizes the text itself, so no separate (global)
  # vectorizer is needed and nothing is refitted here.
  return loaded_model.predict(texts)[0]

# **Main function**

In [None]:
def at_topics_model(conn, classifier_type, entry_id, categories):
  """Load all content from PostgreSQL and build the cleaned training dataframe.

  Videos, podcasts, books and applications are read from their flat tables,
  their JSON label and description columns are converted with data_convert /
  desc_convert, label ids are replaced by names with clean_merge, and a
  preprocessed 'text' column is built from the title and description fields.

  Args:
    conn: connection string passed to ``psycopg2.connect``.
    classifier_type: 'Audience' or 'Topics'; selects the label column
      ('audiencetype' or 'topics') and the label lookup table.
    entry_id: not used by this function; kept so existing calls keep working.
    categories: label names to keep in the returned dataframe.

  Returns:
    The concatenated dataframe of all four content types, restricted to rows
    whose label is in ``categories``.

  Raises:
    ValueError: if ``classifier_type`` is neither 'Audience' nor 'Topics'.

  Note:
    data_convert and clean_merge read the module-level ``classifier_type``
    rather than this argument, so the two must hold the same value.
  """
  label_sources = {
    'Audience': (
      'audiencetype',
      "SELECT entry_id, ((audience.raw -> 'fields'::text) -> 'name'::text) ->> 'en-US'::text AS name FROM audience",
    ),
    'Topics': ('topics', "SELECT entry_id, title FROM topics_flat"),
  }
  if classifier_type not in label_sources:
    raise ValueError("classifier_type must be 'Audience' or 'Topics', got %r" % (classifier_type,))
  label_col, label_query = label_sources[classifier_type]

  # (table, title column, extra selected columns, columns appended to the title to form `text`)
  content_sources = [
    ('video_flat', 'title',
     ['description', 'shortdescription', 'youtubeid'],
     ['description', 'shortdescription']),
    ('podcast_flat', 'title',
     ['description', 'shortdescription', 'listennotesdescription'],
     ['description', 'shortdescription', 'listennotesdescription']),
    ('book_flat', 'title',
     ['description', 'goodreadsdescription'],
     ['description', 'goodreadsdescription']),
    ('application_flat', 'name',
     ['description', 'shortdescription', 'highlights'],
     ['description', 'shortdescription', 'highlights']),
  ]

  # Connect to Postgres to get the data (all are flat tables); always release
  # the connection, even when a query fails.
  connection = psycopg2.connect(conn)
  try:
    raw_frames = []
    for table, title_col, extra_cols, _ in content_sources:
      columns = ['entry_id', title_col, 'reviewstatus', label_col] + extra_cols
      query = 'SELECT ' + ', '.join(columns) + ' FROM ' + table
      raw_frames.append(pd.read_sql_query(query, con=connection))
    labels = pd.read_sql_query(label_query, con=connection)
  finally:
    connection.close()
  labels = labels.rename(columns={'entry_id': label_col})

  lst_stopwords = nltk.corpus.stopwords.words("english")
  frames = []
  for frame, (table, title_col, extra_cols, text_cols) in zip(raw_frames, content_sources):
    if title_col != 'title':
      # Applications store their title in 'name'; align it with the others.
      frame = frame.rename(columns={title_col: 'title'})

    # Convert JSON format to interpretable data
    frame = data_convert(frame)
    frame = desc_convert(frame)

    # Clean the dataframe and replace label ids by label names
    frame = clean_merge(frame, labels)

    # Build and preprocess the training text; every field is separated by a
    # space so words from adjacent fields are never fused together.
    frame = frame.fillna(value='None')
    text = frame['title']
    for col in text_cols:
      text = text + ' ' + frame[col]
    frame['text'] = [preprocess_text(value, lst_stopwords=lst_stopwords) for value in text]
    frames.append(frame)

  df_clean_all = pd.concat(frames)

  # Keep only the categories we want for training
  return df_clean_all.loc[df_clean_all[label_col].isin(categories)]

# **Testing Functions/Implementation**

In [None]:
# Build the cleaned training dataframe. at_topics_model takes:
#   - the PostgreSQL connection string,
#   - which classification model to train ('Audience' or 'Topics'),
#   - the entry_id of the data entry to predict later (e.g. '2LQJ7KQATrqaiJGKucgXCZ'),
#   - the list of categories/classes to train on (e.g. ['Depression', 'Anxiety']),
# and returns the clean dataframe restricted to those categories.

df = at_topics_model(conn_string, classifier_type, entry_id, categories)

In [None]:
# Trains the classifier, prints the test report (print_detail=True) and
# pickles the fitted pipeline to 'finalized_model.sav'.
vectorizer, model = feature_selection_cm(df, classifier = LogisticRegression(warm_start=True), print_detail = True)

Accuracy: 0.75
Detail:
                  precision    recall  f1-score   support

College Students       0.29      0.06      0.10        33
         Parents       0.87      0.88      0.87       114
           Teens       0.65      0.84      0.74        88

        accuracy                           0.75       235
       macro avg       0.60      0.59      0.57       235
    weighted avg       0.71      0.75      0.71       235



In [None]:
# Loads the saved model and predicts the category for the configured entry_id;
# the bare `output` expression displays the predicted label.
output = predict_output(df, entry_id)
output

'College Students'