<a href="https://colab.research.google.com/github/zbutton314/CS-5560/blob/main/code/ICP_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 6.0 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=9e844f0300601f695cee0b912dd4d2bc4da6c4e6e8bb169f5926d67c67714487
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1


In [42]:
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
import time
import gensim
import pyLDAvis
import pyLDAvis.gensim_models
#import pyLDAvis.gensim
from gensim.models import CoherenceModel
from google.colab import drive
import warnings
from IPython.display import display
import numpy as np

drive.mount('/content/drive')
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation

In [19]:
def import_data(path):
  reviews_df = pd.read_csv(path, error_bad_lines=False)
  return reviews_df

In [7]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return (text)

In [9]:
def remove_stop_words(text):
  stop_words = stopwords.words('english')
  # adding some more stop words that doesn't convey much meaning in terms of reviews feel free to extend or reduce this list
  stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 
                   'good', 'go', 'get', 'do','took','time','year',
                 'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather',
                  'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 
                   'even', 'right', 'line','even', 'also', 'may', 'take', 'come',
                   'new','said', 'like','people'])
  return [word for word in text if word not in stop_words]

In [10]:
def stem_words(text):
    """
    Function to stem words
    """
    stemmer = PorterStemmer()
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # no single letter words
    except IndexError:
        pass

    return text

In [11]:
def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))

In [22]:
def prepare_data(path):
  # Import and clean data
  reviews_df = import_data(path)
  reviews_df['tokenized_reviews'] = reviews_df['Reviews'].apply(apply_all)

  # Prepare the dictionary/corpus
  tokenized = reviews_df['tokenized_reviews']
  dictionary = corpora.Dictionary(tokenized)
  dictionary.filter_extremes(no_below=1, no_above=0.8)
  corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

  return tokenized, dictionary, corpus


In [60]:
icp6_data_path = "/content/drive/MyDrive/data/ICP-6/reviews.csv"
tokenized, dictionary, corpus = prepare_data(icp6_data_path)

print(corpus[:1])
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1)]]
[[('big', 1), ('comfort', 1), ('definit', 1), ('instead', 1), ('kindl', 1), ('palm', 1), ('paper', 1), ('paperwhit', 1), ('read', 1), ('recommend', 1), ('regular', 1), ('small', 2), ('thought', 1), ('turn', 1)]]


# Modeling and Visualization Functions

In [46]:
def build_model(tokenized, dictionary, corpus, **params):
  # Build and save model
  ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, **params)
  ldamodel.save('model_combined.gensim')

  # Calculate coherence
  coherence_lda = []
  for measure in ["c_v", "c_uci", "u_mass", "c_npmi"]:
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=tokenized, dictionary=dictionary, coherence=measure)
    coherence_lda.append(coherence_model_lda.get_coherence())

  coherence_avg = np.mean(coherence_lda)

  return coherence_avg

In [31]:
def run_lda_viz(model_path, dictionary, corpus):
  lda_viz = gensim.models.ldamodel.LdaModel.load('/content/model_combined.gensim')
  lda_display = pyLDAvis.gensim_models.prepare(lda_viz, corpus, dictionary, sort_topics=True)
  
  return lda_display

# Tuning Hyperparameters via Grid Search

In [59]:
t1 = time.time()
coherence_avg_list = []
i = 1
for num_topics in [3, 7, 11, 15]:
  for passes in [1, 5, 10, 15]:
    for decay in [0.5, 0.67, 0.83, 1.0]:
      params = {
          "alpha": "auto",
          "eta": "auto",
          "num_topics": num_topics,
          "passes": passes,
          "decay": decay
      }
      coherence_avg = build_model(tokenized, dictionary, corpus, **params)
      print(f"PARAMETER SETTING {i}: (num_topics, passes, decay) = ({num_topics}, {passes}, {decay})")
      print(f"-- Avg Coherence Score: {coherence_avg}")

      coherence_avg_list.append(coherence_avg)
      i += 1

      # # This will display the viz within the loop, but it's not ideal for 64 hyperparameter settings
      # # The viz can be evaluated for the winning hyperparameter combination below
      # model_path = "/content/model_combined.gensim"
      # lda_display = run_lda_viz(model_path, dictionary, corpus)
      # display(pyLDAvis.display(lda_display))

print(f"\n\n TOTAL TIME: {time.time() - t1}")
print(coherence_avg_list)

PARAMETER SETTING 1: (num_topics, passes, decay) = (3, 1, 0.5)
-- Avg Coherence Score: -0.5352789331185692
PARAMETER SETTING 2: (num_topics, passes, decay) = (3, 1, 0.67)
-- Avg Coherence Score: -0.5201890329948292
PARAMETER SETTING 3: (num_topics, passes, decay) = (3, 1, 0.83)
-- Avg Coherence Score: -0.5633940528955523
PARAMETER SETTING 4: (num_topics, passes, decay) = (3, 1, 1.0)
-- Avg Coherence Score: -0.5315765616955025
PARAMETER SETTING 5: (num_topics, passes, decay) = (3, 5, 0.5)
-- Avg Coherence Score: -0.5245060190580774
PARAMETER SETTING 6: (num_topics, passes, decay) = (3, 5, 0.67)
-- Avg Coherence Score: -0.5791362249857772
PARAMETER SETTING 7: (num_topics, passes, decay) = (3, 5, 0.83)
-- Avg Coherence Score: -0.4840571419837106
PARAMETER SETTING 8: (num_topics, passes, decay) = (3, 5, 1.0)
-- Avg Coherence Score: -0.520224364597238
PARAMETER SETTING 9: (num_topics, passes, decay) = (3, 10, 0.5)
-- Avg Coherence Score: -0.4810832098731124
PARAMETER SETTING 10: (num_topics

# Find and Output Best Results

In [62]:
m = np.max(coherence_avg_list)
(coherence_avg_list.index(m)+1, m)

(14, -0.4562766293979504)

In [63]:
def run_best_model(tokenized, dictionary, corpus, num_topics, passes, decay):
  params = {
          "alpha": "auto",
          "eta": "auto",
          "num_topics": num_topics,
          "passes": passes,
          "decay": decay
      }
  coherence_score = build_model(tokenized, dictionary, corpus, **params)
  print(f"PARAMETER SETTING: (num_topics, passes, decay) = ({num_topics}, {passes}, {decay})")
  print(f"-- Coherence Score: {coherence_score}")

  model_path = "/content/model_combined.gensim"
  lda_display = run_lda_viz(model_path, dictionary, corpus)
  display(pyLDAvis.display(lda_display))

In [64]:
num_topics = 3
passes = 15
decay = 0.67

run_best_model(tokenized, dictionary, corpus, num_topics, passes, decay)

PARAMETER SETTING: (num_topics, passes, decay) = (3, 15, 0.67)
-- Coherence Score: -0.4782219605768235
