In [None]:
############################## Required Packages ##############################
import pandas as pd
import numpy as np
import re
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)
from collections import Counter
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Read the product reviews CSV, drop the metadata columns and keep a random
# 100-row sample so the rest of the notebook runs quickly.
data = pd.read_csv('reviews.csv',low_memory = False)
data = data.drop(columns=['id', 'name', 'asins', 'brand','categories', 'keys', 'manufacturer'], axis=1).sample(100)

# Remove basic punctuation. The assignment target used to be commented out, so
# the cleaned Series was computed and immediately discarded; it is stored now.
# A raw string avoids the invalid '\.' escape warning.
data['Text'] = data['Text'].map(lambda x: re.sub(r'[,\.!?]', '', x))
# Convert the review text to lowercase
data['Text'] = data['Text'].map(lambda x: x.lower())
# data['Text'].head())

In [None]:
# Function to strip e-mail addresses, extra whitespace and single quotes, then tokenize
def sent_to_words(sentences):
    """Yield one list of tokens per input sentence.

    Args:
        sentences: iterable of strings (each item is passed to ``re.sub``,
            so non-string items raise ``TypeError``).

    Yields:
        list: tokens produced by ``gensim.utils.simple_preprocess`` for the
        cleaned sentence (``deacc=True`` strips accent marks).
    """
    for sent in sentences:
        # Raw strings: '\S' and '\s' are invalid escapes in a normal literal
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace runs, incl. newlines
        sent = re.sub("'", "", sent)  # remove single quotes
        yield gensim.utils.simple_preprocess(str(sent), deacc=True)

# Function to remove stopwords
def remove_stopwords(texts):
    """Return ``texts`` with every token found in the global ``stop_words`` removed.

    Args:
        texts: iterable of documents; each document is converted with
            ``str()`` and re-tokenized by ``simple_preprocess``.

    Returns:
        list[list[str]]: one filtered token list per document.
    """
    # Build the set once: O(1) membership instead of scanning the list per token
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


# Convert the 'Text' column to a plain list of strings.
# NOTE: from this point on `data` is a list, no longer a DataFrame.
data = data.Text.values.tolist()
# Tokenize every review: one list of tokens per review
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
# print(data_words[:1][0][:30])

# Build the bigram and trigram phrase models (e.g. cute_dog, little_cute_dog).
# A higher threshold yields fewer detected phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed token lists
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser objects are what process_words() applies to each token list
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Function to process words by removing stopwords, creating bigrams and trigrams, and lemmatizing
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Clean tokenized documents for topic modelling.

    Steps: drop stop words, apply the module-level ``bigram_mod`` and
    ``trigram_mod`` phrase models, lemmatize with spaCy keeping only tokens
    whose part-of-speech tag is in ``allowed_postags``, then drop stop words
    again (lemmatization can produce new stop words).

    Args:
        texts: iterable of documents (token lists); each is passed through
            ``str()`` and ``simple_preprocess``.
        stop_words: collection of words to remove.
        allowed_postags: spaCy coarse POS tags to keep (not modified here).

    Returns:
        list[list[str]]: processed tokens, one list per document.
    """
    # Build the set once: both filtering passes get O(1) membership tests
    blocked = set(stop_words)
    texts = [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # NOTE(review): bigram_mod is applied a second time here, on tokens that
    # were already bigram-merged above; kept exactly as in the original.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords again after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts_out]
    return texts_out

# Processed text data: stop words removed, phrases merged, lemmatized
data_ready = process_words(data_words) 

# Create Dictionary (token <-> integer id mapping)
id2word = corpora.Dictionary(data_ready)

# Create Corpus: bag-of-words (token_id, count) pairs for each document
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build the LDA model with 10 topics.
# random_state is fixed so repeated runs on the same corpus are comparable;
# per_word_topics=True changes the shape of lda_model[corpus] results, which
# format_topics_sentences() accounts for.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

# Print the keywords and weights of every topic
pprint(lda_model.print_topics())

In [None]:
# Function to extract each document's dominant topic, its keywords and its percentage contribution
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: trained gensim LDA model; must support ``ldamodel[corpus]``,
            ``per_word_topics`` and ``show_topic()``.
        corpus: bag-of-words corpus the model is applied to.
        texts: per-document contents, appended as the last column.

    Returns:
        pandas.DataFrame: columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords`` and the texts (column
        name ``0``), one row per document.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    doc_ids = []
    keywords_by_topic = {}  # show_topic() result per topic, computed once

    for doc_id, row_list in enumerate(ldamodel[corpus]):
        # With per_word_topics the model returns a tuple whose first item is
        # the document's (topic, probability) list
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic reported for this document; its row is left as NaN
            continue
        # max() returns the first maximal pair, the same element the original
        # stable descending sort placed first
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append([topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]])
        doc_ids.append(doc_id)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call. Indexing by document number keeps
    # each row aligned with its text even when a document was skipped.
    sent_topics_df = pd.DataFrame(records, columns=columns, index=doc_ids)
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

# Dominant-topic table for the processed documents
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
# reset_index() moves the row index into a leading column, named 'Document_No' below
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
print(df_dominant_topic.head(10))
# Length of each document's 'Text' entry (the token lists passed in as data_ready)
doc_lens = [len(d) for d in df_dominant_topic.Text]

In [None]:
# Show, for each topic, the single document that best represents it
# (the row with the highest Perc_Contribution within that topic's group)
# NOTE(review): doc_lens repeats the assignment at the end of the previous cell
doc_lens = [len(d) for d in df_dominant_topic.Text]
pd.options.display.max_colwidth = 100

# Accumulator for the top row of every topic group
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For each topic group keep only its highest-contribution row
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0) 
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
print(sent_topics_sorteddf_mallet.head(10))

# Gather, for every topic keyword, its weight in the topic and its corpus-wide count
topics = lda_model.show_topics(formatted=False)

# Flatten all processed documents into one token list and count occurrences
data_flat = []
for w_list in data_ready:
    data_flat.extend(w_list)
counter = Counter(data_flat)

# One row per (keyword, topic): [word, topic id, weight, corpus count]
out = [[word, i, weight, counter[word]]
       for i, topic in topics
       for word, weight in topic]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

In [None]:
# Plot Word Count and Weights of Topic Keywords
# A 2x2 grid is created, so only topic ids 0-3 are drawn.
fig, axes = plt.subplots(2,2, figsize=(12,12), sharey=True, dpi=80)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    # Wide translucent bars: corpus word counts on the left axis
    ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # Narrow bars on a twin y-axis: keyword weight within the topic
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    # Fixed axis limits: bars exceeding them are cut off at the top
    ax_twin.set_ylim(0, 0.050); ax.set_ylim(0, 100)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=45, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)    
# y=1.05 places the overall title above the top row of subplots
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)    
plt.show()

In [None]:
# To plot word clouds of Top 10 keywords in each topic
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# color_func reads the module-level loop variable `i` when it is called, so
# each cloud is drawn in the colour of the topic currently being plotted in
# the loop below. The cloud object is reused for every topic.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

# 3x3 grid: topics with index 0-8 are drawn
fig, axes = plt.subplots(3, 3, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    # topics[i] is (topic_id, [(word, weight), ...]); the dict maps word -> weight
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# Analyze the LDA model results interactively with the pyLDAvis package
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression so the notebook displays the prepared visualization
p


In [None]:
# Evaluate scikit-learn LDA models over a parameter grid and report the best
# parameters, the best log-likelihood score and the model perplexity
#Vectorize data
# `data` is the list of review strings created earlier in this notebook section
vectorizer = CountVectorizer(analyzer='word',lowercase=True)
data_vectorized = vectorizer.fit_transform(data)

#Candidate topic counts and learning decays for the LDA model
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .6, .7, .8, .9]}
lda = LatentDirichletAllocation(learning_method='online', learning_offset=10.0, random_state=0)
 
#Use GridSearchCV to find the best parameter combination
#according to Log Likelihood and Model Perplexity
# No `scoring` argument is given, so the estimator's own score() ranks the candidates
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)
best_lda_model = model.best_estimator_

print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity is computed on the same data the search was fitted on
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))


In [None]:
#####################################################################################################
#LDA model using Food Reviews Dataset
#####################################################################################################
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
import gensim.corpora as corpora

#Get stopword list from file
def stopwords():
    """Read ``stopwords.txt`` and return its words in lowercase.

    The file is opened relative to the current working directory; every
    whitespace-separated word on every line becomes one list entry.

    Returns:
        list[str]: lowercased words in file order (duplicates are kept).
    """
    with open("stopwords.txt",'r') as handle:
        return [token.lower() for row in handle for token in row.split()]

# Load the stop-word list from stopwords.txt
stop_words = stopwords()
# Also treat 'br' as a stop word - presumably a leftover of HTML <br> tags in
# the review text; verify against the dataset.
stop_words.extend(['br'])

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of items; each is converted with ``str()`` first.

    Yields:
        list: the tokens of one sentence (``deacc=True`` strips accent marks).
    """
    tokenize = gensim.utils.simple_preprocess
    for text in sentences:
        tokens = tokenize(str(text), deacc=True)
        yield tokens

def remove_stopwords(texts):
    """Return ``texts`` without the tokens listed in the global ``stop_words``.

    Args:
        texts: iterable of documents; each is converted with ``str()`` and
            re-tokenized by ``simple_preprocess``.

    Returns:
        list[list[str]]: one filtered token list per document.
    """
    # Build the set once: O(1) membership instead of scanning the list per token
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]

def remove_wordsgreaterthan(texts,length):
    """Keep only the tokens that are strictly longer than ``length`` characters.

    Despite its name, the function keeps (does not remove) the longer words.

    Args:
        texts: iterable of documents; each is converted with ``str()`` and
            re-tokenized by ``simple_preprocess``.
        length: tokens with ``len(token) <= length`` are dropped.

    Returns:
        list[list[str]]: one filtered token list per document.
    """
    kept = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        kept.append([tok for tok in tokens if len(tok) > length])
    return kept

    
# Load the food reviews and print the raw frame
data = pd.read_csv('FoodReviews.csv',low_memory = False)
print(data)

# Remove all columns excluding the text. filter() returns a new frame, so the
# result must be rebound (it was previously discarded).
data = data.filter(items=['Text']) #'Id','Summary'

#Preprocess data
# Strip punctuation and convert the text to lowercase.
# NOTE(review): inside the character class, `$-<` is a RANGE from '$' to '<',
# which also covers the digits 0-9 and % & ' ( ) * + - / : ; - this matches the
# stated goal of removing numbers, but confirm it is intentional.
data['Text'] = data['Text'].map(lambda x: re.sub('[,.!?@$-<>]', '', x))
data['Text'] = data['Text'].map(lambda x: x.lower())

#filter rows based on length of the review and then choose a sample size
long_reviews = data.loc[data['Text'].str.len() > 60]
# Cap the sample size so that fewer than 1000 qualifying reviews do not raise
data = long_reviews.sample(min(1000, len(long_reviews)))
print(data['Text'])

# From here on `data` is a plain list of review strings
data = data.Text.values.tolist()
data_words = list(sent_to_words(data))
data_words = remove_stopwords(data_words)
# Keep only tokens longer than 2 characters
data_words = remove_wordsgreaterthan(data_words,2)

#Using Gensim LDA Model
# Map every unique token to an integer id
id2word = corpora.Dictionary(data_words)
texts = data_words
# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = [id2word.doc2bow(text) for text in texts]
# Preview up to the first 30 (token_id, count) pairs of the first document
print(corpus[:1][0][:30])

from pprint import pprint
num_topics = 10
# NOTE(review): no random_state is passed here (unlike the LdaModel built for
# the product reviews), so the topics may differ between runs - confirm.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the keywords of the 10 topics
pprint(lda_model.print_topics())

In [None]:
# Graphical Representation of LDA using Food Reviews dataset
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Enable inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression so the notebook displays the prepared visualization
p

In [None]:
# LDA model evaluation using Food Reviews dataset
import numpy as np 
import pandas as pd
import re
import spacy
import gensim
from gensim.utils import simple_preprocess
import nltk
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import matplotlib.pyplot as plt

#functions
def stopwords():
    """Load the stop words stored in ``stopwords.txt``.

    Each line of the file is split on whitespace and every resulting word is
    lowercased. The file is looked up in the current working directory.

    Returns:
        list[str]: lowercased words in file order (duplicates are kept).
    """
    collected = []
    with open("stopwords.txt",'r') as source:
        for row in source:
            collected.extend(entry.lower() for entry in row.split())
    return collected
	
def remove_stopwords(texts):
    """Return ``texts`` without the tokens listed in the global ``stop_words``.

    Args:
        texts: iterable of documents; each is converted with ``str()`` and
            re-tokenized by ``simple_preprocess``.

    Returns:
        list[list[str]]: one filtered token list per document.
    """
    # Build the set once: O(1) membership instead of scanning the list per token
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]
             
#Tokenize
def sent_to_words(sentences):
    """Generate a token list for every sentence.

    Args:
        sentences: iterable of items; each is converted with ``str()`` first.

    Yields:
        list: tokens from ``gensim.utils.simple_preprocess``; ``deacc=True``
        strips accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )

#Data
data = pd.read_csv('FoodReviews.csv',low_memory = False)
stop_words = stopwords()

#Clean Data
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the original frame
data = data.loc[data['Text'].str.len() > 60].copy()
# NOTE(review): inside the character class, `$-<` is a RANGE from '$' to '<',
# so digits and several other symbols are removed as well - confirm intent.
data['Text'] = data['Text'].map(lambda x: re.sub('[,.!?@$-<>]', '', x)) #Remove misc
data['Text'] = data['Text'].map(lambda x: x.lower()) #Lowercase string
data['Text']= data['Text'].map(lambda x: re.sub(r'\s+', ' ', x))  #Remove double spaces/new lines

#Turn to List and remove stop words
# The list must be bound back to `data`: previously the result was discarded,
# so the tokenizer and the vectorizer below iterated over the DataFrame, which
# yields its column NAMES instead of the review texts.
data = data.Text.values.tolist()
data_words = list(sent_to_words(data))
data_words = remove_stopwords(data_words)

#Vectorize data
# NOTE(review): the vectorizer is fitted on the raw review strings in `data`;
# the stop-word-filtered `data_words` is not used here - confirm intent.
# No sampling is applied in this cell, so every qualifying review is used.
vectorizer = CountVectorizer(analyzer='word',lowercase=True)
data_vectorized = vectorizer.fit_transform(data)

#Candidate topic counts and learning decays for the LDA model
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .6, .7, .8, .9]}
lda = LatentDirichletAllocation(learning_method='online', learning_offset=10.0, random_state=0)
 
#Use GridSearchCV to find the best parameter combination
#according to Log Likelihood and Model Perplexity
# No `scoring` argument is given, so the estimator's own score() ranks the candidates
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)
best_lda_model = model.best_estimator_

print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity is computed on the same data the search was fitted on
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))


In [None]:
#####################################################################################################
# Non Negative Matrix Factorization Using Datasets Amazon Product Reviews and Food Reviews
#####################################################################################################
import os
import pandas as pd
import re

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import numpy as np

import matplotlib.pyplot as plt

# Directory that holds the input CSV files
BASE_PATH = "archive"
# Dataset to process: "food" selects food.csv, any other value selects reviews.csv
DATASET_TYPE = "reviews"
# Number of NMF topics to extract
NUM_TOPICS = 9

def get_files_to_read(directory_name):
    """Return the names of all entries inside ``directory_name``.

    Args:
        directory_name (str): Directory to list.

    Returns:
        list: Entry names as reported by ``os.listdir`` (order unspecified).
    """
    return os.listdir(directory_name)

def read_file(base_path, dataset_type):
    """Return a dataframe holding only the review-text column of a dataset.

    Args:
        base_path (str): Directory containing the CSV files, ex: "archive"
        dataset_type (str): "food" reads ``food.csv`` (column ``Text``);
            any other value reads ``reviews.csv`` (column ``reviews.text``)

    Returns:
        [pandas df]: Single-column copy with the review text
    """
    # The two datasets differ only in file name and text column name
    if dataset_type == "food":
        file_name, text_column = "food.csv", "Text"
    else:
        file_name, text_column = "reviews.csv", "reviews.text"

    file_data = pd.read_csv(os.path.join(base_path, file_name))
    # .copy() so later in-place cleaning does not touch the frame read from disk
    review_data = file_data[[text_column]].copy()

    # add .sample(100) for testing
    print(review_data.head())

    return review_data

def prepare_text_regex(text_df, dataset_type):
    """Takes review text and applies regex filtering and lowercasing

    Removes the characters ``, . ! ?`` and the substring ``www`` from every
    review, then lowercases it. The text column of ``text_df`` is overwritten
    in place.

    Args:
        text_df (dataframe): The dataframe of review text
        dataset_type (str): "food" selects column ``Text``; any other value
            selects column ``reviews.text``

    Returns:
        text_df (dataframe): The same dataframe object with the cleaned text
    """
    column_name = "Text" if dataset_type == "food" else "reviews.text"

    def _clean(text):
        # Raw string: '\.' is an invalid escape in a normal string literal
        text = re.sub(r'[,\.!?]', '', text)
        # 'www' is removed before lowercasing, so only lowercase 'www' is hit
        text = re.sub('www', '', text)
        return text.lower()

    text_df[column_name] = text_df[column_name].map(_clean)

    return text_df

def sent_to_words(sentences):
    """Tokenize sentences one at a time.

    Args:
        sentences: iterable of items; each is converted with ``str()`` first.

    Yields:
        list: tokens from ``gensim.utils.simple_preprocess`` for one sentence
        (``deacc=True`` strips accent marks).
    """
    for raw in sentences:
        as_text = str(raw)
        yield gensim.utils.simple_preprocess(as_text, deacc=True)

def remove_stopwords(texts, stopwords):
    """Return ``texts`` with every token contained in ``stopwords`` removed.

    Args:
        texts: iterable of documents; each is converted with ``str()`` and
            re-tokenized by ``simple_preprocess``.
        stopwords: iterable of words to drop.

    Returns:
        list[list[str]]: one filtered token list per document.
    """
    # Build the set once: O(1) membership instead of scanning a list per token
    blocked = set(stopwords)
    return [[word for word in simple_preprocess(str(doc))
            if word not in blocked] for doc in texts]

def get_NMF_topics(model, vectorizer, top_word_num, num_topics):
    """Collects the top words of each NMF topic and writes them to CSV

    The table is saved as ``nmf_<DATASET_TYPE>_topics.csv`` in the current
    working directory.

    Args:
        model (NMF): Fitted NMF model (its ``components_`` are read)
        vectorizer (Vectorizer class): Fitted sklearn vectorizer
        top_word_num (int): Number of words to get for each topic
        num_topics (int): Number of topics to read from ``model.components_``

    Returns:
        nmf_df (dataframe): Pandas dataframe with one column per topic
            ("Topic #1", ...) holding that topic's top words
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    top_words_dict = {}
    for i in range(num_topics):
        # Indices of the top_word_num largest weights, in descending order
        word_ids = model.components_[i].argsort()[:-top_word_num - 1:-1]
        words = [feature_names[key] for key in word_ids]
        # Raw strings: '\S' and '\s' are invalid escapes in normal literals
        words = [re.sub(r'\S*@\S*\s?', '', word) for word in words]
        words = [re.sub(r'\s+', ' ', word) for word in words]
        words = [re.sub("'", "", word) for word in words]
        top_words_dict[f'Topic #{i+1}'] = words

    nmf_df = pd.DataFrame(top_words_dict)
    nmf_df.to_csv(f"nmf_{DATASET_TYPE}_topics.csv")

    return nmf_df

def get_nmf_weights_data(weights, features):
    """Pair every term with its weight, per topic, ordered by absolute weight.

    Args:
        weights: 2-D numpy array, one row of term weights per topic.
        features: sequence of term names, indexed like the weight columns.

    Returns:
        list: one 2-column array per topic; column 0 holds the terms and
        column 1 the matching weights, sorted by descending ``abs(weight)``.
        Stacking terms with numbers yields string arrays.
    """
    names = np.array(features)
    # Ascending argsort of |weight| per row, reversed to get descending order
    order = np.argsort(np.abs(weights))[:, ::-1]
    ranked_weights = np.take_along_axis(weights, order, axis=1)
    ranked_terms = names[order]

    topics = []
    for term_row, weight_row in zip(ranked_terms, ranked_weights):
        topics.append(np.column_stack((term_row, weight_row)))
    return topics

def plot_words(model, feature_names, dataset_type, num_top_words=10):
    """Used to generate the topic plot after NMF processing

    Draws one horizontal bar chart per topic and saves the figure as
    ``nmf_model_<dataset_type>_topics.png`` in the current working directory.

    Args:
        model (NMF): Fitted scikit-learn NMF model (``components_`` is read)
        feature_names : Term names, indexed like the columns of ``components_``
        dataset_type (str): "food" selects the food title; any other value
            selects the product review title. Also used in the file name.
        num_top_words : Amount of top words for each topic (used 10 as default)
    """
    n_topics = len(model.components_)
    n_cols = 3
    # At least the original 3x3 layout; extra rows are added only when there
    # are more than 9 topics, which previously raised IndexError
    n_rows = max(3, -(-n_topics // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), sharex=True)
    axes = axes.flatten()

    for topic_index, topic in enumerate(model.components_):
        # Indices of the num_top_words largest weights, in descending order
        top_features_ind = topic.argsort()[:-num_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_index]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic #{topic_index +1}',
                     fontdict={'fontsize': 25})
        # Largest weight at the top of each chart
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # The title does not depend on the topic, so it is set once
    if dataset_type == "food":
        fig.suptitle("NMF Model Topics -- Food Dataset", fontsize=30)
    else:
        fig.suptitle("NMF Model Topics -- Product Review Dataset", fontsize=30)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    # plt.show()

    plt.savefig(f"nmf_model_{dataset_type}_topics.png")


def main():
    """Run the NMF topic-modelling pipeline for the configured dataset.

    Reads the dataset selected by ``DATASET_TYPE`` from ``BASE_PATH``, cleans
    and tokenizes the review text, removes stop words, fits a TF-IDF + NMF
    model with ``NUM_TOPICS`` components, then writes the topic table (CSV)
    and the topic plot (PNG) via ``get_NMF_topics`` and ``plot_words``.
    """
    df = read_file(BASE_PATH, DATASET_TYPE)
    processed_df = prepare_text_regex(df, DATASET_TYPE)

    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'br'])

    # The two datasets (amazon products and food reviews) use different
    # column names for the text data
    text_column = "Text" if DATASET_TYPE == "food" else "reviews.text"
    data = processed_df[text_column].values.tolist()

    data_words = list(sent_to_words(data))
    # remove stop words
    data_words = remove_stopwords(data_words, stop_words)

    # The vectorizer expects strings, so re-join the filtered tokens.
    # (The gensim dictionary/corpus built here before was never used.)
    sentences = [' '.join(text) for text in data_words]

    tf_vectorizer = TfidfVectorizer(analyzer="word", max_features=2000, stop_words="english")
    x = tf_vectorizer.fit_transform(sentences)

    nmf_model = NMF(n_components=NUM_TOPICS, init="nndsvd")
    nmf_model.fit(x)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        nmf_features = tf_vectorizer.get_feature_names_out()
    else:
        nmf_features = tf_vectorizer.get_feature_names()

    # Use the configured topic count instead of a hard-coded 9 so that
    # changing NUM_TOPICS keeps the CSV in sync with the fitted model
    get_NMF_topics(nmf_model, tf_vectorizer, 10, NUM_TOPICS)
    plot_words(nmf_model, nmf_features, DATASET_TYPE, 10)


if __name__ == "__main__":
    main()
