In [None]:
import pandas as pd
import langid
from os import listdir
from os import path
from os import walk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Testing on different review files

In [None]:
# Reading test input: one sample review file per course category.
test_files = {
    "Arts and Humanities": "~/Reviews/Arts_and_Humanities_Reviews/Creative Writing: The Craft of Character- Wesleyan University-reviews.csv",
    "Data Science": "~/Reviews/Data_Science_Reviews/Big Data Modeling and Management Systems-University of California San Diego-reviews.csv",
    "Business": "~/Reviews/Business_Reviews/Developing An Entrepreneurial Mindset: First Step Towards Success-Michigan State University-reviews.csv",
    "Social Sciences": "~/Reviews/Social_Sciences_Reviews/Public Policy Challenges of the 21st Century-University of Virginia-reviews.csv",
}

rev_df = pd.read_csv(test_files["Data Science"])

# Rows with a missing review load as NaN, and scikit-learn's text vectorizers
# reject NaN as an invalid document. Drop those rows and force plain strings so
# the vectorizers in the next cell only ever see text.
reviews = rev_df["Review Text"].dropna().astype(str).tolist()

In [None]:
# Test code from medium article: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
no_features = 1000
no_topics = 20


def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; use the new accessor when it exists and fall
    back to the old one otherwise, so the cell runs on either release line.
    """
    accessor = getattr(vectorizer, "get_feature_names_out", None)
    if accessor is None:
        accessor = vectorizer.get_feature_names
    return list(accessor())


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(reviews)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA is a probabilistic graphical model, so it needs raw term counts rather than tf-idf weights
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(reviews)
tf_feature_names = _feature_names(tf_vectorizer)

# Run NMF
nmf_params = dict(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd')
try:
    nmf_estimator = NMF(alpha=.1, **nmf_params)
except TypeError:
    # scikit-learn >= 1.2 no longer accepts `alpha`; `alpha_W` is its replacement
    # (`alpha_H` defaults to "same", i.e. it follows `alpha_W`).
    # NOTE(review): the newer regularisation terms appear to be scaled differently
    # from the legacy `alpha`, so topics may not match older runs exactly - confirm.
    nmf_estimator = NMF(alpha_W=.1, **nmf_params)
nmf = nmf_estimator.fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

## Final topic modeling on all reviews

In [None]:
def lda(revs, no_topics=10, no_top_words=10, no_features=1000):
    """Extract LDA topics from the English-language reviews in ``revs``.

    Parameters
    ----------
    revs : iterable
        Raw review entries (for example a pandas Series). Entries that are not
        strings - such as the NaN pandas uses for a missing review - are skipped.
    no_topics : int, default 10
        Number of LDA topics to fit.
    no_top_words : int, default 10
        Number of highest-weighted words kept for each topic.
    no_features : int, default 1000
        Vocabulary size cap passed to ``CountVectorizer``.

    Returns
    -------
    dict
        Maps each topic index to its list of top words, highest weight first.
        The dict is empty when no review was classified as English or when
        vectorising / fitting failed (the error is printed, not raised).
    """
    # Filter out reviews that are not in English
    reviews = []
    for rev in revs:
        if not isinstance(rev, str):
            # Nothing to classify for a missing or non-text entry.
            continue
        try:
            if langid.classify(rev)[0] == "en":
                reviews.append(rev)
        except Exception as e:
            # Best effort: one unclassifiable review must not abort the whole file.
            print(e)

    topic_dict = {}

    # We only model topics if at least one English review survived the filter
    if not reviews:
        print("No reviews in English!")
        return topic_dict

    try:
        # LDA is a probabilistic graphical model, so it needs raw term counts
        tf_vectorizer = CountVectorizer(max_features=no_features, stop_words='english')
        tf = tf_vectorizer.fit_transform(reviews)

        # `get_feature_names` was removed in scikit-learn 1.2. Calling it there
        # raised an AttributeError that the handler below swallowed, so every
        # file silently ended up without topics. Prefer the new accessor and
        # fall back to the old one on older releases.
        names_accessor = getattr(tf_vectorizer, "get_feature_names_out", None)
        if names_accessor is None:
            names_accessor = tf_vectorizer.get_feature_names
        tf_feature_names = list(names_accessor())

        # Run LDA (named `lda_model` so it does not shadow this function's own name)
        lda_model = LatentDirichletAllocation(
            n_components=no_topics,
            max_iter=5,
            learning_method='online',
            learning_offset=50.,
            random_state=0,
        ).fit(tf)

        for topic_idx, topic in enumerate(lda_model.components_):
            # argsort is ascending, so slice backwards from the end to get the
            # `no_top_words` largest weights, largest first.
            top_indices = topic.argsort()[:-no_top_words - 1:-1]
            topic_dict[topic_idx] = [tf_feature_names[i] for i in top_indices]
    except Exception as e:
        # Best effort: e.g. an empty vocabulary after stop-word removal.
        print(e)

    return topic_dict

In [None]:
def main(dir_path="~/Reviews/"):
    """Annotate every review CSV under ``dir_path`` with its LDA topic words.

    Each ``.csv`` file found while walking ``dir_path`` is read, its
    "Review Text" column is passed to ``lda``, and the file is rewritten in
    place with one "Topic Group <n>" column per returned topic. When ``lda``
    returns no topics, a single "Topic Group 0" column carrying the marker
    "No reviews in English!" is written instead.

    Parameters
    ----------
    dir_path : str, default "~/Reviews/"
        Root directory to scan; a leading ``~`` is expanded to the home directory.
    """
    # os.walk does not expand "~": given the literal string it finds no such
    # directory and silently yields nothing, so no file would ever be processed.
    root = path.expanduser(dir_path)

    # Read in CSV files, iterate through all folders
    for subdir, _dirs, files in walk(root):
        for file in files:
            if not file.endswith(".csv"):
                continue
            filepath = path.join(subdir, file)
            rev_df = pd.read_csv(filepath)

            if "Review Text" not in rev_df.columns:
                # An unrelated CSV in the tree should not abort the whole batch.
                print(f"Skipping {filepath}: no 'Review Text' column")
                continue

            # Doing LDA
            results = lda(rev_df["Review Text"])

            if results:
                # Assignment aligns on the row index, so the words of each
                # topic fill the first rows of the new column.
                for idx, topics in results.items():
                    rev_df[f"Topic Group {idx}"] = pd.Series(topics)
            else:
                rev_df["Topic Group 0"] = pd.Series(["No reviews in English!"])

            # Write in both cases; previously the marker column built in the
            # `else` branch was discarded because the save only ran on success.
            rev_df.to_csv(filepath, index=False)


main()

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``, iterated as one weight
        vector per topic.
    feature_names : sequence of str
        Vocabulary; position ``i`` names the feature weighted by column ``i``.
    no_top_words : int
        How many words to print for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so slice backwards from the end to get the
        # `no_top_words` largest weights, largest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_indices]))

no_top_words = 10

print("NMF Topic Analysis")
display_topics(nmf, tfidf_feature_names, no_top_words)
print()

# By this point the name `lda` refers to the `lda(revs)` function defined in
# the "Final topic modeling" section, not to the estimator fitted in the test
# cell, so passing `lda` here fails: a function has no `components_`.
# Refit the same test model (identical hyper-parameters and seed) under a name
# that cannot clash with the function.
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

print("LDA Topic Analysis")
display_topics(lda_model, tf_feature_names, no_top_words)