Venyatha Manne

# Task#1: Corpus collection and Corpus Descriptive analysis

In [1]:
import numpy as np
import pandas as pd
import string
from pprint import pprint
import time
from tabulate import tabulate

In [2]:
data = pd.read_csv("ratemd.25k.all.txt", sep="\n", header=None)

# make a list that is : Doctor | Gender | Location | Specializations | (list of) Reviews
def make_complete_list(data):
    data = data.values.tolist()
    data_arr = []
    curr_reviews = []
    last_splitted = []
    l = data[0][0].split("\t")

    for line in data[1:]:    
        splitted = line[0].split("\t")
        length = len(splitted)
        
        # if splitted is a doctor header line
        if length == 4: 
            
            # ignore doctors with no reviews
            if len(curr_reviews) == 0:
                continue

            l.append(curr_reviews)
            data_arr.append(l)
            curr_reviews = []
            l = splitted

            
        # if splitted is a review
        if length == 2:
            # ignore reviews that don't have a written review, regardless of whether they have a rating
            if (splitted[1] != " "):
                curr_reviews.append(splitted)

                
    # add final entry
    l.append(curr_reviews)
    data_arr.append(l)
    
    return data_arr

data_list = make_complete_list(data)


In [3]:
# a review > 3 is positive, else it is negative
def calc_sentiment(review):
    value = float(review[0].split(" ")[2])
    if value > 3:
        return 1
    else:
        return 0
    
# prints out the counts and percentages of positive and negative reviews categorized by gender
# returns a sentiment list corresponding to every review (will be used to determine collections for ccLDA)
def count_sentiment(data_list):
    f_pos = 0
    f_neg = 0
    m_pos = 0
    m_neg = 0
    
    # updated for use in CCLDA for part 2
    sentiment_list = [] # 1 for pos, 0 for neg
    gender_list = [] # 1 for female, 0 for neg

    for item in data_list:        
        if item[1].strip().lower() == "female":
            for i in range(len(item[4])):
                sentiment = calc_sentiment(item[4][i])
                sentiment_list.append(sentiment)
                gender_list.append(1)
                if sentiment == 1:
                    f_pos += 1
                else:
                    f_neg += 1

        elif item[1].strip().lower() == "male":
            for i in range(len(item[4])):
                sentiment = calc_sentiment(item[4][i])
                sentiment_list.append(sentiment)
                gender_list.append(0)
                if sentiment == 1:
                    m_pos += 1
                else:
                    m_neg += 1
            
    # calculate percentages:
    total = f_pos + f_neg + m_pos + m_neg
    f_pos_p =  (f_pos / (f_pos + f_neg))*100
    m_pos_p =  (m_pos /  (m_pos + m_neg))*100
    f_neg_p = (f_neg / (f_pos + f_neg))*100
    m_neg_p = (m_neg / (m_pos + m_neg))*100
    
    f_total = "{:.2f}".format(((f_pos+f_neg)/total)*100)
    m_total = "{:.2f}".format(((m_pos+m_neg)/total)*100)
    
    
    # print out values for table        
    print("Female Positive = ", f_pos, ",", "{:.2f}".format(f_pos_p),"%")
    print("Female Negative = ", f_neg, ",","{:.2f}".format(f_neg_p),"%")
    print("Male Positive = ", m_pos, ",","{:.2f}".format(m_pos_p),"%")
    print("Male Negative = ", m_neg, ",","{:.2f}".format(m_neg_p ),"%")
    
    print("Total Female = ", f_pos+f_neg,",", f_total,"%")    
    print("Total Male = ", m_pos+m_neg,",", m_total,"%")
    return sentiment_list, gender_list
    
# used in Task 2b
sentiment_list, gender_list = count_sentiment(data_list)

Female Positive =  2859 , 62.02 %
Female Negative =  1751 , 37.98 %
Male Positive =  10354 , 68.84 %
Male Negative =  4686 , 31.16 %
Total Female =  4610 , 23.46 %
Total Male =  15040 , 76.54 %


In [4]:
# prints the length of the shortest and longest review as well as the average length of a review
# A review is defined as the number of tokens (i.e., any sequence of characters separated by space and/or beginning/end of review). 
def get_review_lengths():
    all_reviews = []
    
    
    for item in data_list:
        reviews_list = item[4]
        
        for i in range(len(reviews_list)):
            review = reviews_list[i][1]
            length = len(review.split())
            if length > 0:
                all_reviews.append(length)
    return all_reviews
            
reviews = get_review_lengths()
print("Longest review:", max(reviews))
print("Shortest review:", min(reviews))
print("Average length:", "{:.0f}".format((sum(reviews)/len(reviews))))

Longest review: 899
Shortest review: 1
Average length: 65


# Task#2a: Exploratory Analysis of Corpus with LDA

In [5]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mannev1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Problem#1: Without lemmatization

In [6]:
# Step 1: Clean Corpus
# make list of all the reviews     
# clean the text: punctuation, lowercase, stopwords
def clean_text(speech):
    text = speech
    text = text.lower()
    text = re.sub('—\[.*?\]', '', text)
    text = re.sub('—', ' ', text)
    text = re.sub('“', '', text)
    text = re.sub('”', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('[\d\n]', ' ', text)
    
    word_tokens = word_tokenize(text)
    stopwords_ = set(stopwords.words('english'))
    text = [word.lower() for word in word_tokens
             if len(word) > 2 
             and word not in stopwords_]
    return text   

all_reviews = []
for item in data_list:
    review_list = item[4]
    for review in review_list:
        r = review[1]
        r = "".join(r)
        all_reviews.append(clean_text(r))

# Step 2: Create the dictionary
id2word = corpora.Dictionary(all_reviews)

# further preprocessing: have occurred in less than 4 articles or have occurred in more than 40% of the articles
id2word.filter_extremes(no_below=4, no_above=0.4)
term_dictionary = id2word.token2id
print("size of dictionary: ", len(term_dictionary))


# Step 3: Convert the list of documents in your corpus into Document-Term Matrix
doc_term_matrix = [id2word.doc2bow(text) for text in all_reviews]


size of dictionary:  7972


In [22]:
# Step 4: Run the LDA model on the document-term matrix
# Set 1: number of topics (k = 10), number of passes (pass = 20), and number of iterations (iterations = 2000).
start_time = time.time()
lda_model = gensim.models.LdaMulticore(corpus=doc_term_matrix, id2word=id2word, num_topics=10, passes=20, iterations=2000)
print("Problem 1, Set 1","--- %s seconds ---" % (time.time() - start_time))

# Set 2: number of topics (k = 20), number of passes (pass = 20), and number of iterations (iterations = 2000).
start_time = time.time()
lda_model2 = gensim.models.LdaMulticore(corpus=doc_term_matrix, id2word=id2word, num_topics=20, passes=20, iterations=2000)
print("Problem 1, Set 2","--- %s seconds ---" % (time.time() - start_time))

Problem 1, Set 1 --- 130.4043300151825 seconds ---
Problem 1, Set 2 --- 132.97442770004272 seconds ---


In [23]:
def print_tables(topics, title, num_topics):
    table = []
    for topic in topics:
        word_list = []
        for word in topic[0]:
            word_list.append(word[1])
        table.append(word_list)
    
    print(title)
    dict1 = {'Topic 1': table[0], 'Topic 2': table[1], 'Topic 3': table[2], 'Topic 4': table[3], 'Topic 5': table[4]}
    df1 = pd.DataFrame(dict1)
    print(tabulate(df1, headers = 'keys', tablefmt = 'psql'))
    
    dict2 = {'Topic 6': table[5], 'Topic 7': table[6], 'Topic 8': table[7], 'Topic 9': table[8], 'Topic 10': table[9]}
    df2 = pd.DataFrame(dict2)
    print(tabulate(df2, headers = 'keys', tablefmt = 'psql'))
    
    if (num_topics == 20):
        dict1 = {'Topic 11': table[10], 'Topic 12': table[11], 'Topic 13': table[12], 'Topic 14': table[13], 'Topic 15': table[14]}
        df1 = pd.DataFrame(dict1)
        print(tabulate(df1, headers = 'keys', tablefmt = 'psql'))

        dict2 = {'Topic 16': table[15], 'Topic 17': table[16], 'Topic 18': table[17], 'Topic 19': table[18], 'Topic 20': table[19]}
        df2 = pd.DataFrame(dict2)
        print(tabulate(df2, headers = 'keys', tablefmt = 'psql'))
        

top_topics = lda_model.top_topics(doc_term_matrix, topn=10)
print_tables(top_topics, "Problem 1, Set 1:", 10)
top_topics2 = lda_model2.top_topics(doc_term_matrix, topn=10)
print_tables(top_topics2, "Problem 1, Set 2:", 20)

Problem 1, Set 1:
+----+-----------+-----------+-----------+-------------+-------------+
|    | Topic 1   | Topic 2   | Topic 3   | Topic 4     | Topic 5     |
|----+-----------+-----------+-----------+-------------+-------------|
|  0 | doctor    | doctor    | doctor    | office      | time        |
|  1 | patients  | time      | office    | staff       | room        |
|  2 | rude      | great     | told      | doctor      | doctor      |
|  3 | like      | best      | insurance | time        | see         |
|  4 | ever      | recommend | would     | wait        | appointment |
|  5 | care      | always    | said      | get         | minutes     |
|  6 | never     | staff     | called    | always      | waiting     |
|  7 | would     | feel      | get       | call        | wait        |
|  8 | patient   | would     | asked     | never       | went        |
|  9 | one       | questions | never     | appointment | never       |
+----+-----------+-----------+-----------+-------------+---

## Problem#2: With lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


In [11]:
# # Lemmatize

# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizer
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

start_time = time.time()
lemmatized_reviews = []
for review in all_reviews:
    sentence = " ".join(review)
    lemmatized_reviews.append([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])
lemma_time = time.time() - start_time
print("Time taken to lemmatize:","--- %s seconds ---" % (lemma_time))


Time taken to lemmatize: --- 125.15052008628845 seconds ---


In [18]:
# Step 2: Create the dictionary
id2word_l = corpora.Dictionary(lemmatized_reviews)

# further preprocessing: have occurred in less than 4 articles or have occurred in more than 40% of the articles
id2word_l.filter_extremes(no_below=4, no_above=0.4)
term_dictionary_l = id2word.token2id
print("size of dictionary: ", len(term_dictionary_l))


# Step 3: Convert the list of documents in your corpus into Document-Term Matrix
doc_term_matrix_l = [id2word_l.doc2bow(text) for text in lemmatized_reviews]

# Step 4: Run the LDA model on the document-term matrix
# Set 1: number of topics (k = 10), number of passes (pass = 20), and number of iterations (iterations = 2000).
start_time = time.time()
lda_model_l = gensim.models.LdaMulticore(corpus=doc_term_matrix_l, id2word=id2word_l, num_topics=10, passes=20, iterations=2000)
ldatime = time.time() - start_time
print("Problem 1, Set 1","--- %s seconds ---" % (ldatime))
print("Time with lemmatization %s" % (ldatime+lemma_time))
print("\n")



# Set 2: number of topics (k = 20), number of passes (pass = 20), and number of iterations (iterations = 2000).
start_time = time.time()
lda_model2_l = gensim.models.LdaMulticore(corpus=doc_term_matrix_l, id2word=id2word_l, num_topics=20, passes=20, iterations=2000)
ldatime = time.time() - start_time
print("Problem 1, Set 2","--- %s seconds ---" % (ldatime))
print("Time with lemmatization %s" % (ldatime+lemma_time))


top_topics_l = lda_model_l.top_topics(doc_term_matrix_l, topn=10)
print_tables(top_topics_l, "Problem 1, Set 1:",10)
top_topics2_l = lda_model2_l.top_topics(doc_term_matrix_l, topn=10)
print_tables(top_topics2_l, "Problem 1, Set 2:",20)

size of dictionary:  7972
Problem 1, Set 1 --- 141.8912582397461 seconds ---
Time with lemmatization 267.04177832603455


Problem 1, Set 2 --- 133.83016920089722 seconds ---
Time with lemmatization 258.98068928718567
Problem 1, Set 1:
+----+-----------+-----------+-------------+-----------+-----------+
|    | Topic 1   | Topic 2   | Topic 3     | Topic 4   | Topic 5   |
|----+-----------+-----------+-------------+-----------+-----------|
|  0 | call      | pain      | wait        | time      | office    |
|  1 | get       | go        | time        | child     | staff     |
|  2 | say       | say       | appointment | take      | rude      |
|  3 | would     | would     | see         | like      | patient   |
|  4 | told      | told      | hour        | feel      | call      |
|  5 | back      | get       | get         | son       | insurance |
|  6 | day       | want      | minute      | care      | get       |
|  7 | see       | back      | room        | see       | bad       |
|  8 |

# Task#2b: Exploratory Analysis of Corpus with ccLDA 

In [24]:
# cclda data preperation
# split into collections and write to a txt file
# 0 = f_pos, 1 = f_neg, 2 = m_pos, 3 = m_neg

reviews_ = lemmatized_reviews # switch out lemmatized_reviews for all_reviews if you want to run without lemmatization

file = open("input_file.txt","w")

for i in range(len(sentiment_list)):    
    if gender_list[i] == 1: # female
        if sentiment_list[i] == 1:
            file.write("0 " + " ".join(reviews_[i]) + "\n") #f_pos
        else:
            file.write("1 " + " ".join(reviews_[i]) + "\n")
    else:
        if sentiment_list[i] == 1:
            file.write("2 " + " ".join(reviews_[i]) + "\n") #f_pos
        else:
            file.write("3 " + " ".join(reviews_[i]) + "\n")
file.close()
            
