In [1]:
import os
import re
import json
import nltk

import pandas as pd

In [None]:
path = os.getcwd()
print("Current Directory", path)
print()

In [3]:
dataset_path = path + '/dataset/Full Dataset/'

dataset_final = pd.read_json (dataset_path + 'dataset_total.json', encoding = 'utf-8')

print('Total Samples:', len(dataset_final))

Total Samples: 161541


In [4]:
display(dataset_final.sample(5))

Unnamed: 0,user_review_posted,user_total_helpful_votes,expertise,user_cities_visited,review_days,helpful_class,review_text
34908,18,10,0.002587,102,0.200438,1,My family and I stayed for one night and were ...
100113,21,4,0.000814,42,0.065717,0,"So, I parked my car at the double tree. Gave ..."
1736,68,20,0.001318,225,0.850493,0,I got a great deal to stay here & although I h...
32231,300,78,0.001152,41,0.401972,1,I stayed here for 10 nights and it became a ho...
21285,9,5,0.002587,5,0.457831,0,The Argonaut is well positioned on Fishermans ...


In [5]:
dataset_final.groupby(['helpful_class']).size()

helpful_class
0    101403
1     47306
2     11170
3      1432
4       230
dtype: int64

In [6]:
a = dataset_final[dataset_final.helpful_class == 0].sample(230)
b = dataset_final[dataset_final.helpful_class == 1].sample(230)
c = dataset_final[dataset_final.helpful_class == 2].sample(230)
d = dataset_final[dataset_final.helpful_class == 3].sample(230)
e = dataset_final[dataset_final.helpful_class == 4].sample(230)

## Preprocessing

In [None]:
import string

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

en_stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
lexicon_path = path + '/Lexicons/'

def getStopWordSets():
    # NLTK stopwords
    stop_words = list(stopwords.words("english"))
    
    stop_words.append('hotel')
    stop_words.append('restaurant')
    stop_words.append('night')
    stop_words.append('day')
    
    # filter out the sentimental and emoticons from the keyphrases
    df_lexicon = pd.read_csv(lexicon_path + 'vader_lexicon.txt', delimiter='\t')
    stop_words += list(df_lexicon['Lexicon'])
    
    df_emoji = pd.read_csv(lexicon_path + 'emoji_utf8_lexicon.txt', delimiter='\t')
    stop_words += list(df_emoji['Emotion'])
    
    return stop_words

In [9]:
import itertools

def extract_candidate_words(text):
    # allowed POS tags
    good_tags = set(['NN','NNS'])
    
    # exclude candidates that are stop words
    stop_words = getStopWordSets()
    
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                   for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and word.isalpha() and len(word) > 2]
    candidates = [lemmatizer.lemmatize(word) for word in candidates]
    candidates = [lemmatizer.lemmatize(word, pos='v') for word in candidates]
    return " ".join(candidates)

In [10]:
print(extract_candidate_words("We investigate vision-and-language models that take as input the cartoon pixels and caption directly, as well as language-only models for which we circumvent image-processing by providing textual descriptions of the image."))

model input cartoon pixel caption model description image


## Unigram, Bigram, and Trigram Finder

In [11]:
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('NN', 'NNS')
    second_type = ('NN', 'NNS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False
    if ngram[0] == ngram[1]:
        return False
    else:
        return True

In [12]:
#function to filter for trigrams
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('NN', 'NNS')
    third_type = ('NN', 'NNS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [13]:
def trigramFinder(filtered_words):
    tokens = nltk.word_tokenize(filtered_words)
    trigrams = nltk.collocations.TrigramAssocMeasures()
    
    trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
    
    #bigramFinder.apply_freq_filter(20)
    trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), 
                                                   columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)
    filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]
    filtered_frame = filteredLik_tri['trigram'][:10]
  
    aspects_list = filtered_frame.values.tolist()
  
    trigram_aspects_list = []
    
    for aspect in aspects_list:
        trigram_aspects_list.append(aspect[0] + ' ' + aspect[1] + ' ' + aspect[2])
    
    return trigram_aspects_list

In [14]:
def bigramFinder(filtered_words):
    tokens = nltk.word_tokenize(filtered_words)
    bigrams = nltk.collocations.BigramAssocMeasures()
    
    bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
    
    #bigramFinder.apply_freq_filter(20)
    bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), 
                                              columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)
    filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]
    filtered_frame = filteredLik_bi['bigram'][:10]
  
    aspects_list = filtered_frame.values.tolist()
  
    bigram_aspects_list = []
    
    for aspect in aspects_list:
        bigram_aspects_list.append(aspect[0] + ' ' + aspect[1])
    
    return bigram_aspects_list

In [15]:
def unigramFinder(filtered_words):
    
    tokens = nltk.word_tokenize(filtered_words)
    
    # Calculate frequency distribution
    fdist = nltk.FreqDist(tokens)
    
    unigram_aspects_list = []
    
    # Output top 30 words
    for word, frequency in fdist.most_common(20):
        unigram_aspects_list.append(word)
    
    return unigram_aspects_list

## Find Top Grams

In [16]:
def findTOPgrams(list_sentences):
    
    sentences = " ".join(list_sentences)
    filtered_words = extract_candidate_words(sentences)
    
    unigram_aspects = unigramFinder(filtered_words)
    bigram_aspects = bigramFinder(filtered_words)
    trigram_aspects = trigramFinder(filtered_words)
    
    return unigram_aspects, bigram_aspects, trigram_aspects

In [17]:
print("CLASS 0")
print("-------------------------------")
class_0_text = list(a['review_text'])

unigram_aspects, bigram_aspects, trigram_aspects = findTOPgrams(class_0_text)

print(unigram_aspects)
print(bigram_aspects)
print(trigram_aspects)

print("CLASS 1")
print("-------------------------------")
class_1_text = list(b['review_text'])

unigram_aspects, bigram_aspects, trigram_aspects = findTOPgrams(class_1_text)

print(unigram_aspects)
print(bigram_aspects)
print(trigram_aspects)

print("CLASS 2")
print("-------------------------------")
class_2_text = list(c['review_text'])

unigram_aspects, bigram_aspects, trigram_aspects = findTOPgrams(class_2_text)

print(unigram_aspects)
print(bigram_aspects)
print(trigram_aspects)

print("CLASS 3")
print("-------------------------------")
class_3_text = list(d['review_text'])

unigram_aspects, bigram_aspects, trigram_aspects = findTOPgrams(class_3_text)

print(unigram_aspects)
print(bigram_aspects)
print(trigram_aspects)

print("CLASS 4")
print("-------------------------------")
class_4_text = list(e['review_text'])

unigram_aspects, bigram_aspects, trigram_aspects = findTOPgrams(class_4_text)

print(unigram_aspects)
print(bigram_aspects)
print(trigram_aspects)

CLASS 0
-------------------------------
['room', 'staff', 'location', 'time', 'service', 'breakfast', 'stay', 'bed', 'pool', 'view', 'food', 'place', 'area', 'desk', 'front', 'bar', 'bathroom', 'night', 'floor', 'family']
['front desk', 'coffee maker', 'breakfast buffet', 'sofa bed', 'swim pool', 'eye contact', 'make facility', 'valet park', 'partner bottom', 'cable car']
['case front desk', 'front desk person', 'rate front desk', 'atrium front desk', 'front desk clerk', 'front desk costume', 'front desk lack', 'front desk monday', 'front desk personnel', 'door front desk']
CLASS 1
-------------------------------
['room', 'staff', 'service', 'location', 'time', 'pool', 'breakfast', 'stay', 'bed', 'place', 'view', 'desk', 'food', 'price', 'front', 'bathroom', 'area', 'bar', 'floor', 'property']
['front desk', 'shampoo conditioner', 'customer service', 'resort fee', 'pool area', 'park garage', 'coffee maker', 'king bed', 'skytrain station', 'bottle water']
['service front desk', 'line fr

## Save the Analysis Dataframe

In [None]:
print(path)

In [26]:
full_data_frame = pd.concat([a, b, c, d, e], axis=0)

with open(dataset_path + "analysis_final.json", 'w', encoding='utf-8') as file:
    file.write(json.dumps(full_data_frame.to_dict('records'), indent=4))