In [13]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.collocations import *
from collections import Counter



In [14]:
def concat_data(all_reviews):
    """Concatenate every review into a single text.

    Parameters
    ----------
    all_reviews : pandas.DataFrame
        Table whose 'Review' column holds the review strings.

    Returns
    -------
    str
        The reviews in row order, each one followed by '. '. An empty
        string is returned when the table has no rows.
    """
    # Nothing to join; also avoids touching the column on an empty frame.
    if len(all_reviews) == 0:
        return ''
    # One join instead of a per-row ``iloc`` lookup and repeated ``+=``, which
    # re-allocates the growing string on every iteration.
    return ''.join(review + '. ' for review in all_reviews['Review'])

In [15]:
def preprocessing_data(full_text):
    """Split raw review text into sentences and cleaned word tokens.

    Parameters
    ----------
    full_text : str
        The text to process.

    Returns
    -------
    clean_sentences : list of str
        One entry per sentence, each passed through the lemmatizer as a
        whole string.
    clean_tokens : list of str
        Lower-cased, lemmatized word tokens of the whole text with English
        stop words removed.
    sentences : list of str
        The sentences exactly as produced by ``sent_tokenize``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemm = WordNetLemmatizer()

    # Sentence-split once and derive the processed copy from it.
    sentences = sent_tokenize(full_text)
    # NOTE(review): the lemmatizer receives whole sentences here rather than
    # single words, so most sentences presumably come back unchanged - confirm
    # whether per-word lemmatization was intended.
    clean_sentences = [lemm.lemmatize(sentence) for sentence in sentences]

    # Strip everything except hyphens, digits, ASCII letters and spaces.
    # Hyphens survive the substitution so that the \w+ tokenizer splits
    # hyphenated words instead of gluing their parts together.
    words = tokenizer.tokenize(re.sub("[^-0-9A-Za-z ]", "", full_text))

    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(stopwords.words('english'))

    clean_tokens = []
    for word in words:
        lowered = word.lower()
        # Check the surface form first: lemmatizing can alter a stop word
        # (e.g. "was" can come back as "wa"), after which it would no longer
        # be recognised as one.
        if lowered in stop_words:
            continue
        lemma = lemm.lemmatize(lowered)
        if lemma not in stop_words:
            clean_tokens.append(lemma)

    return clean_sentences, clean_tokens, sentences

In [16]:
def adjacence_matrix(tokens):
    """Return the distinct adjacent token pairs of ``tokens`` in sorted order.

    Despite the name, the result is not a matrix: it is a flat list of
    ``(first, second)`` tuples, one per distinct pair of consecutive tokens.
    How often a pair occurs is not recorded.

    Parameters
    ----------
    tokens : iterable of str
        The token sequence.

    Returns
    -------
    list of tuple
        Sorted list of unique ``(token_i, token_i+1)`` pairs; empty when
        fewer than two tokens are given.
    """
    # Materialise so that any iterable can be sliced for the pairwise zip.
    tokens = list(tokens)
    # Consecutive pairs, de-duplicated, in lexicographic order. This replaces a
    # collocation finder whose PMI ranking was discarded and whose frequency
    # scores were computed only to be thrown away.
    return sorted(set(zip(tokens, tokens[1:])))


In [17]:
def next_step(word, sorted_ad_matrix):
    """Randomly pick a neighbour of ``word`` from a list of token pairs.

    Parameters
    ----------
    word : str
        The current token.
    sorted_ad_matrix : sequence of tuple
        ``(first, second)`` token pairs. It is scanned twice, so it must be
        re-iterable.

    Returns
    -------
    str
        The other token of a randomly drawn pair that contains ``word``.

    Raises
    ------
    ValueError
        If no pair contains ``word``.
    """
    # Pairs starting with ``word`` come first, then pairs ending with it. A
    # (word, word) pair matches both scans and therefore gets weight 2.
    candidates = Counter(pair for pair in sorted_ad_matrix if pair[0] == word)
    candidates.update(pair for pair in sorted_ad_matrix if pair[1] == word)

    if not candidates:
        # Without this guard the draw below fails with an unrelated-looking
        # message about an empty population.
        raise ValueError("no bigram contains the word %r" % (word,))

    pairs = list(candidates)
    weights = np.array(list(candidates.values()), dtype=float)
    # Draw through the global NumPy RNG so np.random.seed() controls the walk.
    chosen = pairs[np.random.choice(len(pairs), p=weights / weights.sum())]

    return chosen[1] if chosen[0] == word else chosen[0]

In [18]:
def path_extractor(sorted_ad_matrix, n_words, phrases_per_word, in_words):
    """Generate proto-phrases by walking randomly from each seed word.

    For every word in ``in_words``, ``phrases_per_word`` walks are made. Each
    walk records ``n_words`` tokens, starting with the seed word and moving
    on via ``next_step`` after each recorded token.

    Returns
    -------
    list of str
        One string per walk; every recorded token is followed by a single
        space, so a non-empty phrase ends with a trailing space.
    """
    phrases = []
    for seed_word in in_words:
        for _ in range(phrases_per_word):
            walk = []
            current = seed_word
            for _step in range(n_words):
                walk.append(current)
                # Advance after recording, including after the last token.
                current = next_step(current, sorted_ad_matrix)
            phrases.append(''.join(token + ' ' for token in walk))
    return phrases

In [19]:
def final_mapper(phrases, sentences, original_sentences):
    """Map each proto-phrase to the sentence sharing the most words with it.

    Parameters
    ----------
    phrases : list of str
        Space-separated proto-phrases.
    sentences : list of str
        Sentences to score against; each is cleaned, tokenized, lower-cased
        and lemmatized before matching.
    original_sentences : list of str
        Sentences to return; indexed by the position of the best-scoring
        entry of ``sentences``, so the two lists must be parallel.

    Returns
    -------
    list of str
        For every phrase, the entry of ``original_sentences`` whose
        counterpart in ``sentences`` contains the most phrase words. A word
        repeated in the phrase counts each time; ties go to the earliest
        sentence.

    Raises
    ------
    ValueError
        If phrases are given but ``sentences`` is empty.
    """
    if not phrases:
        return []
    if not sentences:
        raise ValueError("cannot rank phrases against an empty sentence list")

    tokenizer = RegexpTokenizer(r'\w+')
    lemm = WordNetLemmatizer()

    # Sentence vocabularies do not depend on the phrase, so build them once
    # instead of re-tokenizing every sentence for every phrase. Non-word
    # characters are stripped except hyphens, digits, ASCII letters and
    # spaces; the \w+ tokenizer then splits on the surviving hyphens.
    vocabularies = [
        {
            lemm.lemmatize(token.lower())
            for token in tokenizer.tokenize(re.sub("[^-0-9A-Za-z ]", "", sentence))
        }
        for sentence in sentences
    ]

    best_sentences = []
    for proto_phrase in phrases:
        # The trailing space of a phrase yields an empty string here; it never
        # matches a token, so it does not affect the score.
        proto_words = proto_phrase.split(' ')
        scores = [
            sum(word in vocabulary for word in proto_words)
            for vocabulary in vocabularies
        ]
        best_sentences.append(original_sentences[scores.index(max(scores))])

    return best_sentences

In [20]:
def full_extraction(full_text, n_words, phrases_per_word, in_words):
    """Run the whole extraction pipeline on ``full_text``.

    Steps: preprocess the text, build the token-pair list, generate
    ``phrases_per_word`` random proto-phrases of ``n_words`` tokens for each
    word in ``in_words``, then map every proto-phrase to a sentence.

    Returns
    -------
    list of str
        One selected sentence per generated proto-phrase.
    """
    processed_sentences, clean_tokens, raw_sentences = preprocessing_data(full_text)
    token_pairs = adjacence_matrix(clean_tokens)
    walks = path_extractor(token_pairs, n_words, phrases_per_word, in_words)
    return final_mapper(walks, processed_sentences, raw_sentences)

In [21]:
# Load the restaurant review data from a JSON file (path is relative to the
# working directory).
data = pd.read_json('data/factual_tripadvisor_restaurant_data_all_100_reviews.json')

# Peek at the text of the second review of the first restaurant.
data['restaurants'].iloc[0]['reviews'][1]['review_text']


'We went to the downtown SF location. The restaurant was really clean and servers were nice!Foods were great! We had a Burger and Ruben Sandwich! Delicious! We ordered a flight to taste 6 different beers! We enjoyed their brown ale and stout!We recommend this place for friends gathering!'

In [22]:
# Concatenate all reviews of the first restaurant, terminating each with '. '.
# A single join avoids re-allocating the growing string on every iteration.
full_text = ''.join(
    review['review_text'] + '. '
    for review in data['restaurants'].iloc[0]['reviews']
)

In [23]:
# The phrase generation draws from NumPy's global RNG; seed it so that
# re-running this cell selects the same sentences.
np.random.seed(0)
selected_sentences = full_extraction(
    full_text,
    n_words=4,
    phrases_per_word=2,
    in_words=['food'],
)
print(selected_sentences)
print(data['restaurants'].iloc[0]['rating'])

["Nothing spectacular but I'd go back.. Mediocre food (not bad, just mediocre, you can find the same food in pretty much every bar in the US, cooked just as well if not better).Service is so so and it's incredibly slow.", 'Outdoor beer garden with benches outside on closed De BOOM street.']
4.0


In [24]:
# Show the sentence selected for the first generated proto-phrase.
print(selected_sentences[0])

Nothing spectacular but I'd go back.. Mediocre food (not bad, just mediocre, you can find the same food in pretty much every bar in the US, cooked just as well if not better).Service is so so and it's incredibly slow.
