In [1]:
# import packages
import json, re, nltk
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer

from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer

from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load yelp.csv (path relative to the notebook's working directory).
# Later cells read its 'business_id', 'text' and 'stars' columns.
df = pd.read_csv('yelp.csv')

In [3]:
# Number of reviews per business, most-reviewed first. The index holds the
# business ids, so the Series length is also the number of distinct businesses.
counts = df['business_id'].value_counts()
counts

bZiIIUcpgxh8mpKMDhdqbA    232
oz882XuZCxajKo64Opgq_Q    188
H_RM2u1WWGU1HkKZrYq2Ow    187
jREzLrIEkc4jQKLfYMJ0gg    130
VPqWLp9kMiZEbctCebIZUA    115
                         ... 
lK0_P52uDUcMarPkhuIqog      1
4OfisvEJwSj8srvFhfKg2w      1
VpPu3bIsz9zTU5wUlfDfAA      1
MC6sg7z7pAUrJ3mWPCO4Jg      1
EDnrlLOt72MRuMaUEc9iBA      1
Name: business_id, Length: 8086, dtype: int64

In [4]:
# Keep only the businesses that have more than 20 reviews.
# NOTE(review): the earlier wording was "less than 20", but the filter is
# strictly `> 20`, so businesses with exactly 20 reviews are dropped as well —
# confirm that this is the intended cutoff.
df20 = df[df['business_id'].isin(counts[counts > 20].index)]

print("There are ", df20['business_id'].nunique(), "restaurants left.")

There are  343 restaurants left.


In [5]:
# Empty results table: one (initially empty) column per field that the
# summarisation loop fills in for every business.
result_list = {
    column: []
    for column in ('id', 'summary_sentence', 'key_words')
}
df_result_list = pd.DataFrame(result_list)
df_result_list

Unnamed: 0,id,summary_sentence,key_words


In [None]:
# ---------------------------------------------------------------------------
# Per-business review summarisation and keyword extraction.
#
# For every business in `df20` this cell
#   1. joins all of its review sentences and extracts a LexRank summary,
#   2. scores the summary's words with a PageRank iteration over a word
#      co-occurrence graph (TextRank-style),
#   3. stores id / summary / top keywords as one row of `df_result_list`.
# ---------------------------------------------------------------------------

NUM_SENTS = 10       # sentences kept in each extractive summary
NUM_KEYWORDS = 10    # top-ranked keywords reported per business
# NOTE(review): current spaCy models appear to tag conjunctions as
# CCONJ/SCONJ — confirm whether 'CONJ' still matches any token.
CANDIDATE_POS = ['NOUN', 'ADJ', 'ADV', 'CONJ']
WINDOW_SIZE = 8      # co-occurrence window (in candidate words) within a sentence
DAMPING = 0.85       # damping coefficient, usually is .85
THRESHOLD = 1e-5     # convergence threshold on the summed PageRank values
STEPS = 10           # maximum number of PageRank iterations

# Loop-invariant objects: build them once instead of once per business.
nlp = spacy.load('en_core_web_sm')
_sumy_tokenizer = Tokenizer("english")
_lexrank = LexRankSummarizer()


def _summarize_reviews(reviews, num_sents):
    """Return a LexRank summary (at most `num_sents` sentences) of `reviews`.

    `reviews` is an iterable of review strings. Each review is split into
    sentences first so that the combined document has exactly one space
    between consecutive sentences. The selected sentences are joined with a
    single space so they stay separable downstream.
    """
    review_sentences = [
        sentence
        for review in reviews
        for sentence in nltk.sent_tokenize(review)
    ]
    parser = PlaintextParser.from_string(" ".join(review_sentences), _sumy_tokenizer)
    return " ".join(str(sentence) for sentence in _lexrank(parser.document, num_sents))


def _candidate_words(doc, candidate_pos, lower=False):
    """Return, for each sentence of the spaCy `doc`, its candidate keywords.

    A token is a candidate when its POS tag is in `candidate_pos` and it is
    not a stop word. Words are lower-cased only when `lower` is true.
    """
    return [
        [
            token.text.lower() if lower else token.text
            for token in sent
            if token.pos_ in candidate_pos and not token.is_stop
        ]
        for sent in doc.sents
    ]


def _textrank_weights(sentences, window_size, d, threshold, steps):
    """Score words with PageRank over their co-occurrence graph.

    `sentences` is a list of word lists. Two words are linked when they occur
    in the same sentence fewer than `window_size` positions apart. Returns an
    OrderedDict mapping word -> weight, sorted by descending weight.
    """
    # Vocabulary: word -> matrix index, in order of first appearance.
    vocab = OrderedDict()
    for sentence in sentences:
        for word in sentence:
            if word not in vocab:
                vocab[word] = len(vocab)

    # Adjacency matrix. Every co-occurring pair is simply set to 1, so no
    # separate de-duplicated pair list is needed.
    vocab_size = len(vocab)
    g = np.zeros((vocab_size, vocab_size), dtype='float')
    for sentence in sentences:
        for position, word in enumerate(sentence):
            for neighbour in sentence[position + 1:position + window_size]:
                g[vocab[word], vocab[neighbour]] = 1

    # Get symmetric matrix
    g = g + g.T - np.diag(g.diagonal())

    # Normalize matrix by column. `out=` is required together with `where=`:
    # without it, entries of all-zero columns are left uninitialised.
    norm = np.sum(g, axis=0)
    g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

    # PageRank iteration; stop early once the total weight stops moving.
    pr = np.ones(vocab_size)
    previous_total = 0
    for _ in range(steps):
        pr = (1 - d) + d * np.dot(g_norm, pr)
        total = pr.sum()
        if abs(previous_total - total) < threshold:
            break
        previous_total = total

    weights = ((word, pr[index]) for word, index in vocab.items())
    return OrderedDict(sorted(weights, key=lambda item: item[1], reverse=True))


def _format_keywords(node_weight, num_keywords):
    """Render the first `num_keywords` entries as 'word - weight, word - weight'."""
    top_entries = list(node_weight.items())[:num_keywords]
    return ", ".join(key + ' - ' + str(round(value, 1)) for key, value in top_entries)


# Collect one record per business and build the DataFrame once at the end,
# rather than growing it row by row. `sort=False` keeps the businesses in
# order of first appearance in `df20`.
records = []
for business_id, reviews in df20.groupby('business_id', sort=False)['text']:
    # Missing review text would break sentence tokenisation, so drop it.
    summary = _summarize_reviews(reviews.dropna().astype(str), NUM_SENTS)

    sentences = _candidate_words(nlp(summary), CANDIDATE_POS, lower=False)
    node_weight = _textrank_weights(sentences, WINDOW_SIZE, DAMPING, THRESHOLD, STEPS)
    key_dics = _format_keywords(node_weight, NUM_KEYWORDS)

    records.append({
        'id': business_id,
        'summary_sentence': summary,
        'key_words': key_dics,
    })

df_result_list = pd.DataFrame(records, columns=['id', 'summary_sentence', 'key_words'])

print("finished")

In [None]:
# Display the per-business results table (id, summary_sentence, key_words).
df_result_list

In [None]:
# Write the results to yelp_result.csv in the working directory. The DataFrame
# index is written as well (to_csv default), as an unnamed first column.
df_result_list.to_csv('yelp_result.csv')