In [None]:
import psycopg2
import nltk
from wordcloud import WordCloud
import unicodedata
import pandas as pd
import pprint
import pickle
import re
import os
from nltk.corpus import wordnet
import time
from nltk.tokenize import RegexpTokenizer
wnl = nltk.WordNetLemmatizer()
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Connection settings are read from the standard libpq environment variables so
# that credentials never have to be typed into (and saved with) the notebook.
# Any variable that is unset falls back to the empty string.
dbname = os.environ.get('PGDATABASE', '')
username = os.environ.get('PGUSER', '')
host = os.environ.get('PGHOST', '')
password = os.environ.get('PGPASSWORD', '')

# Keyword arguments let psycopg2 quote every value itself, so passwords that
# contain spaces or quotes cannot corrupt a hand-built DSN string.
conn = psycopg2.connect(dbname=dbname, user=username, host=host, password=password)
cur = conn.cursor()

In [None]:
# Column labels for the result, in the same order as the SELECT list below.
cols = ['business_id', 'name', 'categories', 'review_text']

# Join each business to its reviews, keeping rows where restaurant_dummy = 1.
cur.execute("""
    SELECT business.business_id, name, categories, review_text FROM business
    JOIN review ON business.business_id = review.business_id WHERE business.restaurant_dummy = 1
""")

# Pull the complete result set into a DataFrame.
restaurant_sample = pd.DataFrame(data=cur.fetchall(), columns=cols)

In [None]:
# Preview only the first rows; rendering the whole joined table would bloat the
# notebook and put every review text into the saved output.
restaurant_sample.head()

In [None]:
def _process_review(text):
    def _create_stop_words():
        stops = nltk.corpus.stopwords.words('english')
    
        neg_stops = ['no',
         'nor',
         'not',
         'don',
         "don't",
         'ain',
         'aren',
         "aren't",
         'couldn',
         "couldn't",
         'didn',
         "didn't",
         'doesn',
         "doesn't",
         'hadn',
         "hadn't",
         'hasn',
         "hasn't",
         'haven',
         "haven't",
         'isn',
         "isn't",
         'mightn',
         "mightn't",
         'mustn',
         "mustn't",
         'needn',
         "needn't",
         'shan',
         "shan't",
         'shouldn',
         "shouldn't",
         'wasn',
         "wasn't",
         'weren',
         "weren't",
         "won'",
         "won't",
         'wouldn',
         "wouldn't",
         'but',
         "don'",
         "ain't"]

        common_nonneg_contr = ["could've",
        "he'd",
        "he'd've",
        "he'll",
        "he's",
        "how'd",
        "how'll",
        "how's",
        "i'd",
        "i'd've",
        "i'll",
        "i'm",
        "i've",
        "it'd",
        "it'd've",
        "it'll",
        "it's",
        "let's",
        "ma'am",
        "might've",
        "must've",
        "o'clock",
        "'ow's'at",
        "she'd",
        "she'd've",
        "she'll",
        "she's",
        "should've",
        "somebody'd",
        "somebody'd've",
        "somebody'll",
        "somebody's",
        "someone'd",
        "someone'd've",
        "someone'll",
        "someone's",
        "something'd",
        "something'd've",
        "something'll",
        "something's",
        "that'll",
        "that's",
        "there'd",
        "there'd've",
        "there're",
        "there's",
        "they'd",
        "they'd've",
        "they'll",
        "they're",
        "they've",
        "'twas",
        "we'd",
        "we'd've",
        "we'll",
        "we're",
        "we've",
        "what'll",
        "what're",
        "what's",
        "what've",
        "when's",
        "where'd",
        "where's",
        "where've",
        "who'd",
        "who'd've",
        "who'll",
        "who're",
        "who's",
        "who've",
        "why'll",
        "why're",
        "why's",
        "would've",
        "y'all",
        "y'all'll",
        "y'all'd've",
        "you'd",
        "you'd've",
        "you'll",
        "you're",
        "you've"]

        letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
          'u', 'v', 'w', 'x', 'y', 'z']
        
        ranks = ['st', 'nd', 'rd', 'th']
        
        for x in neg_stops:
            if x in stops:
                stops.remove(x)

        new_stops = stops + common_nonneg_contr + letters + ranks + [""] + ['us'] + [''] + ["'"]
        stops = list(set(new_stops))
        return stops

    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].lower()
        tag_dict = {"a": wordnet.ADJ,
                    "n": wordnet.NOUN,
                    "v": wordnet.VERB,
                    "r": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    def _clean_review(text):
        text = text.lower()
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8', 'ignore')
        tokenizer = nltk.RegexpTokenizer('\w+\'?\w+')
        filtered_tokens = [(re.sub(r"[^A-Za-z\s']", '', token)) for token in tokenizer.tokenize(text)]
        stops = _create_stop_words()
        tokens = [token for token in filtered_tokens if token not in stops]
        tokens = [re.sub("'s", '', token) for token in tokens if re.sub("'s", '', token) != '']
        for i, token in enumerate(tokens):
            tokens[i] = wnl.lemmatize(token, pos= get_wordnet_pos(token))
        tokens = [token for token in tokens if token not in stops]
        return tokens
    
    return _clean_review(text)

In [None]:
def apply_on_column(data, token_col='restaurant_review_tokens', text_col='review_text'):
    """Tokenize a text column with ``_process_review`` and store the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw review text; it is modified in place.
    token_col : str, optional
        Name of the column that receives the token lists.
    text_col : str, optional
        Name of the column that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``token_col`` added or overwritten.
    """
    data[token_col] = data[text_col].apply(_process_review)
    return data

In [None]:
# Keep a 10% random subsample; the fixed seed makes the draw reproducible.
# NOTE: the name is rebound, so re-running this cell shrinks the frame again.
restaurant_sample = restaurant_sample.sample(frac=0.1, random_state=42)

In [None]:
start = time.time()
apply_on_column(restaurant_sample)
end = time.time()
dur = end - start
# Verify that the function is working.
# Report the frame that was actually processed: `sample` is not defined in any
# visible cell and raised NameError here.
print('Processed {} instances in {} minutes {} seconds.\n'.format(restaurant_sample.shape[0], dur//60, dur%60))

In [None]:
def _make_long_str(tokens_col):
    long_list = [token for review in tokens_col.values for token in review]
    long_string = ' '.join(long_list)
    return long_string

In [None]:
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud from every token of the restaurant reviews
wordcloud.generate(_make_long_str(restaurant_sample['restaurant_review_tokens']))
# Save the image to disk first ...
wordcloud.to_file('./restaurants_sample_wordcloud.png')
# ... then leave the image as the cell's last expression so the notebook
# renders it (a to_image() call in the middle of a cell is silently discarded).
wordcloud.to_image()

In [None]:
# Column labels for the result, in the same order as the SELECT list below.
cols = ['business_id', 'name', 'categories', 'review_text']

# Join each business to its reviews, keeping rows where health_dummy = 1.
cur.execute("""
    SELECT business.business_id, name, categories, review_text FROM business
    JOIN review ON business.business_id = review.business_id WHERE business.health_dummy = 1
""")

# Pull the complete result set into a DataFrame.
health_sample = pd.DataFrame(data=cur.fetchall(), columns=cols)

In [None]:
def apply_on_column(data, token_col='health_review_tokens', text_col='review_text'):
    """Tokenize a text column with ``_process_review`` and store the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw review text; it is modified in place.
    token_col : str, optional
        Name of the column that receives the token lists.
    text_col : str, optional
        Name of the column that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``token_col`` added or overwritten.
    """
    data[token_col] = data[text_col].apply(_process_review)
    return data

In [None]:
# Time the tokenization of the health reviews.
start = time.time()
apply_on_column(health_sample)
end = time.time()
dur = end - start
# Sanity check: report the row count and the elapsed time split into
# whole minutes and remaining seconds.
print('Processed {} instances in {} minutes {} seconds.\n'.format(len(health_sample), *divmod(dur, 60)))

In [None]:
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud from every token of the health reviews
wordcloud.generate(_make_long_str(health_sample['health_review_tokens']))
# Save the image to disk first ...
wordcloud.to_file('./health_sample_wordcloud.png')
# ... then leave the image as the cell's last expression so the notebook
# renders it (a to_image() call in the middle of a cell is silently discarded).
wordcloud.to_image()

## See what tokens correlate with different star ratings

In [None]:
def dummy_fun(text):
    """Identity function: hand back ``text`` exactly as it was received."""
    return text


# This vectorizer is fed reviews that are ALREADY tokenized (the lists in
# review_sample['review_tokens'], produced by running _process_review on the
# raw text). The preprocessing and tokenizing hooks are therefore replaced by
# the pass-through dummy_fun, and the built-in token pattern is disabled.
tfidf2 = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)

In [None]:
# Labels applied to the SELECT * result.
# NOTE(review): this assumes the review table has exactly these nine columns in
# this order - confirm against the schema.
cols = ['review_id', 'user_id', 'business_id', 'stars', 'review_date', 'review_text', 'useful', 'funny', 'cool']

# Grab 1000 reviews (no ORDER BY, so which rows come back is up to the database).
cur.execute("""
    SELECT * FROM review LIMIT 1000
""")

# Pull the complete result set into a DataFrame.
review_sample = pd.DataFrame(data=cur.fetchall(), columns=cols)

In [None]:
def apply_on_column(data, token_col='review_tokens', text_col='review_text'):
    """Tokenize a text column with ``_process_review`` and store the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw review text; it is modified in place.
    token_col : str, optional
        Name of the column that receives the token lists.
    text_col : str, optional
        Name of the column that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``token_col`` added or overwritten.
    """
    data[token_col] = data[text_col].apply(_process_review)
    return data

In [None]:
# Measure how long apply_on_column needs to tokenize the review sample.
start = time.time()
apply_on_column(review_sample)
end = time.time()
dur = end - start
# Sanity check: report the row count and the elapsed time split into
# whole minutes and remaining seconds.
print('Processed {} instances in {} minutes {} seconds.\n'.format(len(review_sample), *divmod(dur, 60)))

In [None]:
# Two-column view (rating, tokens) ordered by rating.
tokens_stars_df = review_sample.loc[:, ['stars', 'review_tokens']].sort_values(by='stars')
# Collapse to one entry per distinct rating; for a repeated rating the tokens
# of the last row in the sorted frame win.
tokens_stars = {star: tokens for star, tokens in tokens_stars_df.values}

In [None]:
# Target: the star rating of each review.
labels = review_sample['stars']
# Fit the TF-IDF model on the token lists and convert the result to a dense array.
features = tfidf2.fit_transform(review_sample['review_tokens']).toarray()
# Display (n_reviews, n_features).
features.shape

In [None]:
# Number of top unigrams / bigrams to print per star rating.
N = 2

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# The vocabulary does not depend on the star rating, so it is fetched once.
if hasattr(tfidf2, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf2.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf2.get_feature_names())

# Only the distinct star values are needed from tokens_stars.
for star in sorted(tokens_stars):
    # One-vs-rest chi-squared test: this rating against all others.
    features_chi2 = chi2(features, labels == star)
    # Ascending sort by chi2 statistic, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("'{}' Star:".format(star))
    print("Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))