In [None]:
import os
import pickle
import pprint
import re
import time
import unicodedata
from collections import Counter
from functools import lru_cache

import nltk
import pandas as pd
import psycopg2
import seaborn as sns
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import FreqDistVisualizer

nltk.download('averaged_perceptron_tagger')
wnl = nltk.WordNetLemmatizer()

In [None]:
# PostgreSQL credentials are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGHOST, PGPASSWORD) so that secrets never get saved in
# the notebook. Export them before starting Jupyter.

dbname = os.environ.get('PGDATABASE', '')
username = os.environ.get('PGUSER', '')
host = os.environ.get('PGHOST', '')
password = os.environ.get('PGPASSWORD', '')

# Keyword arguments let psycopg2 quote every value itself; a hand-formatted
# 'key=value' string breaks as soon as a value contains a space or a quote.
conn = psycopg2.connect(dbname=dbname, user=username, host=host, password=password)
cur = conn.cursor()

In [None]:
# Number of reviews to pull from the database; raise it for a larger sample.
SAMPLE_SIZE = 100

cols = ['review_id', 'user_id', 'business_id', 'stars', 'review_date', 'review_text', 'useful', 'funny', 'cool']

cur = conn.cursor()
# The limit travels as a bound parameter instead of being edited into the SQL text.
cur.execute("SELECT * FROM review LIMIT %s", (SAMPLE_SIZE,))

# NOTE(review): SELECT * relies on the table's column order matching `cols` --
# confirm against the schema.
review_sample = pd.DataFrame(cur.fetchall(), columns=cols)

In [None]:
# Make sure the sample was loaded: preview the first rows rather than dumping the whole frame
review_sample.head()

In [None]:
#View one specific review (row label 9) to eyeball the raw text.
#NOTE(review): the label is hard-coded, so the sample must contain at least 10 rows.
print(review_sample.loc[9, 'review_text'])

In [None]:
# Removed: a disabled (string-literal) copy of the stop-word builder.
# The live stop-word builder is defined together with _process_review() further
# down; this inert duplicate only added noise and could drift out of sync.

In [None]:
# Removed: a disabled (string-literal) first draft of get_wordnet_pos() /
# _clean_review(). It was known to leave '' tokens behind and has been
# superseded by _process_review() further down.

In [None]:
# Removed: a disabled (string-literal) second draft, get_wordnet_pos2() /
# _clean_review2(). Its logic lives on in _process_review() further down.

In [None]:
# Negation-bearing words that NLTK ships as stop words. They are kept as
# tokens because they carry the sentiment of a review.
_NEGATION_WORDS = (
    'no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't",
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", "won'", "won't",
    'wouldn', "wouldn't", 'but', "don'", "ain't",
)

# Common contractions without a negative meaning; treated as stop words.
_NON_NEGATIVE_CONTRACTIONS = (
    "could've", "he'd", "he'd've", "he'll", "he's",
    "how'd", "how'll", "how's",
    "i'd", "i'd've", "i'll", "i'm", "i've",
    "it'd", "it'd've", "it'll", "it's",
    "let's", "ma'am", "might've", "must've", "o'clock", "'ow's'at",
    "she'd", "she'd've", "she'll", "she's", "should've",
    "somebody'd", "somebody'd've", "somebody'll", "somebody's",
    "someone'd", "someone'd've", "someone'll", "someone's",
    "something'd", "something'd've", "something'll", "something's",
    "that'll", "that's",
    "there'd", "there'd've", "there're", "there's",
    "they'd", "they'd've", "they'll", "they're", "they've",
    "'twas",
    "we'd", "we'd've", "we'll", "we're", "we've",
    "what'll", "what're", "what's", "what've",
    "when's",
    "where'd", "where's", "where've",
    "who'd", "who'd've", "who'll", "who're", "who's", "who've",
    "why'll", "why're", "why's",
    "would've",
    "y'all", "y'all'll", "y'all'd've",
    "you'd", "you'd've", "you'll", "you're", "you've",
)

# Fragments such as 'st' or 'th' that remain once the digits of "1st" / "4th"
# have been stripped out.
_RANK_SUFFIXES = ('st', 'nd', 'rd', 'th')

# Everything that is not an ASCII letter, whitespace or an apostrophe.
_NON_LETTERS = re.compile(r"[^A-Za-z\s']")

# A possessive "'s" at the END of a token. The original pattern was unanchored
# and therefore also ate the "'s" inside names such as "o'sullivan".
_TRAILING_POSSESSIVE = re.compile(r"'s$")


@lru_cache(maxsize=1)
def _create_stop_words():
    """Return the customised stop-word collection as a frozenset.

    Starts from NLTK's English stop words, takes the negation words back out,
    then adds the non-negative contractions, every single letter, the rank
    suffixes, 'us' and the empty string.

    The result is cached, so the corpus is read only on the first call, and it
    is a frozenset so membership tests do not scan a list.
    """
    stops = set(nltk.corpus.stopwords.words('english'))
    stops.difference_update(_NEGATION_WORDS)
    stops.update(_NON_NEGATIVE_CONTRACTIONS)
    stops.update('abcdefghijklmnopqrstuvwxyz')
    stops.update(_RANK_SUFFIXES)
    stops.update(('us', ''))
    return frozenset(stops)


@lru_cache(maxsize=100000)
def _lemmatize(token):
    """Lemmatize one token using the part of speech NLTK assigns to it in isolation.

    Results are memoised: the outcome depends only on the token, and review
    text repeats the same words constantly.
    """
    tag = nltk.pos_tag([token])[0][1][0].lower()
    # Penn Treebank tags: JJ* = adjective, NN* = noun, VB* = verb, RB* = adverb.
    # The adjective key must be 'j'; with the previous key 'a' no tag ever
    # matched and every adjective was lemmatized as a noun.
    pos_by_initial = {"j": wordnet.ADJ,
                      "n": wordnet.NOUN,
                      "v": wordnet.VERB,
                      "r": wordnet.ADV}
    return wnl.lemmatize(token, pos=pos_by_initial.get(tag, wordnet.NOUN))


def _process_review(text):
    """Turn one raw review into a list of cleaned, lemmatized tokens.

    Steps: lower-case, fold to ASCII, tokenize into words of two or more
    characters (one inner apostrophe allowed), strip everything but letters and
    apostrophes, drop stop words, remove a trailing possessive "'s", lemmatize,
    and finally drop any lemma that is empty or a stop word.

    Parameters
    ----------
    text : str
        The raw review text. Any non-string value (e.g. None / NaN coming from
        a NULL database field) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    if not isinstance(text, str):
        return []

    stops = _create_stop_words()

    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8', 'ignore')

    # Raw string: same pattern as before, without the invalid '\w' escape warning.
    tokenizer = nltk.RegexpTokenizer(r"\w+'?\w+")

    tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = _NON_LETTERS.sub('', raw_token)
        if token in stops:
            continue
        token = _TRAILING_POSSESSIVE.sub('', token)
        if not token:
            # e.g. "2's" -> "'s" -> ''
            continue
        lemma = _lemmatize(token)
        # Lemmatizing can turn a token into a stop word; '' is itself in `stops`.
        if lemma not in stops:
            tokens.append(lemma)
    return tokens

In [None]:
# Removed: a disabled (string-literal) earlier version of apply_on_column() that
# called _clean_review(); the live apply_on_column() is defined in the next cell.

In [None]:
#Tokenize the whole review_text column and store the result in a new 'review_tokens' column
def apply_on_column(data):
    """Add a 'review_tokens' column holding _process_review() of each review.

    The frame is modified in place and also returned.
    """
    token_lists = data['review_text'].apply(_process_review)
    data['review_tokens'] = token_lists
    return data

In [None]:
# Removed: a disabled (string-literal) duplicate of the timing code; the live
# timing of apply_on_column() is in the next cell.

In [None]:
#Time how long apply_on_column takes on the review sample
start = time.time()
apply_on_column(review_sample)
end = time.time()
dur = end - start
#Split the elapsed seconds into whole minutes and the remaining seconds
minutes, seconds = divmod(dur, 60)
#Report how many rows were processed and how long it took
print('Processed {} instances in {} minutes {} seconds.\n'.format(len(review_sample), minutes, seconds))

In [None]:
# Check that the 'review_tokens' column was created: preview the first rows only
review_sample.head()

In [None]:
#Print an example review (row label 9) next to the tokens _process_review() produced for it
print('Full review:\n\n{}'.format(review_sample.loc[9, 'review_text']))
print('\n\nTokenized review: \n\n{}'.format(review_sample.loc[9, 'review_tokens']))

## N-gram helpers for visualization

In [None]:
def get_ngrams(tokens, n):
    """Return every run of `n` consecutive tokens, each joined with single spaces."""
    joined = []
    for gram in ngrams(tokens, n):
        joined.append(' '.join(gram))
    return joined

In [None]:
#Sanity check: bi-grams built from the tokens of one example review (row label 9)
get_ngrams(review_sample.loc[9, 'review_tokens'], 2)

In [None]:
#Same check for tri-grams
get_ngrams(review_sample.loc[9, 'review_tokens'], 3)

In [None]:
def apply_ngrams_on_column(data, sizes=range(2, 6)):
    """Add one '<n>_grams' column per requested n-gram size.

    Parameters
    ----------
    data : DataFrame with a 'review_tokens' column of token lists.
        It is modified in place.
    sizes : iterable of int, optional
        The n-gram sizes to build. Defaults to 2 through 5, which matches the
        previously hard-coded behaviour.

    Returns
    -------
    The same `data` object, with the new columns added.
    """
    for n in sizes:
        # Series.apply forwards extra keyword arguments to the function it calls.
        data['{}_grams'.format(n)] = data['review_tokens'].apply(get_ngrams, n=n)
        print('Done creating {}-grams...'.format(n))
    return data

In [None]:
# Build the n-gram columns (in place) and preview the first rows instead of the whole frame
apply_ngrams_on_column(review_sample).head()

In [None]:
#Make sure the n-grams function worked as intended by viewing one example from the dataframe

example_row = 9  # row label of the review to inspect

print('Full review:\n\n{}'.format(review_sample.loc[example_row, 'review_text']))
print('\n\nTokenized review:\n\n{}'.format(review_sample.loc[example_row, 'review_tokens']))
# One print per n-gram column; the text is identical to the former four copy-pasted prints.
for n in range(2, 6):
    print('\n\n{}-grams review: \n\n{}'.format(n, review_sample.loc[example_row, '{}_grams'.format(n)]))

## Counts of Tokens in Corpus

In [None]:
def _identity(tokens):
    """Return the argument unchanged (no-op tokenizer / preprocessor hook)."""
    return tokens


def _count_tokens(tokens):
    """Count pre-tokenized documents.

    `tokens` is an iterable of token lists. Returns a tuple of the
    document-term count matrix and the list of feature names, in column order.
    """
    vectorizer = CountVectorizer(
        tokenizer=_identity,
        preprocessor=_identity,
        token_pattern=None)
    docs = vectorizer.fit_transform(tokens)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when it exists and fall back to the old name on older installs.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return docs, list(get_names())


#Shows a visualization of the most frequent tokens
#Adjust n to show the n top tokens. Default is 50
def _get_top_tokens(tokens, n = 50):
    """Plot a frequency distribution of the `n` most common tokens.

    `tokens` is an iterable of token lists (one list per document).
    Draws the figure; returns nothing.
    """
    docs, features = _count_tokens(tokens)
    visualizer = FreqDistVisualizer(features=features, size=(1080, 720), n = n)
    visualizer.fit(docs)
    # Yellowbrick 1.0 renamed poof() to show(); keep working with either.
    render = getattr(visualizer, 'show', None) or visualizer.poof
    render()


#TODO (from the original author): update this function to spit out a histogram
def _get_least_tokens(tokens, n = 50):
    """Return the `n` least frequent tokens as (token, count) pairs, rarest first.

    `tokens` is an iterable of token lists (one list per document).
    A non-positive `n` yields an empty list; previously n=0 returned every
    token, because a slice starting at -0 starts at the beginning.
    """
    if n <= 0:
        return []
    docs, features = _count_tokens(tokens)
    counts = docs.sum(axis=0).A1
    freq_distribution = Counter(dict(zip(features, counts)))
    return list(reversed(freq_distribution.most_common()[-n:]))

In [None]:
#Let's see what our top 50 tokens are (50 is the default n of _get_top_tokens)
_get_top_tokens(review_sample['review_tokens'])

In [None]:
#...and the 100 least frequent tokens, returned as (token, count) pairs
_get_least_tokens(review_sample['review_tokens'], 100)

## TF-IDF Vectorization

In [None]:
def dummy_fun(text):
    """Return the argument unchanged.

    Passed to the TF-IDF vectorizers below as a no-op tokenizer / preprocessor.
    """
    return text

#tfidf1 works on raw review strings, e.g. review_sample['review_text'].
#_process_review is plugged in as the tokenizer, so all cleaning and tokenizing
#happens inside the vectorizer; the preprocessor is a no-op.
tfidf1 = TfidfVectorizer(
    tokenizer=_process_review,
    preprocessor=dummy_fun,
    token_pattern=None)

#tfidf2 works on reviews that are already token lists, e.g. review_sample['review_tokens'].
#Both hooks are no-ops, so _process_review must have been run on the raw text beforehand.
tfidf2 = TfidfVectorizer(
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

In [None]:
#Run tfidf1 on the raw review text; Y is the resulting document-term matrix
Y = tfidf1.fit_transform(review_sample['review_text'])

In [None]:
#Mapping of each learned token to its column index in Y
tfidf1.vocabulary_

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installs.
_tfidf1_names = getattr(tfidf1, 'get_feature_names_out', None) or tfidf1.get_feature_names

# One row per review, one column per token. toarray() makes the matrix dense,
# which is fine for a small sample but will not scale to the full corpus.
idf_df1 = pd.DataFrame(Y.toarray(), columns=list(_tfidf1_names()))
idf_df1.head()

In [None]:
#Compare one review (row label 11) with its row of TF-IDF weights from tfidf1
print(review_sample.loc[11, 'review_text'])
print(idf_df1.loc[11])

In [None]:
#Run tfidf2 on the pre-tokenized reviews; Z is the resulting document-term matrix
Z = tfidf2.fit_transform(review_sample['review_tokens'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installs.
_tfidf2_names = getattr(tfidf2, 'get_feature_names_out', None) or tfidf2.get_feature_names

# One row per review, one column per token. toarray() makes the matrix dense,
# which is fine for a small sample but will not scale to the full corpus.
idf_df2 = pd.DataFrame(Z.toarray(), columns=list(_tfidf2_names()))
idf_df2.head()

In [None]:
#Compare the same review (row label 11) with its row of TF-IDF weights from tfidf2
print(review_sample.loc[11, 'review_text'])
print(idf_df2.loc[11])