In [None]:
import psycopg2
import nltk
import unicodedata
import pandas as pd
import pprint
import pickle
import re
import os
from nltk.corpus import wordnet
import time
from nltk.tokenize import RegexpTokenizer
#Fetch every NLTK resource this notebook relies on so a fresh environment can run it end to end
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('wordnet')                     # corpus read by WordNetLemmatizer
nltk.download('stopwords')                   # corpus read by nltk.corpus.stopwords
wnl = nltk.WordNetLemmatizer()

In [None]:
#Input your PostgreSQL credentials to connect
#(avoid committing real credentials with the notebook; os.environ or getpass are safer sources)

dbname = ''
username = ''
host = ''
password = ''

#Keyword arguments let psycopg2 build and escape the connection string itself, so values
#that are empty or contain spaces/quotes are not misparsed as they would be in a
#hand-formatted 'key=value' string
conn = psycopg2.connect(dbname=dbname, user=username, host=host, password=password)
cur = conn.cursor()

In [None]:
#Number of reviews to pull from the database; change SAMPLE_SIZE to adjust the sample
SAMPLE_SIZE = 100

cur = conn.cursor()
#The limit is passed as a bound query parameter so the SQL text never has to be edited
cur.execute("""
    SELECT * FROM review LIMIT %s
""", (SAMPLE_SIZE,))

#Labels are applied positionally to the SELECT * result
#(assumes the review table has exactly these nine columns in this order -- confirm against the schema)
cols = ['review_id', 'user_id', 'business_id', 'stars', 'review_date', 'review_text', 'useful', 'funny', 'cool']

review_sample = pd.DataFrame(cur.fetchall(), columns=cols)

In [None]:
#Display the sampled reviews to confirm the query returned rows
review_sample

In [None]:
#Inspect the raw text of a single review (row label 24)
example_text = review_sample.loc[24, 'review_text']
print(example_text)

In [None]:
def _create_stop_words():
    """Build the customized English stop-word list used when cleaning reviews.

    Starts from NLTK's English stop words, takes out the negation-bearing
    words so they survive filtering, then adds common non-negative
    contractions plus the empty string and 'us'.

    Returns:
        list[str]: The de-duplicated stop words, in arbitrary (set) order.
    """
    # Words with a negative connotation (and fragments of them) that must NOT
    # be treated as stop words.
    negations = [
        'no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't",
        'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
        'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
        'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
        'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
        'wasn', "wasn't", 'weren', "weren't", "won'", "won't",
        'wouldn', "wouldn't", 'but', "don'", "ain't",
    ]

    # Common contractions without a negative meaning; these are added to the
    # stop words.
    contractions = [
        "could've", "he'd", "he'd've", "he'll", "he's",
        "how'd", "how'll", "how's",
        "i'd", "i'd've", "i'll", "i'm", "i've",
        "it'd", "it'd've", "it'll", "it's",
        "let's", "ma'am", "might've", "must've", "o'clock", "'ow's'at",
        "she'd", "she'd've", "she'll", "she's", "should've",
        "somebody'd", "somebody'd've", "somebody'll", "somebody's",
        "someone'd", "someone'd've", "someone'll", "someone's",
        "something'd", "something'd've", "something'll", "something's",
        "that'll", "that's",
        "there'd", "there'd've", "there're", "there's",
        "they'd", "they'd've", "they'll", "they're", "they've",
        "'twas",
        "we'd", "we'd've", "we'll", "we're", "we've",
        "what'll", "what're", "what's", "what've",
        "when's",
        "where'd", "where's", "where've",
        "who'd", "who'd've", "who'll", "who're", "who's", "who've",
        "why'll", "why're", "why's",
        "would've",
        "y'all", "y'all'll", "y'all'd've",
        "you'd", "you'd've", "you'll", "you're", "you've",
    ]

    base_words = nltk.corpus.stopwords.words('english')
    for negation in negations:
        if negation in base_words:
            base_words.remove(negation)

    # The empty string is included so tokens emptied by later character
    # stripping are filtered out together with the stop words.
    combined = base_words + contractions + [""] + ['us']
    return list(set(combined))

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching ``word``'s part-of-speech tag.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category.

    Args:
        word: A single token to tag.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``; ``wordnet.NOUN`` when the tag matches none of them.
    """
    tag = nltk.pos_tag([word])[0][1][0].lower()
    # Keys are the lower-cased first letter of Penn Treebank tags. Adjective
    # tags are JJ/JJR/JJS, so the key must be "j" -- an "a" key never matches
    # and would silently send every adjective to the NOUN fallback.
    tag_dict = {"j": wordnet.ADJ,
                "n": wordnet.NOUN,
                "v": wordnet.VERB,
                "r": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def _clean_review(text):
    """Normalize one review and return its lemmatized, stop-word-free tokens.

    Steps: lowercase; NFKD-normalize and drop non-ASCII characters; tokenize
    into words of two or more characters (optionally with one internal
    apostrophe); strip everything except letters, whitespace and apostrophes
    from each token; remove the custom stop words; strip "'s" and lemmatize
    each remaining token with its POS tag.

    Args:
        text: Raw review text. Non-string values (e.g. None/NaN coming from a
            NULL database field) are treated as an empty review.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8', 'ignore')

    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    tokenizer = nltk.RegexpTokenizer(r"\w+'?\w+")
    filtered_tokens = [re.sub(r"[^A-Za-z\s']", '', token) for token in tokenizer.tokenize(text)]

    # A set gives O(1) membership checks instead of scanning the list per token.
    stops = set(_create_stop_words())

    lemmas = []
    for token in filtered_tokens:
        if token in stops:
            continue
        filtered_token = re.sub("'s", '', token)
        if not filtered_token:
            # e.g. "90's" -> "'s" -> "": nothing left worth keeping.
            continue
        lemmas.append(wnl.lemmatize(filtered_token, pos=get_wordnet_pos(token)))
    return lemmas

In [None]:
def apply_on_column(data):
    """Tokenize every review in ``data`` and store the tokens next to the text.

    Runs ``_clean_review`` on each value of the 'review_text' column and
    writes the results to a new 'review_tokens' column of the same frame.

    Args:
        data: DataFrame with a 'review_text' column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying 'review_tokens'.
    """
    token_lists = data['review_text'].apply(_clean_review)
    data['review_tokens'] = token_lists
    return data

In [None]:
#Time one full pass of apply_on_column over the review sample
start = time.time()
apply_on_column(review_sample)
end = time.time()
dur = end - start
minutes, seconds = divmod(dur, 60)
#Report the row count and elapsed time as confirmation that the run completed
print('Processed {} instances in {} minutes {} seconds.\n'.format(len(review_sample.index), minutes, seconds))

In [None]:
#Show one review's raw text next to the tokens _clean_review() produced for it
example_row = review_sample.loc[24]
print('Full review:\n\n{}'.format(example_row['review_text']))
print('\n\n Tokenized review: \n\n{}'.format(example_row['review_tokens']))