In [None]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook
# Register tqdm with pandas so that Series/DataFrame objects gain the
# `progress_apply` method used by the review-processing cells in this notebook.
# NOTE(review): `tqdm_notebook` is reported as deprecated by recent tqdm
# releases in favour of `tqdm.notebook.tqdm` — confirm against the installed version.
tqdm_notebook().pandas()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

# Yelp businesses

In [None]:
# Load the business records; lines=True makes pandas read the file as one
# JSON object per line.
dfb = pd.read_json('data/business.json', lines=True)

In [None]:
# Print pandas' summary of the business frame (columns, dtypes, memory usage).
dfb.info()

In [None]:
# Display the first row as a sample of the record layout.
dfb.head(1)

In [None]:
# Category names that mark a business as relevant.
relevant_categories = {'Restaurants', 'Food', 'Nightlife', 'Coffee & Tea'}

def check_relevance(ctgs):
    """Return 1 if a comma-separated category string names a relevant category.

    Args:
        ctgs: The business's categories as a single comma-separated string.
            Any non-string value (e.g. None or NaN) is treated as having
            no categories.

    Returns:
        1 when at least one listed category is in ``relevant_categories``,
        otherwise 0.
    """
    if not isinstance(ctgs, str):
        return 0
    # Split on the bare comma and strip each name so that "A,B" and "A ,  B"
    # are handled the same way as the usual "A, B" form.
    names = {name.strip() for name in ctgs.split(',')}
    return 0 if relevant_categories.isdisjoint(names) else 1

In [None]:
# Tag every business with the 0/1 flag computed from its categories value.
dfb['relevance'] = dfb['categories'].map(check_relevance)

In [None]:
# Print the row count before and after keeping only the businesses flagged
# as relevant, and collect the ids of the businesses that remain.
print(len(dfb))
dfb = dfb[dfb['relevance'] == 1]
b_ids = set(dfb['business_id'])
print(len(dfb))

In [None]:
# Persist the filtered businesses; pandas infers gzip compression from the
# '.gz' suffix of the path.
dfb.to_pickle('data/YelpBusiness.pkl.gz')

# Yelp reviews clean and save

In [None]:
# Load the review records; lines=True makes pandas read the file as one
# JSON object per line.
dfr = pd.read_json('data/review.json', lines=True)

In [None]:
# Restrict the reviews to businesses whose id is in b_ids, printing the row
# count before and after the filter.
print(len(dfr))
review_in_scope = dfr['business_id'].isin(b_ids)
dfr = dfr[review_in_scope]
print(len(dfr))

In [None]:
# Print the full per-column summary including non-null counts.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0 (where passing it raises TypeError).
dfr.info(verbose=True, show_counts=True)

In [None]:
stop_words = set(stopwords.words('english'))
# Retained as a module-level name in case other cells reference it.
punct = list(string.punctuation)
# Translation table that deletes every character of string.punctuation in a
# single pass over the text (instead of one str.replace pass per character).
_punct_table = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Strip punctuation, lowercase, and drop stop words from a review.

    Args:
        text: Raw review text. Non-string values are not processed.

    Returns:
        The remaining words joined by single spaces, or None when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return None
    # NOTE(review): punctuation is deleted before the stop-word lookup, so a
    # contraction such as "don't" is compared as "dont"; any stop-word entry
    # that contains an apostrophe can therefore never match — confirm that
    # this is intended.
    words = text.translate(_punct_table).lower().split()
    return ' '.join(w for w in words if w not in stop_words)

In [None]:
# Run clean_text over every review's text, showing a tqdm progress bar, and
# store the result in a new 'clean_text' column.
dfr['clean_text'] = dfr['text'].progress_apply(clean_text)

In [None]:
# Show how many reviews carry each distinct value of the 'stars' column.
dfr.stars.value_counts()

In [None]:
# Remove the identifier and raw-text columns, then discard rows that are
# missing business_id, stars or clean_text; the row count is printed before
# and after the null filter.
dfr = dfr.drop(columns=['user_id', 'review_id', 'text'])
print(len(dfr))
required_cols = ['business_id', 'stars', 'clean_text']
dfr = dfr.dropna(subset=required_cols)
print(len(dfr))

In [None]:
# Binary sentiment label: 1 for reviews with 4 or 5 stars, 0 for everything
# else. The vectorized `isin` gives the same 0/1 integers as a per-row Python
# lambda, computed in one pass.
dfr['sentiment'] = dfr['stars'].isin([4, 5]).astype('int64')

In [None]:
# Persist the cleaned reviews; pandas infers gzip compression from the
# '.gz' suffix of the path.
dfr.to_pickle('data/YelpReview.pkl.gz')

# Yelp reviews stem and save

In [None]:
ps = PorterStemmer()
def stem_cleaned_text(text):
    """Stem each whitespace-separated token of ``text`` with the Porter stemmer.

    Returns the stemmed tokens joined by single spaces, or None when ``text``
    is not a string.
    """
    if not isinstance(text, str):
        return None
    stems = map(ps.stem, text.split())
    return ' '.join(stems)

In [None]:
# Stem every cleaned review, showing a tqdm progress bar, and store the
# result in a new 'stemmed_text' column.
dfr['stemmed_text'] = dfr['clean_text'].progress_apply(stem_cleaned_text)

In [20]:
# Drop the unstemmed 'clean_text' column so it is not written to the stemmed
# output file saved after this cell.
dfr = dfr.drop(columns='clean_text')

In [None]:
# Persist the stemmed reviews; pandas infers gzip compression from the
# '.gz' suffix of the path.
dfr.to_pickle('data/YelpReviewStemmed.pkl.gz')