In [1]:
import json
import pandas as pd
import numpy as np
import os

from typing import Optional, List, Dict

In [2]:
def read_data(path: str, nrows: Optional[int] = None) -> pd.DataFrame:
    """Load a JSON Lines file (one JSON record per line) into a DataFrame.

    Args:
        path: Location of the JSON Lines file.
        nrows: Maximum number of lines to read. ``None`` reads the whole file.

    Returns:
        A DataFrame with one row per record that was read.
    """
    # Decode explicitly as UTF-8 (the encoding JSON text is specified to use)
    # so the result does not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return pd.read_json(f, orient="records", lines=True, nrows=nrows)

In [3]:
# Read the first 1000 records of each dataset file, in the same order as the
# names on the left-hand side of the assignment.
business_df, checkin_df, review_df, tip_df, user_df = (
    read_data(dataset_path, 1000)
    for dataset_path in (
        "../data/yelp_academic_dataset_business.json",
        "../data/yelp_academic_dataset_checkin.json",
        "../data/yelp_academic_dataset_review.json",
        "../data/yelp_academic_dataset_tip.json",
        "../data/yelp_academic_dataset_user.json",
    )
)

In [4]:
# Preview the first five loaded reviews.
review_df.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
# preprocessing

import re

def normalize(text):
    """
    Lower-case the text, turn every run of characters other than a-z and 0-9
    into a single space, and return the resulting list of tokens.
    """
    # split() with no separator already ignores leading/trailing whitespace,
    # so no separate trimming step is required before tokenising.
    return re.sub(r"[^a-z0-9]+", " ", text.lower()).split()

def count_words(corpus):
    """
    Tally how often each token occurs across all documents of a corpus.

    corpus is an iterable of documents, each of which is an iterable of tokens.
    Returns a dict mapping every token to its total number of occurrences.
    """
    tally = {}
    for tokens in corpus:
        for token in tokens:
            tally[token] = tally.get(token, 0) + 1
    return tally


# Tokenise every review with the shared normalize() helper so the cleaning
# rules are defined in exactly one place. na_action="ignore" leaves missing
# texts as NaN instead of calling normalize() on them.
review_df["text_normalized"] = review_df["text"].map(normalize, na_action="ignore")

# Corpus-wide token counts as a one-column frame, most frequent token first.
vocab = count_words(review_df.text_normalized)
vocab_df = pd.DataFrame.from_dict(vocab, orient="index").sort_values(by=0, ascending=False)
vocab_df.columns = ["freq"]

In [9]:

def calculate_tfidf(corpus: List[List[str]]) -> Dict[str, float]:
    """Compute one TF-IDF style score per word of a tokenised corpus.

    For every word the score is ``tf * idf`` where

    * ``tf`` is the word's count divided by the length of the first document
      (in corpus order) that contains it, and
    * ``idf`` is ``log(N / (1 + df))``, with ``N`` the number of documents and
      ``df`` the number of documents that contain the word.

    Args:
        corpus: Sized, re-iterable collection of documents, each a list of
            tokens. It is iterated twice and ``len()`` is taken.

    Returns:
        Dictionary mapping each word to its score. Empty documents contribute
        no words. Scores are negative when ``1 + df`` exceeds ``N``.
    """
    n_docs = len(corpus)

    # Document frequency in a single pass: each document contributes at most
    # one count per distinct word it contains.
    doc_freq: Dict[str, int] = {}
    for doc in corpus:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # NOTE(review): only the first document containing a word determines its
    # term frequency; later documents never update the score. Kept as-is
    # because changing it would shift every score -- confirm whether a
    # corpus-level aggregate was intended.
    tfidf: Dict[str, float] = {}
    for doc in corpus:
        # Per-document token counts, so each word is counted once rather than
        # rescanning the document for every distinct word.
        counts: Dict[str, int] = {}
        for word in doc:
            counts[word] = counts.get(word, 0) + 1

        for word, count in counts.items():
            if word not in tfidf:
                tf = count / len(doc)
                idf = np.log(n_docs / (1 + doc_freq[word]))
                tfidf[word] = tf * idf

    return tfidf


In [10]:
# Score every word once, then align the scores to vocab_df by its index labels.
word_scores = calculate_tfidf(review_df.text_normalized)
vocab_df["tfidf"] = vocab_df.index.map(word_scores)

In [11]:
# Rank words by score, highest first. Rebinding the name instead of using
# inplace=True keeps the step a plain expression that is safe to re-run.
vocab_df = vocab_df.sort_values(by="tfidf", ascending=False)

# Words scoring below 0.02 are flagged as stopwords. The comparison already
# yields a boolean column, so no np.where wrapper is needed.
vocab_df["is_stopword"] = vocab_df["tfidf"] < 0.02

In [14]:
# Words that are not flagged as stopwords and occur at least 30 times.
is_content_word = ~vocab_df["is_stopword"]
is_frequent = vocab_df["freq"] >= 30
vocab_df.loc[is_content_word & is_frequent].index

Index(['beef', 'reasonable', 'parking', 'pork', 'walk', 'prices', 'stopped',
       'coffee', 'old', 'lunch',
       ...
       'thought', 'part', 'review', 'line', 'after', 'being', 'front', 'check',
       'to', 'enjoyed'],
      dtype='object', length=315)