In [1]:
import json
import os 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import math
from tqdm import tqdm 
from rake_nltk import Rake
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy

In [2]:
# Column-oriented containers; they are turned into DataFrames and written to CSV
# at the end of the notebook.

# Per-keyword table. "user id" starts as an empty set and is converted to a list later.
keyword_statistic = {
    "keywords": [],
    "review frequency": [],
    "user id": set(),
}

# Per-user table: one entry per user in each of the three parallel lists.
user_statistic = {"id": [], "keywords": [], "review count": []}

# Lookup tables filled while scanning the reviews.
user_keyword = dict()   # user id -> set of keywords
keyword_user = dict()   # keyword -> set of user ids

In [3]:
def create_dict_of_reviews_info(city_review_dir):
    """Merge every review JSON file found in ``city_review_dir`` into one dict.

    Args:
        city_review_dir: Path of the directory holding one JSON file per
            restaurant. Each file is expected to contain a JSON object
            (presumably review id -> review info; verify against the data).

    Returns:
        A single dict containing the entries of all files. If the same key
        occurs in several files, the file that sorts last by name wins.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which would
    # make the merge order (and therefore dict order / duplicate resolution)
    # differ between machines.
    restaurant_files = sorted(os.listdir(city_review_dir))
    reviews_info = {}  # merged content of all review files

    for file_name in tqdm(restaurant_files, total=len(restaurant_files)):
        if file_name.startswith('.'):
            continue  # skip hidden entries such as macOS .DS_Store

        file_path = os.path.join(city_review_dir, file_name)
        # Use a context manager so the handle is closed even if parsing fails.
        with open(file_path, encoding='utf-8') as review_file:
            reviews_info.update(json.load(review_file))

    return reviews_info

In [4]:
# Accumulators filled by the keyword-extraction step.
review_keywords = {}     # review id -> {keyword: number of times it was extracted from that review}
document_frequency = {}  # keyword -> number of reviews in which it was extracted

city_review_dir = 'reviews_users/reviews_users/singapore/extracted_reviews'

# Keys are review ids; each value is a dict of review info with the keys
# ['date', 'text', 'rating', 'rated', 'photos', 'user_id'].
reviews_info = create_dict_of_reviews_info(city_review_dir)

100%|██████████| 984/984 [00:08<00:00, 117.62it/s]


In [5]:
# Show the field names of the first review only, as a quick look at the schema.
for id in reviews_info:
    print(list(reviews_info[id]))
    break

def load_stopwords(ifile):
    """Read a stopword file (one entry per line) and return the entries as a set.

    Leading and trailing whitespace is stripped from every line; blank lines
    therefore contribute the empty string.
    """
    stopwords = set()
    with open(ifile, 'r') as handle:
        for raw_line in handle:
            stopwords.add(raw_line.strip())
    return stopwords

def word_only_tokenizer(txt):
    """Tokenize ``txt`` with NLTK's wordpunct tokenizer and keep alphabetic tokens only.

    Tokens containing digits or punctuation (anything failing ``str.isalpha``)
    are dropped.
    """
    return list(filter(str.isalpha, nltk.tokenize.wordpunct_tokenize(txt)))

def init_raker_nltk(stop_file='Stopwords.txt', word_tokenizer=word_only_tokenizer):
    """Build a ``Rake`` keyword extractor.

    Args:
        stop_file: Path of the stopword file, one stopword per line.
        word_tokenizer: Callable used by Rake to split text into words.

    Returns:
        A ``Rake`` instance configured with the loaded stopwords and tokenizer.
    """
    return Rake(
        stopwords=load_stopwords(stop_file),
        word_tokenizer=word_tokenizer,
    )

def ext_keywords_rake_nltk(text, raker_nltk):
    """Run the given extractor on ``text`` and return its ranked phrases.

    The extractor is fed the text first, then asked for its ranked phrases with
    scores; that result (a list of tuples, per the callers' usage) is returned
    unchanged.
    """
    raker_nltk.extract_keywords_from_text(text)
    ranked_phrases = raker_nltk.get_ranked_phrases_with_scores()
    return ranked_phrases

def extract_keywords_from_review(review_info, raker_nltk):
    """Extract ranked keyword phrases from the ``'text'`` field of one review.

    Returns whatever ``ext_keywords_rake_nltk`` returns for that text
    (a list of tuples).
    """
    review_text = review_info['text']
    return ext_keywords_rake_nltk(review_text, raker_nltk)


def extract_and_count_keywords_for_city(reviews_info, raker_nltk):
    """Extract keywords for every review and update the module-level counters.

    Side effects (nothing is returned):
      * ``review_keywords[review_id]`` is replaced by a dict mapping each
        extracted keyword to the number of times it was extracted from that review.
      * ``document_frequency[keyword]`` is incremented once per review in which
        the keyword appears.

    NOTE(review): ``document_frequency`` is not reset here, so calling this twice
    without re-initialising it counts every review twice.
    """
    for review_id, review_info in list(reviews_info.items()):
        ranked_phrases = extract_keywords_from_review(review_info, raker_nltk)

        per_review_counts = {}
        review_keywords[review_id] = per_review_counts

        for _score, phrase in ranked_phrases:
            # First sighting of this phrase in this review: count the review once.
            if phrase not in per_review_counts:
                document_frequency[phrase] = document_frequency.get(phrase, 0) + 1

            per_review_counts[phrase] = per_review_counts.get(phrase, 0) + 1
        


['date', 'text', 'rating', 'rated', 'photos', 'user_id']


In [6]:
# Build the RAKE extractor from the project stopword list and the alphabetic-only tokenizer.
raker_nltk = init_raker_nltk(
    stop_file='Stopwords.txt',
    word_tokenizer=word_only_tokenizer,
)

# Fills the module-level review_keywords and document_frequency dicts in place.
extract_and_count_keywords_for_city(reviews_info, raker_nltk)

In [7]:
# Thresholds for keyword selection.
MIN_DOCUMENT_FREQUENCY = 4  # keywords found in fewer reviews than this are ignored
MIN_AVERAGE_TFIDF = 0.08    # keywords whose average TF-IDF is below this are dropped

set_of_keywords = set()

TFIDF_Average = {} # keyword : (sum of TFIDF / document frequency)

total_reviews = len(review_keywords)

for keyword_counts in review_keywords.values():
    if len(keyword_counts) == 0:
        continue

    for kw, fre in keyword_counts.items():
        doc_freq = document_frequency[kw]
        if doc_freq < MIN_DOCUMENT_FREQUENCY:
            continue
        # TF: share of this keyword among the distinct keywords of the review.
        TF = fre / len(keyword_counts)
        # IDF: log2 of (number of reviews / number of reviews containing the keyword).
        IDF = math.log(total_reviews / doc_freq, 2)
        # Dividing each term by doc_freq turns the running sum into an average.
        TFIDF_Average[kw] = TFIDF_Average.get(kw, 0) + TF * IDF / doc_freq

sorted_keywords = sorted(TFIDF_Average.items(), key=lambda x: x[1])

# Walk from the highest average TF-IDF downwards and stop at the first keyword
# below the threshold; everything before it is kept.
for kw, average_tfidf in reversed(sorted_keywords):
    if average_tfidf < MIN_AVERAGE_TFIDF:
        break
    set_of_keywords.add(kw)

keyword_statistic["keywords"] = list(set_of_keywords)
# Rebuild (instead of appending to) the frequency column so that re-running this
# cell keeps it the same length as, and aligned with, the "keywords" column.
keyword_statistic["review frequency"] = [
    document_frequency[kw] for kw in keyword_statistic["keywords"]
]


In [8]:
# Debug check: number of per-user keyword lists collected so far
# (user_statistic["keywords"] is only filled in a later cell).
print(len(user_statistic["keywords"]))
def user_keyword_review_count(reviews_info):
    """Count reviews and selected keywords per user.

    Only keywords present in the module-level ``set_of_keywords`` are considered.

    Side effects:
      * ``keyword_user[kw]`` gains the id of every user who used ``kw``.
      * ``user_keyword[user_id]`` gains every selected keyword used by that user.

    Args:
        reviews_info: dict mapping review id -> review info dict; the
            ``'user_id'`` field of each entry is read.

    Returns:
        ``(user_keyword_count, user_review_count)`` where the first dict maps a
        user id to the number of (review, selected keyword) pairs for that user
        and the second maps a user id to the number of reviews by that user.
    """
    user_keyword_count = {}
    user_review_count = {}

    for review_id, review in reviews_info.items():
        author = review['user_id']
        user_review_count[author] = user_review_count.get(author, 0) + 1

        for kw in review_keywords[review_id]:
            if kw not in set_of_keywords:
                continue  # only selected keywords are tracked

            keyword_user.setdefault(kw, set()).add(author)
            user_keyword.setdefault(author, set()).add(kw)
            user_keyword_count[author] = user_keyword_count.get(author, 0) + 1

    return (user_keyword_count, user_review_count)

(user_keyword_count, user_review_count) = user_keyword_review_count(reviews_info)

# One list of user ids per keyword, in the same order as keyword_statistic["keywords"].
# The column is rebuilt by assignment (not appended to) so that re-running this
# cell does not duplicate rows and break the alignment with the other columns.
keyword_statistic["user id"] = [
    list(keyword_user[kw]) for kw in keyword_statistic['keywords']
]

0


In [9]:
city = "singapore"

# Close the split file deterministically instead of leaking the handle.
with open('users_split_1.json', encoding='utf-8') as split_file:
    dt = json.load(split_file)

# Collect the user ids of every split for the chosen city, in test, train, val order.
# The list is reset first so that re-running this cell does not duplicate ids.
user_statistic["id"] = []
for split_name in ('test', 'train', 'val'):
    user_statistic["id"].extend(dt[city][split_name])

In [10]:
# Rebuild both columns from scratch so they always have exactly one entry per
# user id, even when this cell is executed more than once.

# Users that wrote no review in the loaded data get a count of 0 (mirrors the
# empty-list fallback used for keywords below) instead of raising KeyError.
user_statistic["review count"] = [
    user_review_count.get(user_id, 0) for user_id in user_statistic["id"]
]

# Users without any selected keyword get an empty list.
user_statistic["keywords"] = [
    list(user_keyword.get(user_id, ())) for user_id in user_statistic["id"]
]

print(len(user_statistic["id"]))
print(len(user_statistic["keywords"]))

0
2171
2171


In [11]:
print(user_statistic.keys())
df1 = pd.DataFrame(user_statistic)
df2 = pd.DataFrame(keyword_statistic)

# to_csv does not create missing parent directories; make sure the target exists.
os.makedirs('data_csv', exist_ok=True)

df1.to_csv(r'data_csv/user_statistic.csv', header=True)
df2.to_csv(r'data_csv/keyword_statistic.csv', header=True)

dict_keys(['id', 'keywords', 'review count'])
