In [None]:
import json
import pandas as pd
import re 
import glob 
import copy 
import os 
import nltk
import unicodedata 
from collections import Counter
from nltk.collocations import *
import numpy as np 

In [None]:
# --- Uncomment below if needed ---
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
#!python -m spacy download en_core_web_sm

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()
from bs4 import BeautifulSoup
from html import unescape

In [None]:
%%time 
### Creating reviews dataframe 
data_file = open("yelp_academic_dataset_review.json",encoding='utf-8')
data = []
for line in data_file:
    data.append(json.loads(line))
reviewsdf = pd.DataFrame(data)
data_file.close()

In [None]:
%%time
### Creating businesses dataframe 
data_file = open("yelp_academic_dataset_business.json",encoding='utf-8')
data = []
for line in data_file:
    data.append(json.loads(line))
businessdf = pd.DataFrame(data)
data_file.close()

In [None]:
# original dataframes
reviewsdf_org = reviewsdf
businessdf_org = businessdf

In [None]:
%%time
city_and_state = businessdf[['business_id', 'city', 'state']]
reviewsdf = reviewsdf.merge(city_and_state, on='business_id')
reviewsdf

## Using smaller version of reviewsdf with just 30,000 businesses in California for faster processing and querying

In [None]:
reviewsdf = reviewsdf[reviewsdf['state'] == 'CA'].reset_index(drop=True)
reviewsdf

In [None]:
reviewsdf = reviewsdf.head(30000)
reviewsdf

### Generate hashtags from reviews df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
#Adjust mindf parameter
mindf = int(len(reviewsdf) * .001)
tfidf = TfidfVectorizer(sublinear_tf=True,
               analyzer='word',
#                max_features=2000,
               tokenizer=word_tokenize,
               stop_words=nltk.corpus.stopwords.words("english"),min_df=5)

In [None]:
text = list(reviewsdf['text'])
text = [re.sub(r'[^A-Za-z]+'," ",document) for document in text]

In [None]:
%%time
hashtagdf = pd.DataFrame(tfidf.fit_transform(text).toarray())

In [None]:
hashtagdf.columns = tfidf.get_feature_names()

In [None]:
hashtagdfsum = hashtagdf.sum()
tfidf_scores = [(i,hashtagdfsum[i]) for i in hashtagdfsum.index]
tfidf_scores.sort(key=lambda x : x[1],reverse=True)

In [None]:
##REMOVE?
lower_threshold = int(len(tfidf_scores) * .08)
upper_threshold = int(len(tfidf_scores) * .60)
scores = tfidf_scores[lower_threshold:]
tfidf_dict = {item[0] : item[1] for item in scores}

In [None]:
#Maybe add ability to get N most useful tags 
def generate_tags(text):
    tags = set()
    for s in text.split(' '):
        if s in tfidf_dict:
            tags.add(s)
    return list(tags)[:5]

In [None]:
reviewsdf['tags'] = reviewsdf['text'].apply(generate_tags)

In [None]:
reviewsdf.head(5)

In [None]:
# trigram_measures = nltk.collocations.TrigramAssocMeasures()


In [None]:
# finder = BigramCollocationFinder.from_words()
# finder1 = finder.apply_freq_filter(1)
# finder.nbest(bigram_measures.pmi, 1500)
# #Maybe add bigrams to each string, so they end upo in freq dict
# # list(bigrams)

# Create graph data

1 ) Create  weighted edges between business and associated hashtags (edge weight is number of reviews that contained particular hashtag?)

2) Create weighted edges between hashtags that occur in the same business (or review ?) weight = num co coccurances in reviews?

Drop unecessary columns from reviewsdf 

In [None]:
reviewsdf = reviewsdf[['review_id','text','business_id','city','state','tags']]
reviewsdf

In [None]:
###Create graph between business and associated hashtags 
from collections import defaultdict
businessAndTagGraph = defaultdict(lambda: defaultdict(int))
tagsGraph = defaultdict(lambda: defaultdict(int))

In [None]:
%%time
for review in reviewsdf.itertuples():
    businessId = review[3]
    tags = review[6]
    for i in range(len(tags)): 
        #Update Edge weights (number of times hashtag occurs in review)
        currTag = tags[i]
        businessAndTagGraph[businessId][currTag] += 1
        #Create edges and weights for tags in same review 
        for j in range(len(tags)):
            if i != j:
                 tagsGraph[currTag][tags[j]] += 1


#### Convert business and tags graph into dataframe/csv format

In [None]:
#businessAndTagsdf = pd.DataFrame(columns=['business_id','hashtag','count'])
idcol = []
hashtagcol = []
countcol = []
ignore = set(['index_names','column_names','index','columns','data'])
for businessid in businessAndTagGraph.keys():
    if businessid in ignore:
        continue
    for vals in businessAndTagGraph[businessid].items():
        idcol.append(businessid)
        hashtagcol.append(vals[0])
        countcol.append(vals[1])
        #businessAndTagsdf.append({'business_id':businessid,'hashtag':vals[0],count:vals[1]})


In [None]:
businessAndTagsdf = pd.DataFrame(columns = ['business_id','hashtag','count'])
businessAndTagsdf['business_id'] = idcol
businessAndTagsdf['hashtag'] = hashtagcol
businessAndTagsdf['count'] = countcol

In [None]:
#Resulting df 
businessdf = businessdf[['business_id','name','address','city','state','latitude','longitude','stars','categories']]

In [None]:
businessAndTagsdf = businessAndTagsdf.merge(businessdf,on='business_id')
businessAndTagsdf

##### Convert tags graph to df/csv format

In [None]:
tag1 = []
tag2 = []
sharedOccurances = []
for tag in tagsGraph.keys():
    for vals in tagsGraph[tag].items():
        tag1.append(tag)
        tag2.append(vals[0])
        sharedOccurances.append(vals[1])

In [None]:
tagsDf = pd.DataFrame(columns = ['tag1','tag2','sharedOccurances'])
tagsDf['tag1'] = tag1
tagsDf['tag2'] = tag2
tagsDf['sharedOccurances']= sharedOccurances

In [None]:
tagsDf

#### Save graphs to csv

In [None]:
businessAndTagsdf.to_csv('businessAndTags.csv')
tagsDf.to_csv('tagsGraph.csv')

# Import data into Neo4j

In [None]:
from neo4j import GraphDatabase
import neo4j

In [None]:
uri = "bolt://localhost:7666"
username = "neo4j"
password = "password"

def create_neo4j_session(uri, username, password):
    try:
        driver = GraphDatabase.driver(uri, auth=(username, password))
        session = driver.session()
        return session
    except Exception as e:
        print(f"Failed to create Neo4j session: {e}")
        return None

session = create_neo4j_session(uri, username, password)

In [None]:
# Deleting all existing constraints
deleting_constraints = "CALL apoc.schema.assert({}, {})"
session.run(deleting_constraints)

# Creating constraints for our graph
constraint_1 = "CREATE CONSTRAINT FOR (b:Business) REQUIRE b.id IS UNIQUE;"
constraint_2 = "CREATE CONSTRAINT FOR (t:tag) REQUIRE t.name IS UNIQUE;"

session.run(constraint_1)
session.run(constraint_2)

In [None]:
%%time
# Creating tag nodes and SHARED_TAGS relationship between tags with shared occurences
create_tags = """
LOAD CSV WITH HEADERS FROM 'file:///tagsGraph.csv' AS row  
WITH row
CALL {
WITH row
MERGE (t1:tag {id: row.tag1})
MERGE (t2:tag {id: row.tag2})
MERGE (t1)-[r:SHARED_TAGS]-(t2)
SET r.shared_count = row.sharedOccurances
} IN TRANSACTIONS"""

# takes about 30-45 minutes for version of reviewsdf with 30,000 rows from just California
session.run(create_tags)

In [None]:
%%time
# Creating Business nodes and HAS_TAG relationship between the business and every tag that business has had in their reviews
create_business = """
LOAD CSV WITH HEADERS FROM 'file:///businessAndTags.csv' AS row  
WITH row
CALL {
WITH row
MERGE (b:Business {id: row.business_id})
SET b.name =  row.name, 
    b.address = row.address,
    b.city = row.city,
    b.state = row.state,
    b.latitude = row.latitude,
    b.longitude = row.longitude,
    b.rating = row.stars,
    b.categories = split(row.categories, ', ')
MERGE (t:tag {name: row.hashtag})
MERGE (b)-[r:HAS_TAG]->(t)
SET r.tag_count = row.count
} IN TRANSACTIONS
"""

session.run(create_business)

## Applications

In [38]:
import redis
import time

# Connect to Redis container
r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

In [82]:
r.flushall()

True

### Application 1: Finding the n closest businesses that share a tag with the given business id
- Given a businessid and n, which is the number of businesses the user wants to query
- Finds businesses that share at least one common tag with the given business id AND has at least one matching category
- Calculates the distance between all of those businesses and the given business, and finds the n closest ones

In [83]:
def closest_similar_businesses(business_id, n):
    app1_query = """MATCH (b1:Business) - [ht1:HAS_TAG] -> (t:tag) <- [ht2:HAS_TAG] - (b2:Business)
    WHERE b1.id = '""" + business_id + """' AND b1 <> b2 AND ANY(category IN b2.categories WHERE category IN b1.categories)
    WITH b1, b2, t
    ORDER BY point.distance(point({longitude: toFloat(b1.longitude), latitude: toFloat(b1.latitude)}), point({longitude: toFloat(b2.longitude), latitude: toFloat(b2.latitude)})) ASC
    WITH DISTINCT b2, b1
    LIMIT """ + str(n) + """
    RETURN b2, point.distance(point({longitude: toFloat(b1.longitude), latitude: toFloat(b1.latitude)}), point({longitude: toFloat(b2.longitude), latitude: toFloat(b2.latitude)})) AS distance
    """
    query_inputs = business_id + " " + str(n)
    cached_data = r.get(query_inputs)
    # if data is in cache already, retrieve query result from cache and return it
    if cached_data is not None:
        return cached_data
    
    given_business_name = session.run(f"""MATCH (b:Business) WHERE b.id = "{business_id}" RETURN b.name""") # gets business name
    results = session.run(app1_query) # gets the result of app1_query
    
    # formulates the output to user from the results of the query
    output = str(n) + " closest similar businesses to " + given_business_name.value()[0] + ": \n\n"
    for ind, x in enumerate(results):
        output = output + str(ind + 1) + ": " + x.values()[0].get('name') + ", which is " + str(round(x.values()[1] / 1000, 2)) + " km(s) away" + "\n"
        
    # store query result in cache and return the query result
    r.set(query_inputs, output)
    return output

In [84]:
# demo of 5 closest similar businesses to Los Padres National Forest
print(closest_similar_businesses("B5XSoSG3SfvQGtKEGQ1tSQ", 5))

5 closest similar businesses to Los Padres National Forest: 

1: Run Montecito-Summerland, which is 21.06 km(s) away
2: Surf Happens, which is 21.55 km(s) away
3: Carpinteria Bluffs Nature Preserve, which is 23.48 km(s) away
4: Franceschi Park, which is 24.27 km(s) away
5: Paragon Jiu Jitsu and Kickboxing, which is 25.36 km(s) away



In [85]:
# demo of 3 closest similar businesses to Hibachi Steak House & Sushi Bar
print(closest_similar_businesses("gebiRewfieSdtt17PTW6Zg", 3))

3 closest similar businesses to Hibachi Steak House & Sushi Bar: 

1: Mad Dogs, which is 0.03 km(s) away
2: Barbarians, which is 0.04 km(s) away
3: Caffe Primo, which is 0.06 km(s) away



In [86]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    closest_similar_businesses("gebiRewfieSdtt17PTW6Zg", 8)
    end = time.time()
    print(i + 1, ":", end - start)

1 : 0.10937023162841797
2 : 0.0010113716125488281
3 : 0.0015058517456054688
4 : 0.0
5 : 0.0010075569152832031


### Application 2: Find n businesses in a given city that is the most similar to a given business id
- Given a businessid, a city name, and n, which is the number of businesses the user wants to query
- Finds businesses in the given city that share the most number of tags with the given business
- Return the top n business names

In [87]:
def most_similar_in_city(business_id, city, n):
    app2_query = """MATCH (b1:Business) - [ht1:HAS_TAG] -> (t:tag) <- [ht2:HAS_TAG] - (b2:Business)
    WHERE b1 <> b2 AND b2.city = '""" + city + """' AND b1.id = '""" + business_id + """' AND ANY(category IN b2.categories WHERE category IN b1.categories)
    WITH b1, b2, count(DISTINCT t) AS matching_tags
    ORDER BY matching_tags DESC
    LIMIT """ + str(n) + """
    RETURN b2.name AS name, matching_tags
    """
    query_inputs = business_id + " " + city + " " + str(n)
    cached_data = r.get(query_inputs)
    # if data is in cache already, retrieve query result from cache and return it
    if cached_data is not None:
        return cached_data
    
    given_business_name = session.run(f"""MATCH (b:Business) WHERE b.id = "{business_id}" RETURN b.name""") # gets business name
    results = session.run(app2_query) # gets the result of app2_query
    
    # formulates the output to user from the results of the query
    output = str(n) + " most similar businesses to " + given_business_name.value()[0] + " in the city of " + city + ":\n\n"
    for ind, x in enumerate(results):
        output = output + str(ind + 1) + ": " + x.values()[0] + ", with " + str(x.values()[1]) + " matching tags\n"
        
    # store query result in cache and return the query result
    r.set(query_inputs, output)
    return output

In [88]:
# demo of 5 most similar businesses in Goleta to Hibachi Steak House & Sushi Bar (Santa Barbara)
print(most_similar_in_city("gebiRewfieSdtt17PTW6Zg", "Goleta", 5))

5 most similar businesses to Hibachi Steak House & Sushi Bar in the city of Goleta:

1: The Original Habit Burger Grill, with 262 matching tags
2: In-N-Out Burger, with 211 matching tags
3: Phamous Cafe, with 208 matching tags
4: Sushiya Express, with 198 matching tags
5: Cal Taco, with 191 matching tags



In [89]:
# demo of 10 most similar businesses in Santa Barbara to Pho Bistro (Isla Vista)
print(most_similar_in_city("bdfZdB2MTXlT6-RBjSIpQg", "Santa Barbara", 10))

10 most similar businesses to Pho Bistro in the city of Santa Barbara:

1: Santa Barbara Shellfish Company, with 315 matching tags
2: Cold Spring Tavern, with 213 matching tags
3: Dawn Patrol, with 203 matching tags
4: Hibachi Steak House & Sushi Bar, with 201 matching tags
5: Backyard Bowls, with 193 matching tags
6: Bluewater Grill - Santa Barbara, with 186 matching tags
7: Chase Restaurant, with 167 matching tags
8: Zen Yai Thai Cuisine, with 142 matching tags
9: Sakana Sushi Bar & Japanese, with 139 matching tags
10: Deep Sea Tasting Room, with 134 matching tags



In [90]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    most_similar_in_city("gebiRewfieSdtt17PTW6Zg", "Santa Barbara", 8)
    end = time.time()
    print(i + 1, ":", end - start)

1 : 0.11350846290588379
2 : 0.0015158653259277344
3 : 0.0010046958923339844
4 : 0.0
5 : 0.0


### Application 3 -  Find the N highest rated business in a city that contains atleast one specified hashtag from a list of provided hashtags 

In [91]:
def getNHighestRatedFromCity(city,hashtags,n,sort):
    
    whereClause = 'WHERE'
    for i in range(len(hashtags)):
        if i == len(hashtags) -1:
            whereClause += f" t.name = '{hashtags[i]}'"
        else:
            whereClause += f" t.name = '{hashtags[i]}' OR"
            
    query = f'''MATCH p=(b : Business{{city:\'{city}'}})-[r:HAS_TAG]-(t:tag) {whereClause} RETURN b ORDER BY
    b.rating {sort} LIMIT {n}'''
    
    query_inputs = "highestRatedFromCity" + city + " " + str(hashtags) + " " + str(n) + " " + str(sort) + " "
    cached_data = r.get(query_inputs)
    
    if cached_data is not None:
        return cached_data
    res = session.run(query)
    output  = showOutputA3(res)
    r.set(query_inputs, output)
    
    return output

#The raw output of function above gives full record info, this function is a cleaner way to output results
def showOutputA3(records):
    output = ""
    for r in records:
        name = r['b']['name']
        city = r['b']['city']
        rating = r['b']['rating']
        output += (f'Business Name: {name}, City: {city}, Rating: {rating}\n')
    return output


Here is an example of what the raw output of the function looks like

In [92]:
getNHighestRatedFromCity('Santa Barbara',['awful'],1,'DESC')

'Business Name: Terra Malia Designs, City: Santa Barbara, Rating: 5.0\n'

Here is an example of the five lowest rated businesses that has reviews containing the hashtag awful

In [93]:
res = getNHighestRatedFromCity('Santa Barbara',['awful'],5,'ASC')
print(res)

Business Name: China Bowl & State Street Cafe, City: Santa Barbara, Rating: 2.0
Business Name: Artistic Nails & Spa, City: Santa Barbara, Rating: 2.0
Business Name: India House, City: Santa Barbara, Rating: 2.0
Business Name: Sandpiper Lodge, City: Santa Barbara, Rating: 2.5
Business Name: Tonic Nightclub, City: Santa Barbara, Rating: 2.5



Here is an example of the five highest rated businesses that has reviews containing the hashtag awful 

In [94]:
res = getNHighestRatedFromCity('Santa Barbara',['awful'],5,'DESC')
print(res)

Business Name: The Dress, City: Santa Barbara, Rating: 5.0
Business Name: Terra Malia Designs, City: Santa Barbara, Rating: 5.0
Business Name: Taqueria Cuernavaca, City: Santa Barbara, Rating: 4.5
Business Name: Cold Spring Tavern, City: Santa Barbara, Rating: 4.5
Business Name: 805 Ink, City: Santa Barbara, Rating: 4.5



Here is an example of the five lowest rated businesses that has reviews which contain either the hashtag 'awful' or 'dead' (as you can see, it is different from the output above)

In [95]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    res = getNHighestRatedFromCity('Santa Barbara',['awful','dead'],5,'DESC')
    #showOutputA3(res)
    end = time.time()
    print(i + 1, ":", end - start)
print("\nOUTPUT:")
print(res)

1 : 0.00741267204284668
2 : 0.0005283355712890625
3 : 0.0005371570587158203
4 : 0.0005171298980712891
5 : 0.0005445480346679688

OUTPUT:
Business Name: The Dress, City: Santa Barbara, Rating: 5.0
Business Name: Terra Malia Designs, City: Santa Barbara, Rating: 5.0
Business Name: Taqueria Cuernavaca, City: Santa Barbara, Rating: 4.5
Business Name: Cold Spring Tavern, City: Santa Barbara, Rating: 4.5
Business Name: 805 Ink, City: Santa Barbara, Rating: 4.5



### Application 4: Find top N hashtags that are associated with a specific business id 


In [96]:
exampleBusinessId1 = 'qdpdUjdkEUpzYDdce2yZng'
exampleBusinessId2 = 'B5XSoSG3SfvQGtKEGQ1tSQ'

In [97]:
def topNHashtagsForBusiness(businessId,k):
    
    query_inputs = 'topKHashtagsPerBusiness'+ " " + businessId + " "+ str(k)
    query = f'''Match(b:Business{{id:'{businessId}'}})-[r:HAS_TAG]-(t)  return t,r.tag_count Order by r.tag_count DESC LIMIT {k}'''
    cached_data = r.get(query_inputs)
    
    if cached_data is not None:
        return cached_data
    
    res = session.run(query)
    output = showOutputA4(res)
    r.set(query_inputs,output)
    return output 

def showOutputA4(records):
    output = ''
    for r in records:
        tag = r['t']['name']
        count = r['r.tag_count']
        output += (f'Hashtag: {tag}, Count: {count}' + '\n')
    return output 

Here you can see an example of the top 10 hashtags associated with exampleBusinessId1 defined above

In [98]:
res = topNHashtagsForBusiness(exampleBusinessId1,10)
print(res)

Hashtag: turkey, Count: 8
Hashtag: subs, Count: 7
Hashtag: sub, Count: 7
Hashtag: soda, Count: 7
Hashtag: plaza, Count: 6
Hashtag: squaw, Count: 5
Hashtag: roast, Count: 5
Hashtag: classic, Count: 4
Hashtag: walls, Count: 4
Hashtag: vintage, Count: 4



Here you can see an example of the top 10 hashtags associated with exampleBusinessId2 defined above as well as a demonstration of speed up from cached queries 

In [99]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    res = topNHashtagsForBusiness(exampleBusinessId2,10)
    end = time.time()
    print(i + 1, ":", end - start)
print('\nResult: ')
print(res)

1 : 0.00600123405456543
2 : 0.0009984970092773438
3 : 0.0
4 : 0.0010404586791992188
5 : 0.0

Result: 
Hashtag: national, Count: 5
Hashtag: forest, Count: 3
Hashtag: hikes, Count: 2
Hashtag: areas, Count: 2
Hashtag: follow, Count: 1
Hashtag: fault, Count: 1
Hashtag: flowing, Count: 1
Hashtag: headed, Count: 1
Hashtag: miles, Count: 1
Hashtag: shade, Count: 1



### Application 5 ( Hashtag Relationships): Find N most common hashtags that occur in the same reviews as provided hashtag 

Return restaurants that have reviews containing these hashtags

In [100]:
def findBusinessesWithNMostCommonTags(hashtag,n,k):
    query_inputs = 'findBusinessesWithMostCommonTags' + 'x' + hashtag + ' ' + str(n) + str(k)
    query = f'''
    Match p = (t1:tag{{id:'{hashtag}'}})-[r1:SHARED_TAGS]-(t2:tag)  
    Match (b:Business)-[HAS_TAG]-(t3:tag{{name:t2.id}})
    RETURN b,t2,t3,r1.shared_count ORDER BY r1.shared_count DESC Limit {k}
    '''
    cached_data = r.get(query_inputs)
    if cached_data is not None:
        return cached_data
    
    res = session.run(query)
    output = showOutputA5(res)
    r.set(query_inputs,output)
    return output
def showOutputA5(records):
    output = ''
    for r in records:
        name = r['b']['name']
        commonTag  = r['t2']['id']
        tagCount = r['r1.shared_count']
        output += (f'Business: {name} | Tag Name: {commonTag} | Tag Count: {tagCount}' + '\n')
    return output

In [101]:
res = (findBusinessesWithNMostCommonTags('awful',3, 10))
print(res)

Business: Cava Restaurant & Bar | Tag Name: hold | Tag Count: 3
Business: Santa Barbara Humane  | Tag Name: hold | Tag Count: 3
Business: Carpinteria 76 | Tag Name: hold | Tag Count: 3
Business: Will Nelson Fitness | Tag Name: hold | Tag Count: 3
Business: ParadICE Hawaiian Shave Ice & Ice Cream | Tag Name: hold | Tag Count: 3
Business: Loose Pooch Dog Club | Tag Name: hold | Tag Count: 3
Business: Gossip | Tag Name: hold | Tag Count: 3
Business: Seaside Gardens | Tag Name: hold | Tag Count: 3
Business: Wheel Fun Rentals | Tag Name: hold | Tag Count: 3
Business: Bluewater Grill - Santa Barbara | Tag Name: hold | Tag Count: 3



In [102]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    res = (findBusinessesWithNMostCommonTags('smelly',30, 5))
    end = time.time()
    print(i + 1, ":", end - start)
print('\nResult:')
print(res)

1 : 0.010545492172241211
2 : 0.0010008811950683594
3 : 0.0009882450103759766
4 : 0.0
5 : 0.0009999275207519531

Result:
Business: Santa Barbara Shellfish Company | Tag Name: abalone | Tag Count: 1
Business: Santa Barbara Certified Farmers Market | Tag Name: baguette | Tag Count: 1
Business: Pho Bistro | Tag Name: baguette | Tag Count: 1
Business: Farmer Boy | Tag Name: baguette | Tag Count: 1
Business: Phamous Cafe | Tag Name: baguette | Tag Count: 1



### Application 6: Given a certain category, what are the top hashtags for that category

In [103]:
def top_hashtags_per_category(category):
    query = f"""MATCH (b:Business)-[:HAS_TAG]->(t:tag)
WHERE '{category}' IN b.categories
RETURN t, COUNT(t)
ORDER BY COUNT(t) DESC
LIMIT 10"""
    
    
    query_inputs = f"{category}"
    cached_data = r.get(query_inputs)

    if cached_data is not None:
        return cached_data
    
    result = session.run(query)
    
    output = f"Top Hashtags for {category} Category:\n\n"
    
    for ind, record in enumerate(result):
        tag = record['t']['name']
        count = record['COUNT(t)']
        output = output + f'Hashtag: {tag}, Count: {count}\n'

    r.set(query_inputs, output)
    return output

In [104]:
print(top_hashtags_per_category('Sandwiches'))

Top Hashtags for Sandwiches Category:

Hashtag: turkey, Count: 7
Hashtag: crunchy, Count: 7
Hashtag: support, Count: 7
Hashtag: classic, Count: 6
Hashtag: picking, Count: 6
Hashtag: standing, Count: 6
Hashtag: employee, Count: 5
Hashtag: toasted, Count: 5
Hashtag: particularly, Count: 5
Hashtag: purchased, Count: 5



In [105]:
print(top_hashtags_per_category('Bakeries'))

Top Hashtags for Bakeries Category:

Hashtag: stand, Count: 5
Hashtag: wide, Count: 5
Hashtag: cookies, Count: 5
Hashtag: support, Count: 5
Hashtag: daily, Count: 5
Hashtag: treats, Count: 5
Hashtag: croissant, Count: 4
Hashtag: cookie, Count: 4
Hashtag: become, Count: 4
Hashtag: ton, Count: 4



In [106]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    top_hashtags_per_category('Restaurants')
    end = time.time()
    print(i + 1, ":", end - start)

1 : 0.021537303924560547
2 : 0.0015077590942382812
3 : 0.0
4 : 0.0010225772857666016
5 : 0.0


###  Application 7: N most common hashtags in a particular city

In [107]:
def n_common_city_hashtags(city, N):
    query = f"""MATCH (b:Business)-[:HAS_TAG]->(t:tag)
WHERE b.city = '{city}'
RETURN t, COUNT(t)
ORDER BY COUNT(t) DESC
LIMIT {N}"""
    
    query_inputs = f"{city} {N}"
    cached_data = r.get(query_inputs)
    
    if cached_data is not None:
        return cached_data
    
    result = session.run(query)
    
    output = f"{N} Most Common Hashtags for {city}:\n\n"
    
    for ind, record in enumerate(result):
        tag = record['t']['name']
        count = record['COUNT(t)']
        output = output + f'Hashtag: {tag}, Count: {count}\n'
        
    r.set(query_inputs, output)
    return output

In [108]:
print(n_common_city_hashtags('Santa Barbara', 10))

10 Most Common Hashtags for Santa Barbara:

Hashtag: complete, Count: 76
Hashtag: happen, Count: 72
Hashtag: recent, Count: 70
Hashtag: hold, Count: 70
Hashtag: support, Count: 68
Hashtag: finished, Count: 67
Hashtag: ton, Count: 66
Hashtag: feels, Count: 65
Hashtag: positive, Count: 65
Hashtag: certainly, Count: 64



In [109]:
print(n_common_city_hashtags('Goleta', 4))

4 Most Common Hashtags for Goleta:

Hashtag: finished, Count: 23
Hashtag: yesterday, Count: 20
Hashtag: complete, Count: 20
Hashtag: employee, Count: 18



In [110]:
# demo of how cache speeds up queries that have been made before
for i in range(5):
    start = time.time()
    n_common_city_hashtags('Carpinteria', 3)
    end = time.time()
    print(i + 1, ":", end - start)

1 : 0.009135007858276367
2 : 0.0010020732879638672
3 : 0.0
4 : 0.0009982585906982422
5 : 0.0
