In [1]:
import sys
import itertools
import logging
from math import sqrt
from operator import add
from os.path import join, isfile, dirname

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

import json

from pyspark.mllib.fpm import FPGrowth

In [2]:
# Start a Spark session, or reuse the one that is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
print("Running Spark Version %s" % (spark.version))

# --- Load the business data ---
# The path is relative to the directory the kernel was started in.
path_business = "dataset/business.json"
df_business_raw = spark.read.json(path_business)

# Keep only the `categories` column; catDF_iter is the same data
# materialised as a driver-side list of rows.
catDF = df_business_raw.select("categories")
catDF_iter = catDF.rdd.collect()

Running Spark Version 2.2.0


In [3]:
# Sanity check: number of rows in the categories-only DataFrame.
catDF.count()

156639

In [4]:
# Build one "transaction" per restaurant: the set of its category labels with
# the generic 'Restaurants' and 'Food' labels removed.
data = (
    catDF.rdd
    # Keep only rows tagged as restaurants. The explicit None check keeps a
    # row with a missing `categories` value from raising TypeError on `in`.
    .filter(lambda item: item.categories is not None
            and 'Restaurants' in item.categories)
    # A set de-duplicates labels within one business.
    .map(lambda item: {x for x in item.categories
                       if x != 'Restaurants' and x != 'Food'})
    # A single remaining label cannot form a combination, so drop those rows.
    .filter(lambda item: len(item) > 1)
)

In [5]:
# Number of transactions left in `data` (triggers evaluation of the RDD).
data.count()

34557

In [6]:
# Pull the transactions to the driver and show the first 11 as a preview.
cat_iter = data.collect()
count = 0  # stays 0 when there is nothing to show
for count, each in enumerate(cat_iter, start=1):
    print(each)
    if count > 10:
        break

{'Soul Food', 'Convenience Stores'}
{'American (Traditional)', 'Bars', 'Nightlife', 'Burgers', 'Sports Bars'}
{'Mexican', 'Sandwiches', 'Italian', 'Diners', 'Breakfast & Brunch'}
{'American (Traditional)', 'Seafood'}
{'Bars', 'American (Traditional)', 'Nightlife', 'Comfort Food', 'Canadian (New)'}
{'Coffee & Tea', 'Taiwanese'}
{'American (Traditional)', 'Bars', 'Nightlife', 'Burgers', 'Chicken Wings'}
{'Irish', 'Pubs', 'Nightlife', 'Bars'}
{'Sandwiches', 'Chicken Wings', 'Pizza'}
{'Bars', 'Nightlife'}
{'Asian Fusion', 'Sushi Bars'}


In [47]:
# data.saveAsTextFile('category_data.csv')

In [7]:
# Mine frequent label combinations with FP-growth. Each transaction is turned
# from a set into a list before it is handed to the trainer.
model = FPGrowth.train(
    data.map(list),
    minSupport=0.001,
    numPartitions=10,
)

# Bring every frequent itemset back to the driver as a plain Python list.
result = model.freqItemsets()
result_list = result.collect()

In [8]:
# Print every collected frequent itemset, one per line. result_list is an
# ordinary driver-side list, so this is a plain Python loop.
for fi in result_list:
    print(fi)

FreqItemset(items=['Party & Event Planning'], freq=120)
FreqItemset(items=['Party & Event Planning', 'Caterers'], freq=40)
FreqItemset(items=['Party & Event Planning', 'Caterers', 'Event Planning & Services'], freq=40)
FreqItemset(items=['Party & Event Planning', 'Venues & Event Spaces'], freq=36)
FreqItemset(items=['Party & Event Planning', 'Venues & Event Spaces', 'Event Planning & Services'], freq=36)
FreqItemset(items=['Party & Event Planning', 'Event Planning & Services'], freq=120)
FreqItemset(items=['Juice Bars & Smoothies'], freq=582)
FreqItemset(items=['Juice Bars & Smoothies', 'American (New)'], freq=46)
FreqItemset(items=['Juice Bars & Smoothies', 'Coffee & Tea'], freq=145)
FreqItemset(items=['Juice Bars & Smoothies', 'Coffee & Tea', 'Sandwiches'], freq=37)
FreqItemset(items=['Juice Bars & Smoothies', 'Coffee & Tea', 'Cafes'], freq=58)
FreqItemset(items=['Juice Bars & Smoothies', 'Ice Cream & Frozen Yogurt'], freq=48)
FreqItemset(items=['Juice Bars & Smoothies', 'Desserts'],

FreqItemset(items=['Cheesesteaks'], freq=162)
FreqItemset(items=['Cheesesteaks', 'Sandwiches'], freq=92)
FreqItemset(items=['Cheesesteaks', 'Burgers'], freq=39)
FreqItemset(items=['Cheesesteaks', 'American (Traditional)'], freq=36)
FreqItemset(items=['Wine Bars'], freq=805)
FreqItemset(items=['Wine Bars', 'American (New)'], freq=174)
FreqItemset(items=['Wine Bars', 'American (New)', 'Bars'], freq=174)
FreqItemset(items=['Wine Bars', 'American (New)', 'Bars', 'Nightlife'], freq=174)
FreqItemset(items=['Wine Bars', 'American (New)', 'Nightlife'], freq=174)
FreqItemset(items=['Wine Bars', 'Seafood'], freq=75)
FreqItemset(items=['Wine Bars', 'Seafood', 'Bars'], freq=75)
FreqItemset(items=['Wine Bars', 'Seafood', 'Bars', 'Nightlife'], freq=75)
FreqItemset(items=['Wine Bars', 'Seafood', 'Nightlife'], freq=75)
FreqItemset(items=['Wine Bars', 'Coffee & Tea'], freq=35)
FreqItemset(items=['Wine Bars', 'Coffee & Tea', 'Bars'], freq=35)
FreqItemset(items=['Wine Bars', 'Coffee & Tea', 'Bars', 'Nigh

In [9]:
# A secondary label joins a group only when it accompanies the main label in
# more than this fraction of the main label's own occurrences.
MIN_CO_OCCURRENCE_RATIO = 0.5


def build_category_sets(itemsets, min_ratio=MIN_CO_OCCURRENCE_RATIO):
    """Group labels that usually appear together with a "main" label.

    The main label of an itemset is its first item. For every itemset with
    more than one item, its last item is added to the main label's group when
    the itemset's frequency exceeds ``min_ratio`` times the frequency of the
    main label on its own.

    Args:
        itemsets: iterable of records exposing ``.items`` (list of labels)
            and ``.freq`` (occurrence count), as printed for ``result_list``.
        min_ratio: strict lower bound on ``freq / root_freq`` for a label to
            be added to a group.

    Returns:
        dict mapping each main label to ``[root_freq, label_list]``, where
        ``label_list`` starts with the main label itself. Keys appear in the
        order the main labels are first encountered.
    """
    itemsets = list(itemsets)  # iterated twice below

    # Frequency of each label on its own, gathered first so the result does
    # not depend on the single-label itemset preceding its supersets.
    singleton_freq = {
        item_set.items[0]: item_set.freq
        for item_set in itemsets
        if len(item_set.items) == 1
    }

    groups = {}
    for item_set in itemsets:
        labels = item_set.items
        if not labels:
            continue  # nothing to group on

        main_label = labels[0]
        if main_label not in groups:
            # Fall back to this itemset's frequency when no single-label
            # itemset exists for the main label.
            root_freq = singleton_freq.get(main_label, item_set.freq)
            groups[main_label] = [root_freq, [main_label]]
        root_freq, label_set = groups[main_label]

        # The number of labels is len(item_set.items); len(item_set) would
        # measure the (items, freq) record itself and never equal 1.
        if len(labels) == 1:
            continue

        secondary_label = labels[-1]  # the last label of the itemset
        if (secondary_label not in label_set
                and item_set.freq > min_ratio * root_freq):
            # label_set is the list stored in `groups`, so appending here
            # updates the stored entry as well.
            label_set.append(secondary_label)

    return groups


cat_set_dict = build_category_sets(result_list)
         
    

In [10]:
# Print each main label followed by its grouped label list. v is the
# [frequency, label_list] pair stored in cat_set_dict; only v[1] is shown.
for k,v in cat_set_dict.items():
    print(k,v[1])

Party & Event Planning ['Party & Event Planning', 'Event Planning & Services']
Juice Bars & Smoothies ['Juice Bars & Smoothies']
Poke ['Poke', 'Hawaiian']
Fish & Chips ['Fish & Chips']
Sports Bars ['Sports Bars', 'American (Traditional)', 'Bars', 'Nightlife']
Kebab ['Kebab']
Dance Clubs ['Dance Clubs', 'Bars', 'Nightlife']
Greek ['Greek', 'Mediterranean']
Waffles ['Waffles', 'Breakfast & Brunch']
Hotels & Travel ['Hotels & Travel', 'Event Planning & Services']
Cafes ['Cafes']
Bowling ['Bowling', 'Active Life']
Health Markets ['Health Markets', 'Specialty Food']
Latin American ['Latin American']
Pool Halls ['Pool Halls', 'Bars', 'Nightlife']
Beer Bar ['Beer Bar', 'Bars', 'Nightlife']
Barbeque ['Barbeque']
Mongolian ['Mongolian']
Nightlife ['Nightlife']
Karaoke ['Karaoke', 'Bars', 'Nightlife']
Cantonese ['Cantonese', 'Chinese']
Arts & Entertainment ['Arts & Entertainment', 'Bars', 'Nightlife']
Music Venues ['Music Venues', 'Arts & Entertainment', 'Bars', 'Nightlife']
Coffee & Tea ['Coffe