In [15]:
from pyspark.mllib.recommendation import Rating
import numpy as np
import json, time
from operator import add

In [2]:
# Review file read by the ratings-loading cell; each line is parsed with json.loads.
filepath = "data/reviews.jl"
# Prefix for saved models. loadModel concatenates it directly with the model
# name, so the trailing slash is required.
save_dir = "target/"

In [3]:
# Read the raw review lines and turn each one into a Rating
def _toRating(line):
    """Decode one JSON line and build a Rating from its first three fields."""
    fields = json.loads(line)
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

data = sc.textFile(filepath)
ratings = data.map(_toRating).cache()
# count() is the cell's displayed value
ratings.count()

2685066

In [4]:
def _businessEntry(line):
    """Parse one line of business.jl into (field 0, (field 1, field 2))."""
    rec = json.loads(line)
    return rec[0], (rec[1], rec[2])

# Whole table is collected to the driver as a plain dict.
business_dict = sc.textFile("data/business.jl").map(_businessEntry).collectAsMap()

In [5]:
def _userEntry(line):
    """Parse one line of user.jl into (field 0, field 1)."""
    rec = json.loads(line)
    return rec[0], rec[1]

# Whole table is collected to the driver as a plain dict.
user_dict = sc.textFile("data/user.jl").map(_userEntry).collectAsMap()

In [6]:
# Number of businesses available in the driver-side lookup table.
len(business_dict)

26729

In [7]:
# Peek at one entry to show the value layout: (display name, list of categories).
business_dict[1]

(u'Mr Hoagie, Dravosburg', [u'Fast Food'])

In [8]:
def loadModel(name):
    """Open the saved feature-vector RDDs of the model stored under ``save_dir + name``.

    Only the user and business vectors are opened. A MatrixFactorizationModel
    that may have been saved alongside them (under ``cfModel``) is not loaded.

    Args:
        name: directory name of the saved model inside ``save_dir``.

    Returns:
        Tuple ``(vUser, vBusiness)``: the RDDs returned by ``sc.pickleFile``
        for ``<save_dir><name>/vUser`` and ``<save_dir><name>/vBusiness``.

    Side effects:
        Prints the elapsed wall-clock time. NOTE(review): RDDs are evaluated
        lazily, so this presumably measures only the creation of the RDD
        handles, not the reading of the data.
    """
    start_time = time.time()
    model_dir = save_dir + name
    # Load vUser
    vUser = sc.pickleFile(model_dir + "/vUser")
    # Load vBusiness
    vBusiness = sc.pickleFile(model_dir + "/vBusiness")
    end_time = time.time()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Time Cost: %.2fs" % (end_time - start_time))
    return vUser, vBusiness

In [9]:
# Open the feature-vector RDDs of the model saved under the name 'final'.
vUser, vBusiness = loadModel('final')
# Echo both RDD handles as the cell output.
vUser, vBusiness

Time Cost: 0.16s


(MapPartitionsRDD[11] at objectFile at NativeMethodAccessorImpl.java:-2,
 MapPartitionsRDD[13] at objectFile at NativeMethodAccessorImpl.java:-2)

In [10]:
# Mark the user vectors for caching; count() then runs a job over the whole
# RDD (which is what actually fills the cache) and shows the number of users.
vUser.cache()
vUser.count()

686556

In [11]:
# Same for the business vectors: mark for caching, then count() to run a full
# pass and show the number of businesses.
vBusiness.cache()
vBusiness.count()

85539

In [12]:
def getKNN(vUser, vu, k):
    """Find the k entries of ``vUser`` whose vectors lie closest to ``vu``.

    Args:
        vUser: RDD of ``(id, vector)`` pairs.
        vu: query vector; ``vu - vector`` must be a valid numpy subtraction.
        k: number of neighbours to return.

    Returns:
        List of ``(id, distance)`` pairs ordered by increasing Euclidean
        distance (``np.linalg.norm`` of the difference).
    """
    def withDistance(entry):
        return (entry[0], np.linalg.norm(vu - entry[1]))

    def closeness(pair):
        # top() keeps the largest keys, so negate the distance to get the nearest.
        return -pair[1]

    distances = vUser.map(withDistance)
    return distances.top(k, key=closeness)

def _firstFeature(vectors, key):
    """Return the vector stored under ``key`` in an RDD of (id, vector) pairs, or None."""
    # take(1) both answers "is there a match?" and fetches it in a single
    # Spark job; isEmpty() followed by first() evaluates the filter twice.
    hits = vectors.filter(lambda r: r[0] == key).take(1)
    if not hits:
        return None
    return np.array(hits[0][1])

def getUserFeature(vUser, uid):
    """Look up the feature vector of user ``uid``.

    Args:
        vUser: RDD of ``(user id, vector)`` pairs.
        uid: id to search for (compared with ``==`` against each pair's first item).

    Returns:
        The vector as a numpy array, or ``None`` when no pair has that id.
    """
    return _firstFeature(vUser, uid)

def getProductFeature(vBusiness, bid):
    """Look up the feature vector of business ``bid``.

    Args:
        vBusiness: RDD of ``(business id, vector)`` pairs.
        bid: id to search for (compared with ``==`` against each pair's first item).

    Returns:
        The vector as a numpy array, or ``None`` when no pair has that id.
    """
    return _firstFeature(vBusiness, bid)

def getRecommedProduct(vBusiness, vu, num=10):
    """Return the ids of the ``num`` businesses that score highest for ``vu``.

    A business's score is the dot product of its vector with ``vu``. Only
    businesses whose id is a key of the global ``business_dict`` are
    considered.

    Args:
        vBusiness: RDD of ``(business id, vector)`` pairs.
        vu: user feature vector to score against.
        num: how many ids to return (default 10).

    Returns:
        A list of business ids, highest score first.

    Raises:
        ValueError: if ``vu`` is None (e.g. the user has no feature vector).
    """
    if vu is None:
        # Fail here with a readable message instead of inside the Spark job.
        raise ValueError("vu is None: no feature vector to score against")
    known = vBusiness.filter(lambda r: r[0] in business_dict)
    scored = known.map(lambda r: (r[0], np.array(r[1]).dot(vu)))
    best = scored.top(num, key=lambda r: r[1])
    # Build a real list (not a lazy map object) so callers can slice and
    # index the result on Python 3 as well as Python 2.
    return [pair[0] for pair in best]

In [13]:
def getTaste(uid, numCandidates=1000, numRecommend=5, numNeighbors=5, numTastes=50):
    """Build recommendations, similar users and a category profile for one user.

    Uses the global ``vUser`` / ``vBusiness`` RDDs and the ``business_dict`` /
    ``user_dict`` lookup tables.

    Args:
        uid: id of the user to profile.
        numCandidates: how many top-scoring businesses to fetch; the category
            profile is counted over all of them.
        numRecommend: how many of the best candidates to return by name.
        numNeighbors: how many nearest users (excluding ``uid``) to return.
        numTastes: maximum number of categories in the profile.

    Returns:
        Tuple ``(recomd, users, taste)``:
        recomd - first elements of the ``business_dict`` entries of the top
                 ``numRecommend`` candidates;
        users  - ``user_dict`` values of the nearest users;
        taste  - list of ``(category, count)`` pairs, most frequent first.

    Raises:
        ValueError: if ``uid`` has no feature vector in ``vUser``.

    Side effects:
        Prints the elapsed wall-clock time.
    """
    start_time = time.time()
    vu = getUserFeature(vUser, uid)
    if vu is None:
        # Without a vector every later step would fail inside a Spark job.
        raise ValueError("No feature vector found for user id %r" % (uid,))
    # list(...) keeps the slicing below valid even if a lazy iterable comes back.
    res = list(getRecommedProduct(vBusiness, vu, numCandidates))
    recomd = [business_dict[bid][0] for bid in res[:numRecommend]]
    # Ask for one extra neighbour because the user itself is in vUser at
    # distance 0. Drop it by id, not by position, so ties at distance 0
    # cannot remove the wrong user.
    nearest = getKNN(vUser, vu, numNeighbors + 1)
    neighbourIds = [pair[0] for pair in nearest if pair[0] != uid]
    users = [user_dict[nid] for nid in neighbourIds[:numNeighbors]]
    taste = sc.parallelize(res).flatMap(lambda r: business_dict[r][1])\
        .map(lambda r: (r, 1)).reduceByKey(add).takeOrdered(numTastes, key=lambda r: -r[1])
    end_time = time.time()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Time Cost: %.2fs" % (end_time - start_time))
    return recomd, users, taste

In [20]:
# Example run: recommendations, similar users and category profile for user id 400.
recomd, users, taste = getTaste(400)

Time Cost: 15.65s


In [21]:
recomd

[u'Katoreya, Pointe-Claire',
 u'Pizza Prosciutto, Kitchener',
 u'1001 Nights Shawarma, Kitchener',
 u'Chez la M\xe8re Michel, Montr\xe9al',
 u'The Curve, Madison']

In [22]:
users

[u'Conor', u'Patty', u'Matthew', u'Tina', u'Carly']

In [23]:
taste

[(u'American (Traditional)', 96),
 (u'Sandwiches', 92),
 (u'Pizza', 90),
 (u'Cafes', 79),
 (u'Italian', 68),
 (u'Mexican', 67),
 (u'Burgers', 58),
 (u'Chinese', 57),
 (u'American (New)', 51),
 (u'Sushi Bars', 42),
 (u'Coffee & Tea', 40),
 (u'Fast Food', 39),
 (u'Japanese', 33),
 (u'Mediterranean', 32),
 (u'Pubs', 32),
 (u'French', 31),
 (u'Barbeque', 31),
 (u'Indian', 28),
 (u'Delis', 26),
 (u'Bakeries', 22),
 (u'Thai', 21),
 (u'Seafood', 21),
 (u'Canadian (New)', 20),
 (u'Vietnamese', 20),
 (u'Greek', 18),
 (u'Steakhouses', 18),
 (u'Asian Fusion', 17),
 (u'Ice Cream & Frozen Yogurt', 17),
 (u'Sports Bars', 17),
 (u'Salad', 14),
 (u'Middle Eastern', 13),
 (u'Vegetarian', 13),
 (u'Specialty Food', 13),
 (u'Chicken Wings', 12),
 (u'Korean', 12),
 (u'Soup', 11),
 (u'Tex-Mex', 11),
 (u'Latin American', 11),
 (u'Vegan', 11),
 (u'Food Trucks', 11),
 (u'Gluten-Free', 10),
 (u'Desserts', 10),
 (u'Caribbean', 9),
 (u'Gastropubs', 9),
 (u'Event Planning & Services', 9),
 (u'Peruvian', 8),
 (u'Po

In [16]:
# Second example run, for user id 9; overwrites recomd, users and taste.
recomd, users, taste = getTaste(9)

Time Cost: 16.92s


In [17]:
recomd

[u'So Good Cafe, Las Vegas',
 u'Truffles N Bacon Caf\xe9, Las Vegas',
 u'Pizza Ben, Montr\xe9al',
 u"Zarra's A Taste of Southern Italy, Pittsburgh",
 u'Anise Tapas & Grill, Las Vegas']

In [18]:
users

[u'Gerard', u'John', u'Gregory', u'Vic', u'Irene']

In [19]:
taste

[(u'Pizza', 117),
 (u'Mexican', 114),
 (u'Sandwiches', 108),
 (u'American (Traditional)', 78),
 (u'Italian', 77),
 (u'Fast Food', 74),
 (u'American (New)', 69),
 (u'Chinese', 65),
 (u'Japanese', 51),
 (u'Burgers', 45),
 (u'Cafes', 45),
 (u'Mediterranean', 38),
 (u'Sushi Bars', 35),
 (u'Delis', 35),
 (u'Thai', 31),
 (u'Seafood', 30),
 (u'Asian Fusion', 27),
 (u'Coffee & Tea', 27),
 (u'Steakhouses', 23),
 (u'Barbeque', 23),
 (u'Chicken Wings', 22),
 (u'Middle Eastern', 20),
 (u'Salad', 19),
 (u'Bakeries', 18),
 (u'Indian', 18),
 (u'Greek', 17),
 (u'Vegan', 17),
 (u'French', 17),
 (u'Vegetarian', 17),
 (u'Latin American', 16),
 (u'Hot Dogs', 16),
 (u'Juice Bars & Smoothies', 15),
 (u'Sports Bars', 14),
 (u'Hawaiian', 14),
 (u'Vietnamese', 14),
 (u'Food Trucks', 14),
 (u'Gluten-Free', 13),
 (u'Desserts', 13),
 (u'Specialty Food', 13),
 (u'Pubs', 10),
 (u'Ice Cream & Frozen Yogurt', 10),
 (u'Cocktail Bars', 10),
 (u'Korean', 9),
 (u'Wine Bars', 9),
 (u'Ethnic Food', 8),
 (u'Grocery', 8),
 (