## Import Library

In [5]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
import numpy as np
import json, time
from operator import add

## Define File Path & ALS Setting

In [6]:
filepath = "data/reviews.jl"
save_dir = "target/"
ALS_setting = {
    'rank': 10,
    'numIterations': 20
}

## Load & Process Data

In [7]:
# Load and parse the data
data = sc.textFile(filepath)
ratings = data.map(lambda l: json.loads(l))\
    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

In [8]:
ratings.take(5)

[Rating(user=511458, product=1, rating=4.0),
 Rating(user=291006, product=1, rating=5.0),
 Rating(user=551918, product=1, rating=5.0),
 Rating(user=307073, product=1, rating=3.0),
 Rating(user=658213, product=1, rating=2.0)]

## Train & Test Data

In [9]:
def splitRatings(ratings, bound=1000):
    c = ratings.count()
    print("Number of Reviews: %d"%c)
    print("_________________________________")
    train_ratings = ratings.filter(lambda r: r.user >= bound)
    test_ratings = ratings.filter(lambda r: r.user < bound)
    tmp = train_ratings.count()
    print("Training Data: %6.2f%% %d"%(float(tmp)/c*100, tmp))
    tmp = test_ratings.count()
    print("    Test Data: %6.2f%% %d"%(float(tmp)/c*100, tmp))
    return train_ratings, test_ratings

In [10]:
train_ratings, test_ratings = splitRatings(ratings)

Number of Reviews: 2685066
_________________________________
Training Data:  99.86% 2681301
    Test Data:   0.14% 3765


## Train Model

In [11]:
# Build the recommendation model using Alternating Least Squares
def cf_als(ratings, ALS_setting):
    print("Training CF Model")
    print("_________________________________")
    start_time = time.time()
    model = ALS.train(ratings, ALS_setting['rank'], ALS_setting['numIterations'])
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    return model

def cf_als_eval(model, ratings):
    # Evaluate the model on training data
    print("Evaluating CF Model")
    print("_________________________________")
    start_time = time.time()
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    end_time = time.time()
    print("Mean Squared Error = " + str(MSE))
    print "Time Cost: %.2fs"%(end_time - start_time)

In [12]:
def saveModel(model, name):
    start_time = time.time()
    vBusiness = model.productFeatures()
    vBusiness.saveAsPickleFile(save_dir+name+"/vBusiness")
    vUser = model.userFeatures()
    vUser.saveAsPickleFile(save_dir+name+"/vUser")
    
#     # Save and load model
#     model.save(sc, save_dir+name+"/cfModel")

    end_time = time.time()
    print("Number of Business: %d"%vBusiness.count())
    print("Number of User: %d"%vUser.count())
    print "Time Cost: %.2fs"%(end_time - start_time)
    
def loadModel(name):
    start_time = time.time()
    # Load vUser
    vUser = sc.pickleFile(save_dir+name+"/vUser")
    # Load vBusiness
    vBusiness = sc.pickleFile(save_dir+name+"/vBusiness")
    # Load Model
#     model = MatrixFactorizationModel.load(sc, save_dir+name+"/cfModel")
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    return vUser, vBusiness

### Builde CF Model based on training data

In [13]:
trainModel = cf_als(train_ratings, ALS_setting)

Training CF Model
_________________________________
Time Cost: 82.39s


In [13]:
cf_als_eval(trainModel, train_ratings)

Evaluating CF Model
_________________________________
Mean Squared Error = 0.201681527241
Time Cost: 134.85s


In [35]:
saveModel(trainModel, "full")

Number of Business: 85538
Number of User: 685557
Time Cost: 14.18s


### Build the complete CF Model

In [6]:
model = cf_als(ratings, ALS_setting)

Training CF Model
_________________________________
Time Cost: 177.23s


In [10]:
saveModel(model, "final")

Number of Business: 85539
Number of User: 686556
Time Cost: 14.81s


In [14]:
def getPredictedUserFeature(uid, ratings, vBusiness):
    def getMatrix(m):
        vb = []
        mA = []
        for r in m.collect():
            vb.append(r.label)
            mA.append(r.features.toArray())
        return np.array(vb), np.array(mA)
    businessDict = ratings.filter(lambda r: r.user == uid)\
                          .map(lambda r: (r.product, r.rating)).collectAsMap()
    compactM = vBusiness.filter(lambda r: r[0] in businessDict)\
                  .map(lambda r: LabeledPoint(businessDict[r[0]], r[1]))
    vb, mA = getMatrix(compactM)
    vu = np.linalg.inv(mA.T.dot(mA)).dot(mA.T).dot(vb)
    return vu, np.mean((vb-mA.dot(vu))**2)

def getKNN(vUser, vu, k):
    return vUser.map(lambda r: (r[0], np.linalg.norm(vu-r[1]))).top(k, key=lambda r: -r[1])

def getSimilarity(dv, v):
    return max(1 - dv/np.linalg.norm(v), 0)

def getUserFeature(vUser, uid):
    a = vUser.filter(lambda r: r[0] == uid)
    if a.isEmpty(): return None
    return np.array(a.first()[1])

def getProductFeature(vBusiness, bid):
    a = vBusiness.filter(lambda r: r[0] == bid)
    if a.isEmpty(): return None
    return np.array(a.first()[1])

def getRecommedProduct(vBusiness, vu, num=10):
    return map(
            lambda r: r[0],
            vBusiness.map(lambda r: (r[0], np.array(r[1]).dot(vu))).top(num, key=lambda r: r[1])
        )

def getEatingMate(uid, ratings, model, knn=10):
    # Get Feature Matrix
    vUser = trainModel.userFeatures().cache()
    vBusiness = trainModel.productFeatures().cache()
    # Get Predicted User Feature Vector
    vu, err = getPredictedUserFeature(uid, ratings, vBusiness)
    # Get Potential Recomended Business
    recomdBusinessSet = set( getRecommedProduct(vBusiness, vu, 10) )
    # Get the kNN eating mates
    knnUsers = getKNN(vUser, vu, knn)
    return map(lambda r: (
            r[0],
            getSimilarity(r[1], vu),
            filter(
                lambda r: r in recomdBusinessSet,
                getRecommedProduct(vBusiness, getUserFeature(vUser, r[0]), 10)
            )[:3]
        ), knnUsers), (vu, err)

In [15]:
def evalUserVectorPrediction(test_ratings, trainModel):
    # Get Feature Matrix
    vUser = trainModel.userFeatures().cache()
    vBusiness = trainModel.productFeatures().cache()
    sum = 0
    i = 0
    for x in test_ratings.map(lambda r: (r.user, (r.product, r.rating))).groupByKey().collect():
        if len(x[1]) < ALS_setting['rank']: continue
        vu = getPredictedUserFeature(x[0], test_ratings, vBusiness)
        if vu is None: continue
        for bid, rating in x[1]:
            vb = getProductFeature(vBusiness, bid)
            if vb is None: continue
            sum += (rating - vu.dot(vb))**2
            i+=1
        print sum / i, i

In [16]:
def evalEatingMateRecomd(uid, ratings, model):
    start_time = time.time()
    knnUsers, vec = getEatingMate(uid, ratings, model, knn=10)
    for mid, sim, recomds in knnUsers:
        print("%6d %d%% %s"%(mid, int(sim*100), recomds))
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    print "MSE of User Vector: %.2f"%vec[1]

In [19]:
vUser = trainModel.userFeatures().cache()
vBusiness = trainModel.productFeatures().cache()
start_time = time.time()
print "Number of Users   : %d"%vUser.count()
print "Number of Business: %d"%vBusiness.count()
end_time = time.time()
print "Time Cost: %.2fs"%(end_time - start_time)

Number of Users   : 685557
Number of Business: 85538
Time Cost: 8.75s


## New User - Provide a series of ratings

In [17]:
# Only show the first 10 records
test_ratings.filter(lambda r: r[0] == 400).map(lambda r: (r[0], r[1], r[2])).collect()[:10]

[(400, 1899, 4.0),
 (400, 12608, 2.0),
 (400, 12897, 2.0),
 (400, 12912, 5.0),
 (400, 12971, 3.0),
 (400, 12988, 5.0),
 (400, 13139, 3.0),
 (400, 13182, 3.0),
 (400, 13225, 5.0),
 (400, 13266, 3.0)]

## Linear Regression

In [21]:
vu, err = getPredictedUserFeature(400, test_ratings, vBusiness)

### Predicted User Vector

In [22]:
vu

array([ 0.70494235,  0.82374822,  0.04961392, -1.00127745, -0.20855774,
       -0.54137144,  0.69599332, -0.9910483 ,  0.1960942 , -0.05883268])

### Mean Squared Error

In [23]:
err

0.68545587419692322

## K-NN

In [24]:
getKNN(vUser, vu, 10)

[(549160, 0.43512965521510033),
 (173008, 0.48724432486827735),
 (379400, 0.51627669683277222),
 (603259, 0.55394767594551075),
 (588107, 0.55601589082649205),
 (325203, 0.56162346541068553),
 (63650, 0.57767919317913008),
 (390399, 0.58395030948810156),
 (373084, 0.58520336223113201),
 (20616, 0.58958762416473243)]

## Eating Mates & Recommended Restaurants

In [41]:
evalEatingMateRecomd(400, test_ratings, trainModel)

 94726 75% [81330, 64660, 79749, 69512]
 86631 74% [24790, 69512, 81330, 39194, 80843, 64660, 8391]
516667 71% [81945, 81330, 79749, 24790, 64660, 8391]
484786 71% [24790, 8391, 64660, 81945, 79749, 62104, 81330]
547229 69% [64660, 81330, 80843, 62104]
278090 69% [81945, 81330, 62104, 79749, 69512, 64660, 24790, 8391]
353569 68% [64660, 8391, 24790, 62104, 81945, 79749, 80843]
523250 68% [81330, 64660, 79749, 81945, 69512, 24790]
668574 66% [81330, 79749, 24790, 81945, 64660]
493567 66% [81945, 81330, 24790, 69512]
Time Cost: 55.01s
MSE of User Vector: 0.53


## Evaluation

### Friends list of User 9

In [46]:
# Friends of User 9
friends = [2, 3, 466, 2141, 3587, 4043, 12840, 13866, 22447, 23636, 208888, 213233, 83682, 164654, 449598, 651072, 272274, 350958, 313860, 335183, 346899, 360696, 172400, 382269, 165575, 392534, 493566, 479112, 606539, 651724, 103245, 676122, 642526, 498303, 520182, 602193, 605821, 656609, 287720, 462134, 81991, 227902, 219928, 220376, 497194, 542947, 674672, 21094, 171797, 208358, 419478, 230047, 178452, 566913, 673854, 140296, 438499, 448999, 476155, 376394, 614785, 351478, 558378, 558666, 130302, 522346, 530471, 615818, 667789, 443120, 504583, 6751, 27953, 132705, 115644, 116184, 462175, 491307, 346313]
len(friends)

79

### Count the similarity between user and his friends

In [47]:
vu, err = getPredictedUserFeature(9, test_ratings, vBusiness)
sims = []
for fid in friends:
    if fid <= 1000: continue
    vf = getUserFeature(vUser, fid)
    dv = np.linalg.norm(vu-vf)
    sims.append( int(100 * getSimilarity(dv, vu)) )
print sims

[27, 57, 67, 25, 37, 22, 0, 31, 0, 0, 12, 0, 40, 46, 7, 0, 51, 44, 0, 0, 28, 2, 0, 0, 12, 28, 0, 47, 9, 3, 38, 26, 20, 0, 11, 6, 0, 21, 11, 20, 0, 6, 0, 30, 20, 0, 26, 0, 0, 0, 15, 0, 0, 45, 0, 19, 0, 0, 13, 20, 30, 16, 1, 0, 29, 2, 0, 14, 0, 0, 0, 0, 0, 10, 9, 0]


In [48]:
hist = np.histogram(sims, np.arange(0, 110, 10))
hist

(array([40, 10, 13,  5,  5,  2,  1,  0,  0,  0]),
 array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100]))

### Histogram

In [49]:
for i in range(10):
    print "%2d ~ %3d : %6.2f%%"%(i*10, (i+1)*10, 100*hist[0][i]/79)

 0 ~  10 :  50.00%
10 ~  20 :  12.00%
20 ~  30 :  16.00%
30 ~  40 :   6.00%
40 ~  50 :   6.00%
50 ~  60 :   2.00%
60 ~  70 :   1.00%
70 ~  80 :   0.00%
80 ~  90 :   0.00%
90 ~ 100 :   0.00%


# Filter

In [40]:
vUser = trainModel.userFeatures().cache()
vBusiness = trainModel.productFeatures().cache()

In [45]:
user_id_map = {}
business_id_map = {}

with open("data/user_map.jl", 'r') as f:
    for line in f:
        [lid, sit] = json.loads(line)
        user_id_map[sit] = lid
f.close()

with open("data/business_map.jl", 'r') as f:
    for line in f:
        [lid, sit] = json.loads(line)
        business_id_map[sit] = lid
f.close()

In [56]:
def searchForUser(uid):
    lid = user_id_map[uid]
    res = {}
    with open("data/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json", 'r') as f:
        for line in f:
            tmp = json.loads(line)
            if lid == tmp['user_id']:
                res = tmp
                break
    f.close()
    return res

def searchForBusiness(uid):
    lid = user_id_map[uid]
    res = {}
    with open("data/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json", 'r') as f:
        for line in f:
            tmp = json.loads(line)
            if lid == tmp['user_id']:
                res = tmp
                break
    f.close()
    return res

In [59]:
searchForUser(9)

{u'average_stars': 3.59,
 u'compliments': {u'cool': 5,
  u'cute': 1,
  u'funny': 8,
  u'hot': 5,
  u'more': 2,
  u'note': 3,
  u'photos': 1,
  u'plain': 5,
  u'profile': 1,
  u'writer': 3},
 u'elite': [],
 u'fans': 7,
 u'friends': [u'rpOyqD_893cqmDAtJLbdog',
  u'4U9kSBLuBDU391x6bxU-YA',
  u'Re447krbp0VQVX3Dk4Tsog',
  u'kJc9YBRwmmZ_PG0uLHuEPQ',
  u'Rir-YRPPClKXDFQbc3BsVw',
  u'zTWH9b_ItSdLOK9ypeFOIw',
  u'eCSCS17Y7Ie-_a6RmQgUJg',
  u'ZZ6WSb99R7kVHfFW8XG6TA',
  u'CA1CSw_uiapPotkJC9hXAQ',
  u'LdcJrVMG-oP_V9_jZGprfQ',
  u'R7-OHW_M_V4lYGbIT2hetQ',
  u'2TN0paDJnWx4FmYClFdErQ',
  u'JT5kM6wRwh7cNt8IAuX_sw',
  u'RZwkUvViHYEh5Z65--cVZw',
  u'Ie7tbGC1cG2OLfAaNqs62A',
  u'u00EOkApqyRP9YLlI90TKA',
  u'psf7Etrt3azIdhMhoPKmpA',
  u'h_TlJAgBfSXG4Yw7rkC1Kg',
  u'BfCysDOVY3lV6WSEYb92EQ',
  u'Mx-vxv_V-SQCe76w4RmUfA',
  u'WnI89yU2-swqftb0PW-0xQ',
  u'N7gupqGZWhTYlTr_wuMI1Q',
  u'9Ecy5YOkAyDv42tSNRxAlg',
  u'FK5idBTeUebFuvfQMSSVJg',
  u'LyQjgcpgL2rz_QH3cLFz8Q',
  u'2uk4R74YwMeBQFFnWj5HQQ',
  u'lXhOAP6ncg9E