In [1]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
import numpy as np
import json, time

In [36]:
filepath = "data/reviews.jl"
save_dir = "target/"
ALS_setting = {
    'rank': 20,
    'numIterations': 20
}

In [3]:
# Load and parse the data
data = sc.textFile(filepath)
ratings = data.map(lambda l: json.loads(l))\
    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

In [4]:
def splitRatings(ratings, bound=1000):
    c = ratings.count()
    print("Number of Reviews: %d"%c)
    print("_________________________________")
    train_ratings = ratings.filter(lambda r: r.user >= bound)
    test_ratings = ratings.filter(lambda r: r.user < bound)
    tmp = train_ratings.count()
    print("Training Data: %6.2f%% %d"%(float(tmp)/c*100, tmp))
    tmp = test_ratings.count()
    print("    Test Data: %6.2f%% %d"%(float(tmp)/c*100, tmp))
    return train_ratings, test_ratings

In [5]:
train_ratings, test_ratings = splitRatings(ratings)

Number of Reviews: 2685066
_________________________________
Training Data:  99.86% 2681301
    Test Data:   0.14% 3765


In [6]:
# Build the recommendation model using Alternating Least Squares
def cf_als(ratings, ALS_setting):
    print("Training CF Model")
    print("_________________________________")
    start_time = time.time()
    model = ALS.train(ratings, ALS_setting['rank'], ALS_setting['numIterations'])
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    return model

def cf_als_eval(model, ratings):
    # Evaluate the model on training data
    print("Evaluating CF Model")
    print("_________________________________")
    start_time = time.time()
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    end_time = time.time()
    print("Mean Squared Error = " + str(MSE))
    print "Time Cost: %.2fs"%(end_time - start_time)

In [7]:
def saveModel(model, name):
    start_time = time.time()
    vBusiness = model.productFeatures()
    vBusiness.saveAsPickleFile(save_dir+name+"/vBusiness")
    vUser = model.userFeatures()
    vUser.saveAsPickleFile(save_dir+name+"/vUser")
    
#     # Save and load model
#     model.save(sc, save_dir+name+"/cfModel")

    end_time = time.time()
    print("Number of Business: %d"%vBusiness.count())
    print("Number of User: %d"%vUser.count())
    print "Time Cost: %.2fs"%(end_time - start_time)
    
def loadModel(name):
    start_time = time.time()
    # Load vUser
    vUser = sc.pickleFile(save_dir+name+"/vUser")
    # Load vBusiness
    vBusiness = sc.pickleFile(save_dir+name+"/vBusiness")
    # Load Model
#     model = MatrixFactorizationModel.load(sc, save_dir+name+"/cfModel")
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    return vUser, vBusiness

### Builde CF Model based on training data

In [8]:
trainModel = cf_als(train_ratings, ALS_setting)

Training CF Model
_________________________________
Time Cost: 67.94s


In [9]:
cf_als_eval(trainModel, train_ratings)

Evaluating CF Model
_________________________________
Mean Squared Error = 0.202851587955
Time Cost: 117.99s


In [35]:
saveModel(trainModel, "full")

Number of Business: 85538
Number of User: 685557
Time Cost: 14.18s


### Build the complete CF Model

In [13]:
model = cf_als(ratings, ALS_setting)

Training CF Model
_________________________________
Time Cost: 56.76s
Evaluating CF Model
_________________________________
Mean Squared Error = 0.201828733154
Time Cost: 119.29s


In [37]:
def getPredictedUserFeature(uid, ratings, vBusiness):
    def getMatrix(m):
        vb = []
        mA = []
        for r in m.collect():
            vb.append(r.label)
            mA.append(r.features.toArray())
        return np.array(vb), np.array(mA)
    businessDict = ratings.filter(lambda r: r.user == uid)\
                          .map(lambda r: (r.product, r.rating)).collectAsMap()
    compactM = vBusiness.filter(lambda r: r[0] in businessDict)\
                  .map(lambda r: LabeledPoint(businessDict[r[0]], r[1]))
    vb, mA = getMatrix(compactM)
    vu = np.linalg.inv(mA.T.dot(mA)).dot(mA.T).dot(vb)
    return vu, np.mean((vb-mA.dot(vu))**2)

def getKNN(vUser, vu, k):
    return vUser.map(lambda r: (r[0], np.linalg.norm(vu-r[1]))).top(k, key=lambda r: -r[1])

def getSimilarity(dv, v):
    return max(1 - dv/np.linalg.norm(v), 0)

def getUserFeature(vUser, uid):
    a = vUser.filter(lambda r: r[0] == uid)
    if a.isEmpty(): return None
    return np.array(a.first()[1])

def getProductFeature(vBusiness, bid):
    a = vBusiness.filter(lambda r: r[0] == bid)
    if a.isEmpty(): return None
    return np.array(a.first()[1])

def getRecommedProduct(vBusiness, vu, num=10):
    return map(
            lambda r: r[0],
            vBusiness.map(lambda r: (r[0], np.array(r[1]).dot(vu))).top(num, key=lambda r: r[1])
        )

def getEatingMate(uid, ratings, model, knn=10):
    # Get Feature Matrix
    vUser = trainModel.userFeatures().cache()
    vBusiness = trainModel.productFeatures().cache()
    # Get Predicted User Feature Vector
    vu, err = getPredictedUserFeature(uid, ratings, vBusiness)
    # Get Potential Recomended Business
    recomdBusinessSet = set( getRecommedProduct(vBusiness, vu, 10) )
    # Get the kNN eating mates
    knnUsers = getKNN(vUser, vu, knn)
    return map(lambda r: (
            r[0],
            getSimilarity(r[1], vu),
            filter(
                lambda r: r in recomdBusinessSet,
                getRecommedProduct(vBusiness, getUserFeature(vUser, r[0]), 10)
            )[:10]
        ), knnUsers), (vu, err)

In [38]:
def evalUserVectorPrediction(test_ratings, trainModel):
    # Get Feature Matrix
    vUser = trainModel.userFeatures().cache()
    vBusiness = trainModel.productFeatures().cache()
    sum = 0
    i = 0
    for x in test_ratings.map(lambda r: (r.user, (r.product, r.rating))).groupByKey().collect():
        if len(x[1]) < ALS_setting['rank']: continue
        vu = getPredictedUserFeature(x[0], test_ratings, vBusiness)
        if vu is None: continue
        for bid, rating in x[1]:
            vb = getProductFeature(vBusiness, bid)
            if vb is None: continue
            sum += (rating - vu.dot(vb))**2
            i+=1
        print sum / i, i

In [39]:
def evalEatingMateRecomd(uid, ratings, model):
    start_time = time.time()
    knnUsers, vec = getEatingMate(uid, ratings, model, knn=10)
    for mid, sim, recomds in knnUsers:
        print("%6d %d%% %s"%(mid, int(sim*100), recomds))
    end_time = time.time()
    print "Time Cost: %.2fs"%(end_time - start_time)
    print "MSE of User Vector: %.2f"%vec[1]

In [17]:
evalEatingMateRecomd(9, test_ratings, trainModel)

111900 79% [70360, 76034, 54231, 82374, 75712, 57194]
470260 78% [70360, 78534, 83881, 76034, 83340]
 95447 77% [70360, 82374, 54231, 76034, 26317]
299306 77% [70360, 83340, 82374, 75712, 54231]
169270 77% [70360, 76034, 54231, 83340]
358501 76% [70360, 82374, 54231, 83340, 57194, 78534, 26317]
172936 76% [70360, 26317, 78534, 57194, 83340, 82374, 54231]
546037 76% [70360, 57194, 83340, 78534, 76034, 26317, 83881]
  1070 76% [54231, 78534, 70360, 76034, 82374]
475477 76% [70360, 57194, 76034, 26317]
Time Cost: 35.28s
MSE of User Vector: 1.11


In [20]:
vu, err = getPredictedUserFeature(9, test_ratings, vBusiness)
vu

array([ 0.85343739, -1.01137154, -1.61861862, -0.8111291 , -1.14654225,
       -0.84237122, -0.03597422,  0.3464966 , -0.56901162, -0.25857191])

In [22]:
getKNN(vUser, vu, 1000)

[(111900, 0.57082987474015556),
 (470260, 0.5805114246678903),
 (95447, 0.6244813045550115),
 (299306, 0.62680960603461477),
 (169270, 0.63099199680208717),
 (358501, 0.63812444296578785),
 (172936, 0.64374111614503959),
 (546037, 0.64735737398127591),
 (1070, 0.65324326022194978),
 (475477, 0.65700562683601915),
 (290485, 0.65759479191963888),
 (307522, 0.68074447818094797),
 (149768, 0.68586794299479426),
 (640456, 0.68865151508849687),
 (542746, 0.69215373082670462),
 (98338, 0.69860053901222441),
 (218658, 0.69906705052470186),
 (20158, 0.70450483702337729),
 (141264, 0.70746599011470901),
 (183302, 0.71211672048873143),
 (76204, 0.71252812624263562),
 (3717, 0.71389696117146073),
 (38020, 0.71416852079815119),
 (526974, 0.71488197561342071),
 (356024, 0.71609837428072021),
 (525487, 0.72067344540484213),
 (84378, 0.72312341543413672),
 (623396, 0.72704558971350008),
 (660649, 0.72802253962870789),
 (45472, 0.7300207453550065),
 (664352, 0.73257566284555975),
 (564632, 0.7339424894

### New User - Provide a series of ratings

In [39]:
# Only show the first 10 records
test_ratings.filter(lambda r: r[0] == 9).map(lambda r: (r[0], r[1], r[2])).collect()[:10]

[(9, 9213, 3.0),
 (9, 9487, 3.0),
 (9, 9513, 5.0),
 (9, 9659, 5.0),
 (9, 9754, 1.0),
 (9, 9811, 4.0),
 (9, 10252, 3.0),
 (9, 10345, 4.0),
 (9, 10376, 1.0),
 (9, 10641, 1.0)]

### Recommend the Eating Mates of the User

In [31]:
evalEatingMateRecomd(9, test_ratings, trainModel)

417526 82% [71885, 71912, 6359]
549161 81% [71885, 6359, 78091]
 54383 79% [71885, 71912, 6359]
607208 79% [8882, 6359, 81313]
  2617 78% [71885, 71912, 6359]
480618 77% [8882, 71885, 71912]
566543 77% [78091, 71885, 71912]
611100 77% [71885, 71912, 6359]
530940 76% [71885, 6359, 71912]
673693 76% [71885, 71912, 6359]
Time Cost: 33.87s
MSE of User Vector: 1.17


In [32]:
evalEatingMateRecomd(2, test_ratings, trainModel)

390980 70% [61226, 80604, 11401]
122758 69% [61226, 67133, 82487]
580446 68% [61226, 79202, 11401]
 89126 68% [61226, 80604, 67133]
641505 68% [61226, 73203, 67133]
104408 68% [61226, 67133, 81961]
663715 68% [61226, 81961, 67133]
230925 67% [61226, 81961, 69348]
615188 67% [61226, 73203, 11401]
531732 67% [61226, 67133, 81961]
Time Cost: 30.68s
MSE of User Vector: 0.73


### Friends list of User 9

In [34]:
# Friends of User 9
friends = [2, 3, 466, 2141, 3587, 4043, 12840, 13866, 22447, 23636, 208888, 213233, 83682, 164654, 449598, 651072, 272274, 350958, 313860, 335183, 346899, 360696, 172400, 382269, 165575, 392534, 493566, 479112, 606539, 651724, 103245, 676122, 642526, 498303, 520182, 602193, 605821, 656609, 287720, 462134, 81991, 227902, 219928, 220376, 497194, 542947, 674672, 21094, 171797, 208358, 419478, 230047, 178452, 566913, 673854, 140296, 438499, 448999, 476155, 376394, 614785, 351478, 558378, 558666, 130302, 522346, 530471, 615818, 667789, 443120, 504583, 6751, 27953, 132705, 115644, 116184, 462175, 491307, 346313]
len(friends)

79

### Count the similarity between user and his friends

In [51]:
vu, err = getPredictedUserFeature(9, test_ratings, vBusiness)
sims = []
for fid in friends:
    if fid <= 1000: continue
    vf = getUserFeature(vUser, fid)
    dv = np.linalg.norm(vu-vf)
    sims.append( int(100 * getSimilarity(dv, vu)) )
print sims

[32, 39, 58, 14, 56, 17, 13, 20, 0, 0, 5, 0, 28, 49, 19, 0, 47, 38, 0, 0, 29, 0, 0, 0, 11, 61, 0, 44, 0, 3, 11, 24, 11, 0, 0, 0, 0, 0, 8, 25, 5, 0, 0, 28, 20, 0, 18, 0, 0, 0, 36, 0, 0, 36, 0, 0, 0, 0, 16, 16, 17, 38, 0, 0, 48, 4, 0, 34, 0, 0, 0, 0, 1, 0, 0, 0]


In [63]:
hist = np.histogram(sims, np.arange(0, 110, 10))
hist

(array([44, 11,  7,  7,  4,  2,  1,  0,  0,  0]),
 array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100]))

### Histogram

In [66]:
for i in range(10):
    print "%2d ~ %3d : %6.2f%%"%(i*10, (i+1)*10, 100*hist[0][i]/79)

 0 ~  10 :  55.00%
10 ~  20 :  13.00%
20 ~  30 :   8.00%
30 ~  40 :   8.00%
40 ~  50 :   5.00%
50 ~  60 :   2.00%
60 ~  70 :   1.00%
70 ~  80 :   0.00%
80 ~  90 :   0.00%
90 ~ 100 :   0.00%


# Filter

In [40]:
vUser = trainModel.userFeatures().cache()
vBusiness = trainModel.productFeatures().cache()

In [45]:
user_id_map = {}
business_id_map = {}

with open("data/user_map.jl", 'r') as f:
    for line in f:
        [lid, sit] = json.loads(line)
        user_id_map[sit] = lid
f.close()

with open("data/business_map.jl", 'r') as f:
    for line in f:
        [lid, sit] = json.loads(line)
        business_id_map[sit] = lid
f.close()

In [56]:
def searchForUser(uid):
    lid = user_id_map[uid]
    res = {}
    with open("data/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json", 'r') as f:
        for line in f:
            tmp = json.loads(line)
            if lid == tmp['user_id']:
                res = tmp
                break
    f.close()
    return res

def searchForBusiness(uid):
    lid = user_id_map[uid]
    res = {}
    with open("data/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json", 'r') as f:
        for line in f:
            tmp = json.loads(line)
            if lid == tmp['user_id']:
                res = tmp
                break
    f.close()
    return res

In [59]:
searchForUser(9)

{u'average_stars': 3.59,
 u'compliments': {u'cool': 5,
  u'cute': 1,
  u'funny': 8,
  u'hot': 5,
  u'more': 2,
  u'note': 3,
  u'photos': 1,
  u'plain': 5,
  u'profile': 1,
  u'writer': 3},
 u'elite': [],
 u'fans': 7,
 u'friends': [u'rpOyqD_893cqmDAtJLbdog',
  u'4U9kSBLuBDU391x6bxU-YA',
  u'Re447krbp0VQVX3Dk4Tsog',
  u'kJc9YBRwmmZ_PG0uLHuEPQ',
  u'Rir-YRPPClKXDFQbc3BsVw',
  u'zTWH9b_ItSdLOK9ypeFOIw',
  u'eCSCS17Y7Ie-_a6RmQgUJg',
  u'ZZ6WSb99R7kVHfFW8XG6TA',
  u'CA1CSw_uiapPotkJC9hXAQ',
  u'LdcJrVMG-oP_V9_jZGprfQ',
  u'R7-OHW_M_V4lYGbIT2hetQ',
  u'2TN0paDJnWx4FmYClFdErQ',
  u'JT5kM6wRwh7cNt8IAuX_sw',
  u'RZwkUvViHYEh5Z65--cVZw',
  u'Ie7tbGC1cG2OLfAaNqs62A',
  u'u00EOkApqyRP9YLlI90TKA',
  u'psf7Etrt3azIdhMhoPKmpA',
  u'h_TlJAgBfSXG4Yw7rkC1Kg',
  u'BfCysDOVY3lV6WSEYb92EQ',
  u'Mx-vxv_V-SQCe76w4RmUfA',
  u'WnI89yU2-swqftb0PW-0xQ',
  u'N7gupqGZWhTYlTr_wuMI1Q',
  u'9Ecy5YOkAyDv42tSNRxAlg',
  u'FK5idBTeUebFuvfQMSSVJg',
  u'LyQjgcpgL2rz_QH3cLFz8Q',
  u'2uk4R74YwMeBQFFnWj5HQQ',
  u'lXhOAP6ncg9E