In [2]:
from pyspark.mllib.recommendation import Rating

In [3]:
import numpy as np
import json, time
from operator import add

In [4]:
# Read the raw review file; every line holds one JSON-encoded record.
data = sc.textFile("data/reviews.jl")

# Fields 0, 1 and 2 of each decoded record are coerced to int, int and float
# and wrapped in an MLlib Rating. The resulting RDD is marked for caching.
ratings = (
    data.map(json.loads)
        .map(lambda rec: Rating(int(rec[0]), int(rec[1]), float(rec[2])))
        .cache()
)

In [5]:
# Driver-side lookup table built from data/business.jl: field 0 of each JSON
# record becomes the key and fields 1 and 2 are kept as a (field1, field2)
# tuple. Presumably that is (name, categories) -- TODO confirm against the file.
business_dict = (
    sc.textFile("data/business.jl")
      .map(json.loads)
      .map(lambda rec: (rec[0], (rec[1], rec[2])))
      .collectAsMap()
)

# Same construction for data/user.jl: field 0 of each record maps to field 1.
user_dict = (
    sc.textFile("data/user.jl")
      .map(json.loads)
      .map(lambda rec: (rec[0], rec[1]))
      .collectAsMap()
)

In [14]:
def loadModel(model_dir="target/final"):
    """Open the pickled user and business feature RDDs stored under ``model_dir``.

    Args:
        model_dir: Directory that contains the ``vUser`` and ``vBusiness``
            pickle files. The default is the path this notebook used to
            hard-code, so a bare ``loadModel()`` call behaves as before.

    Returns:
        A ``(vUser, vBusiness)`` tuple of RDDs.

    Note:
        ``sc.pickleFile`` only defines the RDDs; Spark reads the data when an
        action runs on them. The "Time Cost" printed here therefore covers the
        RDD setup only, not the actual load.
    """
    start_time = time.time()
    vUser = sc.pickleFile(model_dir + "/vUser")
    vBusiness = sc.pickleFile(model_dir + "/vBusiness")
    end_time = time.time()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Time Cost: %.2fs" % (end_time - start_time))
    return vUser, vBusiness

vUser, vBusiness = loadModel()

Time Cost: 0.12s


In [15]:
# Mark both feature RDDs for caching so that the repeated scans made by the
# lookup and nearest-neighbour helpers can reuse data Spark has already loaded.
# The second call is the cell's last expression, so its RDD repr is displayed.
vUser.cache()
vBusiness.cache()

MapPartitionsRDD[22] at objectFile at NativeMethodAccessorImpl.java:-2

In [16]:
# Row counts of the two feature RDDs; count() is an action, so this is the
# point where Spark actually reads (and caches) the data.
print(vUser.count())
print(vBusiness.count())

686556
85539


In [23]:
def getKNN(vUser, vu, k):
    """Find the ``k`` rows of ``vUser`` whose vectors lie closest to ``vu``.

    Each row is read as ``(id, vector)``. Closeness is ``np.linalg.norm`` of
    ``vu - vector`` (the Euclidean distance for 1-D vectors).

    Args:
        vUser: RDD of ``(id, vector)`` rows.
        vu: Query vector (NumPy array).
        k: Number of rows to return.

    Returns:
        A list of at most ``k`` ``(id, distance)`` pairs, nearest first.
    """
    def with_distance(row):
        return (row[0], np.linalg.norm(vu - row[1]))

    def nearest_first(pair):
        # top() keeps the largest keys, so rank by negated distance.
        return -pair[1]

    distances = vUser.map(with_distance)
    return distances.top(k, key=nearest_first)

def getUserFeature(vUser, uid):
    """Return the feature vector stored for ``uid``, or None if there is none.

    Args:
        vUser: RDD of ``(id, vector)`` rows.
        uid: Id to look up; compared with ``==`` against field 0 of each row.

    Returns:
        The vector of the first matching row as a NumPy array, or ``None``
        when no row matches.
    """
    # take(1) both answers "is there a match?" and fetches it in one Spark
    # job; an isEmpty() check followed by first() would run the filter twice.
    matches = vUser.filter(lambda r: r[0] == uid).take(1)
    if not matches:
        return None
    return np.array(matches[0][1])

def getProductFeature(vBusiness, bid):
    """Return the feature vector stored for ``bid``, or None if there is none.

    Args:
        vBusiness: RDD of ``(id, vector)`` rows.
        bid: Id to look up; compared with ``==`` against field 0 of each row.

    Returns:
        The vector of the first matching row as a NumPy array, or ``None``
        when no row matches.
    """
    # take(1) both answers "is there a match?" and fetches it in one Spark
    # job; an isEmpty() check followed by first() would run the filter twice.
    matches = vBusiness.filter(lambda r: r[0] == bid).take(1)
    if not matches:
        return None
    return np.array(matches[0][1])

def getRecommedProduct(vBusiness, vu, num=10):
    """Rank businesses by the dot product of their vector with ``vu``.

    Only rows whose id is a key of the module-level ``business_dict`` are
    considered.

    Args:
        vBusiness: RDD of ``(id, vector)`` rows.
        vu: Vector to score every business against.
        num: How many ids to return (default 10).

    Returns:
        A list of at most ``num`` business ids, highest score first.
    """
    def is_known(row):
        return row[0] in business_dict

    def score(row):
        return (row[0], np.array(row[1]).dot(vu))

    best = vBusiness.filter(is_known).map(score).top(num, key=lambda pair: pair[1])
    return [pair[0] for pair in best]

def getTaste(uid, num_candidates=1000, num_recommend=5, num_neighbors=5, num_tastes=50):
    """Build the recommendation summary for one user.

    Uses the module-level ``vUser``, ``vBusiness``, ``business_dict``,
    ``user_dict`` and ``sc``.

    Args:
        uid: Id of the user to summarise.
        num_candidates: How many top-scored businesses to pull (default 1000).
        num_recommend: How many of those to report as recommendations
            (default 5).
        num_neighbors: How many nearest other users to report (default 5).
        num_tastes: How many of the most frequent tags to report (default 50).

    Returns:
        ``(recomd, users, taste)`` where
        - ``recomd`` is field 0 of the ``business_dict`` entry for each of the
          first ``num_recommend`` candidates,
        - ``users`` is the ``user_dict`` entry of each nearest other user,
        - ``taste`` is a list of ``(tag, count)`` pairs, most frequent first,
          counted over field 1 of the ``business_dict`` entry of every
          candidate.
        Returns ``([], [], [])`` when ``uid`` has no feature vector.
    """
    start_time = time.time()
    vu = getUserFeature(vUser, uid)
    if vu is None:
        return [], [], []

    res = list(getRecommedProduct(vBusiness, vu, num_candidates))
    recomd = [business_dict[bid][0] for bid in res[:num_recommend]]

    # Exclude the user explicitly instead of dropping the first neighbour:
    # when another user has an identical vector (distance 0 as well), the
    # first hit is not guaranteed to be the user themselves.
    others = vUser.filter(lambda r: r[0] != uid)
    users = [user_dict[r[0]] for r in getKNN(others, vu, num_neighbors)]

    taste = sc.parallelize(res).flatMap(lambda bid: business_dict[bid][1])\
        .map(lambda tag: (tag, 1)).reduceByKey(add)\
        .takeOrdered(num_tastes, key=lambda r: -r[1])

    end_time = time.time()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Time Cost: %.2fs" % (end_time - start_time))
    return recomd, users, taste

In [26]:
# Compute the taste summary for user ids 1-10 and append one JSON line per
# user to cache.jl. The file is opened in append mode, so re-running this cell
# adds the same users again.
with open('cache.jl', "a") as f:
    for i in range(1, 11):
        print("Start Calculation On %d" % i)
        r, u, t = getTaste(i)
        f.write(json.dumps([i, r, u, t]) + '\n')
        # Flush after every user so finished results reach disk even if this
        # slow loop is interrupted.
        f.flush()
# No explicit f.close() is needed: leaving the with-block closes the file.

Start Calculation On 1
Time Cost: 14.54s
Start Calculation On 2
Time Cost: 14.00s
Start Calculation On 3
Time Cost: 13.31s
Start Calculation On 4
Time Cost: 11.41s
Start Calculation On 5
Time Cost: 15.73s
Start Calculation On 6
Time Cost: 17.02s
Start Calculation On 7
Time Cost: 15.03s
Start Calculation On 8
Time Cost: 11.76s
Start Calculation On 9
Time Cost: 13.33s
Start Calculation On 10
Time Cost: 13.72s


In [27]:
# Compute the taste summary for user ids 11-20 and append one JSON line per
# user to cache.jl. The file is opened in append mode, so re-running this cell
# adds the same users again.
with open('cache.jl', "a") as f:
    for i in range(11, 21):
        print("Start Calculation On %d" % i)
        r, u, t = getTaste(i)
        f.write(json.dumps([i, r, u, t]) + '\n')
        # Flush after every user so finished results reach disk even if this
        # slow loop is interrupted.
        f.flush()
# No explicit f.close() is needed: leaving the with-block closes the file.

Start Calculation On 11
Time Cost: 14.51s
Start Calculation On 12
Time Cost: 11.71s
Start Calculation On 13
Time Cost: 14.67s
Start Calculation On 14
Time Cost: 13.64s
Start Calculation On 15
Time Cost: 12.95s
Start Calculation On 16
Time Cost: 11.27s
Start Calculation On 17
Time Cost: 12.79s
Start Calculation On 18
Time Cost: 12.79s
Start Calculation On 19
Time Cost: 13.28s
Start Calculation On 20
Time Cost: 11.76s


In [28]:
# Compute the taste summary for user ids 21-30 and append one JSON line per
# user to cache.jl. The file is opened in append mode, so re-running this cell
# adds the same users again.
with open('cache.jl', "a") as f:
    for i in range(21, 31):
        print("Start Calculation On %d" % i)
        r, u, t = getTaste(i)
        f.write(json.dumps([i, r, u, t]) + '\n')
        # Flush after every user so finished results reach disk even if this
        # slow loop is interrupted.
        f.flush()
# No explicit f.close() is needed: leaving the with-block closes the file.

Start Calculation On 21
Time Cost: 13.86s
Start Calculation On 22
Time Cost: 13.06s
Start Calculation On 23
Time Cost: 13.56s
Start Calculation On 24
Time Cost: 11.32s
Start Calculation On 25
Time Cost: 12.84s
Start Calculation On 26
Time Cost: 15.51s
Start Calculation On 27
Time Cost: 13.06s
Start Calculation On 28
Time Cost: 11.33s
Start Calculation On 29
Time Cost: 17.19s
Start Calculation On 30
Time Cost: 13.62s


In [29]:
# Compute the taste summary for user ids 31-40 and append one JSON line per
# user to cache.jl. The file is opened in append mode, so re-running this cell
# adds the same users again.
with open('cache.jl', "a") as f:
    for i in range(31, 41):
        print("Start Calculation On %d" % i)
        r, u, t = getTaste(i)
        f.write(json.dumps([i, r, u, t]) + '\n')
        # Flush after every user so finished results reach disk even if this
        # slow loop is interrupted.
        f.flush()
# No explicit f.close() is needed: leaving the with-block closes the file.

Start Calculation On 31
Time Cost: 15.55s
Start Calculation On 32
Time Cost: 11.37s
Start Calculation On 33
Time Cost: 12.74s
Start Calculation On 34
Time Cost: 13.10s
Start Calculation On 35
Time Cost: 12.94s
Start Calculation On 36
Time Cost: 11.31s
Start Calculation On 37
Time Cost: 12.82s
Start Calculation On 38
Time Cost: 13.11s
Start Calculation On 39
Time Cost: 12.76s
Start Calculation On 40
Time Cost: 11.13s


In [30]:
# Compute the taste summary for user ids 41-60 and append one JSON line per
# user to cache.jl. The file is opened in append mode, so re-running this cell
# adds the same users again.
with open('cache.jl', "a") as f:
    for i in range(41, 61):
        print("Start Calculation On %d" % i)
        r, u, t = getTaste(i)
        f.write(json.dumps([i, r, u, t]) + '\n')
        # Flush after every user so finished results reach disk even if this
        # slow loop is interrupted.
        f.flush()
# No explicit f.close() is needed: leaving the with-block closes the file.

Start Calculation On 41
Time Cost: 13.06s
Start Calculation On 42
Time Cost: 13.19s
Start Calculation On 43
Time Cost: 13.04s
Start Calculation On 44
Time Cost: 11.38s
Start Calculation On 45
Time Cost: 13.06s
Start Calculation On 46
Time Cost: 13.98s
Start Calculation On 47
Time Cost: 12.83s
Start Calculation On 48
Time Cost: 11.16s
Start Calculation On 49
Time Cost: 12.81s
Start Calculation On 50
Time Cost: 12.82s
Start Calculation On 51
Time Cost: 12.78s
Start Calculation On 52
Time Cost: 11.45s
Start Calculation On 53
Time Cost: 13.47s
Start Calculation On 54
Time Cost: 13.72s
Start Calculation On 55
Time Cost: 12.88s
Start Calculation On 56
Time Cost: 11.40s
Start Calculation On 57
Time Cost: 12.78s
Start Calculation On 58
Time Cost: 12.93s
Start Calculation On 59
Time Cost: 12.82s
Start Calculation On 60
Time Cost: 11.13s
