# Chương trình tính dự đoán rating trên dữ liệu Neo4j

## Hai độ đo tương tự sử dụng là Cosin và Pearson

In [None]:
import math
from py2neo import Graph, Node
# Shared Neo4j connection used by every query in this notebook.
# NOTE(review): the bolt URL and credentials are hard-coded here; consider
# loading them from configuration or the environment instead.
graph = Graph(
    "bolt://localhost:7687",
    auth=("neo4j", "123"),
)

## Dự đoán rating bằng độ đo Cosin

In [None]:
def predictRatingCosin(user, movie):
    """Predict the rating of `movie` by `user` from cosine-similar users.

    The query pairs the user with every other user through co-rated movies,
    keeps the pairs sharing more than 5 movies whose cosine similarity
    exceeds 0.5, and returns the similarity-weighted average of those
    neighbours' ratings for `movie`. The query yields 0 when the summed
    similarity is not positive.

    Args:
        user: value matched against `User.id`.
        movie: value matched against `Movie.id`.

    Returns:
        Whatever `evaluate()` extracts from the query result (the `score`
        column).
    """
    # Count the movies rated by both u1 and u2, then compute the cosine
    # similarity between their rating vectors.
    query = (
        'MATCH (u1:User {id:$userid})-[x:RATED]->(m:Movie)<-[y:RATED]-(u2:User) '
        'WITH COUNT(m) AS numbermovies, SUM(x.rating * y.rating) AS xyDotProduct,'
        'SQRT(REDUCE(xDot = 0.0, a IN COLLECT(x.rating) | xDot + a^2)) AS xLength,'
        'SQRT(REDUCE(yDot = 0.0, b IN COLLECT(y.rating) | yDot + b^2)) AS yLength, '
        'u1, u2 WHERE numbermovies > 5 '
        'WITH u1, u2, CASE WHEN xLength*yLength>0 THEN xyDotProduct/(xLength * yLength) ELSE 0 END AS cosin WHERE cosin > 0.5 '
        'MATCH (u2)-[r:RATED]->(m:Movie{id:$movieid})'
        'RETURN CASE WHEN SUM(cosin)>0 THEN SUM(cosin * r.rating)/SUM(cosin) ELSE 0 END  AS score '
    )
    cursor = graph.run(query, userid=user, movieid=movie)
    return cursor.evaluate()

## Gợi ý phim cho user bằng độ đo Cosin lấy top k phim

In [None]:
def recommendCosin(user, K):
    """Recommend the top-K unseen movies for `user` using cosine similarity.

    Neighbours are the users sharing more than 5 co-rated movies with `user`
    and having a cosine similarity above 0.5. Every movie rated by a
    neighbour but not by `user` is scored with the similarity-weighted
    average of the neighbours' ratings; the K highest scores are returned.

    Args:
        user: value matched against `User.id`.
        K: maximum number of rows, passed to the query's `LIMIT`.

    Returns:
        The query result converted with `to_data_frame()`, with the columns
        `m.id` and `score`.
    """
    # Count the movies rated by both u1 and u2, then compute the cosine
    # similarity between their rating vectors.
    query = (
        'MATCH (u1:User {id:$userid})-[x:RATED]->(m:Movie)<-[y:RATED]-(u2:User) '
        'WITH COUNT(m) AS numbermovies, SUM(x.rating * y.rating) AS xyDotProduct,'
        'SQRT(REDUCE(xDot = 0.0, a IN COLLECT(x.rating) | xDot + a^2)) AS xLength,'
        'SQRT(REDUCE(yDot = 0.0, b IN COLLECT(y.rating) | yDot + b^2)) AS yLength, '
        'u1, u2 WHERE numbermovies > 5 '
        'WITH u1, u2, CASE WHEN xLength*yLength>0 THEN xyDotProduct/(xLength * yLength) ELSE 0 END AS cosin WHERE cosin > 0.5 '
        'MATCH (u2)-[r:RATED]->(m:Movie) WHERE NOT EXISTS( (u1)-[:RATED]->(m) )'
        'RETURN m.id, SUM(cosin * r.rating)/SUM(cosin) AS score ORDER BY score DESC LIMIT $K'
    )
    cursor = graph.run(query, userid=user, K=K)
    return cursor.to_data_frame()

## Dự đoán rating bằng độ đo Pearson

In [None]:
def predictRatingPearson(user, movie):
    """Predict the rating of `movie` by `user` from Pearson-similar users.

    The query builds a rating vector for `user` and for every other user,
    keeps the users sharing more than 5 rated movies whose Pearson
    similarity exceeds 0.1, and returns the user's mean rating plus the
    similarity-weighted average of the neighbours' mean-centred ratings
    for `movie`.

    Args:
        user: value matched against `User.id`.
        movie: value matched against `Movie.id`.

    Returns:
        Whatever `evaluate()` extracts from the query result (the `score`
        column). The caller in this file treats `None` as "no prediction".
    """
    # The query text, including its internal whitespace, is kept exactly as is.
    query = '''MATCH (u1:User {id:$userid})-[r1:RATED]->(m:Movie)
                    WITH u1, avg(r1.rating) AS u1_mean, gds.alpha.similarity.asVector(m, r1.rating) AS u1Vector
                    MATCH (u2:User)-[r2:RATED]->(m:Movie) WHERE u2 <> u1
                    WITH u1, u1Vector, u1_mean, u2, avg(r2.rating) AS u2_mean, gds.alpha.similarity.asVector(m, r2.rating) AS u2Vector
                    WHERE size(apoc.coll.intersection([v in u1Vector | v.category], [v in u2Vector | v.category])) > 5
                    WITH u1, u2, u1_mean, u2_mean, gds.alpha.similarity.pearson(u1Vector, u2Vector, {vectorType: "maps"}) AS pearson
                    WHERE pearson > 0.1
                    MATCH (u2)-[r:RATED]->(m:Movie{id:$movieid}) 
                    RETURN (u1_mean + SUM(pearson * (r.rating - u2_mean))/SUM(pearson)) AS score'''
    cursor = graph.run(query, userid=user, movieid=movie)
    return cursor.evaluate()

## Gợi ý phim cho user sử dụng độ đo Pearson lấy top k phim

In [None]:
def recommendPearson(user, K):
    """Recommend the top-K unseen movies for `user` using Pearson similarity.

    Neighbours are the other users sharing more than 5 rated movies with
    `user` and having a Pearson similarity above 0.1. Every movie rated by a
    neighbour but not by `user` is scored as the user's mean rating plus the
    similarity-weighted average of the neighbours' mean-centred ratings; the
    K highest scores are returned.

    Args:
        user: value matched against `User.id`.
        K: maximum number of rows, passed to the query's `LIMIT`.

    Returns:
        The query result converted with `to_data_frame()`, with the columns
        `m.id` and `score`.
    """
    # The query text, including its internal whitespace, is kept exactly as is.
    query = '''MATCH (u1:User {id:$userid})-[r1:RATED]->(m:Movie)
                    WITH u1, avg(r1.rating) AS u1_mean, gds.alpha.similarity.asVector(m, r1.rating) AS u1Vector
                    MATCH (u2:User)-[r2:RATED]->(m:Movie) WHERE u2 <> u1
                    WITH u1, u1Vector, u1_mean, u2, avg(r2.rating) AS u2_mean, gds.alpha.similarity.asVector(m, r2.rating) AS u2Vector
                    WHERE size(apoc.coll.intersection([v in u1Vector | v.category], [v in u2Vector | v.category])) > 5
                    WITH u1, u2, u1_mean, u2_mean, gds.alpha.similarity.pearson(u1Vector, u2Vector, {vectorType: "maps"}) AS pearson
                    WHERE pearson > 0.1
                    MATCH (u2)-[r:RATED]->(m:Movie) WHERE NOT EXISTS( (u1)-[:RATED]->(m) )
                    RETURN m.id, (u1_mean + SUM(pearson * (r.rating - u2_mean))/SUM(pearson)) AS score ORDER BY score DESC LIMIT $K'''
    cursor = graph.run(query, userid=user, K=K)
    return cursor.to_data_frame()

# Đánh giá độ chính xác

## Lưu dự đoán vào Neo4j

In [None]:
def listUM():
    """Fetch every `RATED_LATE` edge together with its stored predictions.

    Returns:
        The query result converted with `to_data_frame()`, with the columns
        `userid`, `movieid`, `rating`, `predCosin` and `predPearson`.
    """
    query = 'MATCH (u:User)-[r:RATED_LATE]->(m:Movie) RETURN u.id as userid, m.id as movieid, r.rating as rating, r.predCosin as predCosin, r.predPearson as predPearson'
    return graph.run(query).to_data_frame()

def predictData():
    """Compute and store both predictions on every `RATED_LATE` edge.

    For each (user, movie) pair returned by `listUM()`, the Pearson and the
    cosine predictions are computed and written to the edge as
    `r.predPearson` and `r.predCosin`. A prediction that comes back as
    `None` is stored as 0.

    Returns:
        None. The function only writes to the graph.
    """
    df = listUM()
    if df.empty:
        # Nothing to predict; an empty result may not even carry the
        # expected columns, so do not index into it.
        return

    # tolist() yields native Python scalars rather than NumPy ones, which is
    # what the query parameters should carry.
    pairs = zip(df["userid"].tolist(), df["movieid"].tolist())
    for uid, mid in pairs:
        predP = predictRatingPearson(uid, mid)
        predC = predictRatingCosin(uid, mid)
        # The original checked an undefined name `pred` here, which raised
        # UnboundLocalError; each prediction is now defaulted separately.
        if predP is None:
            predP = 0
        if predC is None:
            predC = 0
        graph.run('''MATCH (u:User{id:$userid})-[r:RATED_LATE]->(m:Movie{id:$movieid}) SET r.predPearson = $predP
                  SET r.predCosin = $predC''', userid=uid, movieid = mid, predP = predP, predC = predC)

## Độ chính xác dự đoán rating: MAE, RMSE

In [None]:
def mae_rmse(pred_column="predPearson"):
    """Compute MAE and RMSE of stored predictions against the true ratings.

    Args:
        pred_column: name of the prediction column of `listUM()` to
            evaluate. Defaults to "predPearson", the column the original
            implementation used; "predCosin" is the other column returned
            by `listUM()`.

    Returns:
        A `(mae, rmse)` tuple. Both values are NaN when there are no rows to
        evaluate, instead of raising ZeroDivisionError.
    """
    df = listUM()
    count = len(df)
    if count == 0:
        # The error metrics are undefined without any sample.
        return float("nan"), float("nan")

    # Iterate the two columns directly rather than rebuilding a row with
    # df.iloc[i] on every access.
    abs_errors = [
        abs(actual - predicted)
        for actual, predicted in zip(df["rating"].tolist(), df[pred_column].tolist())
    ]
    mae = sum(abs_errors) / count
    rmse = math.sqrt(sum(err * err for err in abs_errors) / count)
    return mae, rmse
# Compute both error metrics and print them (MAE first, then RMSE).
a, b = mae_rmse()
print(a, b)

## Độ chính xác Precision, Recall, F1 Score

In [None]:
def precision_recall(pred_column="predPearson", threshold=3):
    """Compute precision, recall and F1 of stored predictions.

    A rating counts as positive when it is greater than or equal to
    `threshold`; the true rating and the predicted rating are classified
    with the same threshold.

    Args:
        pred_column: name of the prediction column of `listUM()` to
            evaluate. Defaults to "predPearson", the column the original
            implementation used.
        threshold: minimum rating regarded as positive. Defaults to 3.

    Returns:
        A `(precision, recall, f1)` tuple. A metric whose denominator is
        zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    df = listUM()
    if df.empty:
        # No samples: every metric falls back to 0.0.
        return 0.0, 0.0, 0.0

    TP = 0
    FP = 0
    FN = 0
    # True negatives are not needed by any of the three metrics.
    for r, p in zip(df["rating"].tolist(), df[pred_column].tolist()):
        if r >= threshold and p >= threshold:
            TP += 1
        elif r >= threshold and p < threshold:
            FN += 1
        elif r < threshold and p >= threshold:
            FP += 1

    pre = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    rec = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * pre * rec / (pre + rec) if (pre + rec) > 0 else 0.0
    return pre, rec, f1