In [1]:
import os 
import pandas as pd 
import numpy as np
from pathlib import Path 
import flask
import datetime as dt

# Read data: point `path` at the directory where this repo is stored.
# The literal is a raw string so the Windows backslashes (`\G`, `\K`, `\P`, `\W`)
# are never parsed as (invalid) escape sequences.
path = r'D:\Giáo Trình\Kỳ 5\Kho và Khai Phá Dữ Liệu\Project\Web App'
# Every input file lives in the `Data` sub-directory of the repo.
dataDir = Path(path) / 'Data'

# Tabular inputs.
customerListId = pd.read_csv(dataDir / 'customer_id.csv')
productListId = pd.read_csv(dataDir / 'product_id.csv')
prodRules = pd.read_csv(dataDir / 'prod_rules.csv')
customerSegment = pd.read_csv(dataDir / 'rfm.csv')
recommendData = pd.read_csv(dataDir / 'recommendData.csv')

# Text dumps of the rating and similarity matrices; they are parsed later by `system`.
rawRating = (dataDir / 'spare_matrix.txt').read_text()
rawSimilarity = (dataDir / 'similarity.txt').read_text()

class system(object):
    """Item-based collaborative-filtering recommender for one customer.

    On construction the rating and similarity text dumps are parsed, a rating
    is predicted for every candidate item of `cusId`, and the best-scoring
    items are decoded to their real product ids.

    Attributes set by `__init__`:
        ratingData      -- list of `[itemId, customerId, rating]` rows.
        similarity      -- `{itemId: {otherItemId: similarity}}`, self-pairs removed.
        neighbor        -- maximum number of most-similar items used per prediction.
        productListId   -- rows of `[realProductId, alternativeId]`.
        predictedRating -- result of `predictRating` for `cusId`.
        recommend       -- real product ids of the top predictions.
    """

    # Parsed inputs, keyed by the raw text they were parsed from. Parsing walks
    # the whole dump character by character, so it is done once per distinct
    # text instead of once per instance. The cached structures are shared
    # between instances and must be treated as read-only.
    _ratingCache = {}
    _similarityCache = {}

    def __init__(self, cusId, rawRating = rawRating, rawSimilarity = rawSimilarity, 
                productListId = productListId.values.tolist(), neighbor = 10) :
        """Parse the inputs (or reuse the cached parse) and build the recommendation.

        Args:
            cusId: customer id, compared against the second index of each rating row.
            rawRating: text dump of the rating matrix, entries shaped `(i, j) value`.
            rawSimilarity: text dump of the item-item similarity matrix, same format.
            productListId: rows of `[realProductId, alternativeId]`.
            neighbor: maximum number of most-similar items used per prediction.
        """
        cls = type(self)
        if rawRating not in cls._ratingCache:
            cls._ratingCache[rawRating] = self.dataProcessing(rawRating)
        if rawSimilarity not in cls._similarityCache:
            cls._similarityCache[rawSimilarity] = self.cleanSimilarity(
                self.dataProcessing(rawSimilarity))
        self.ratingData = cls._ratingCache[rawRating]
        self.similarity = cls._similarityCache[rawSimilarity]
        self.neighbor = neighbor
        self.productListId = productListId
        self.predictedRating = self.predictRating(cusId, self.similarity, self.ratingData, self.neighbor)
        self.recommend = self.recommendation(self.predictedRating)

    def dataProcessing(self, data):
        """Parse a sparse-matrix text dump into a list of `[i, j, value]` rows.

        Newlines, tabs, spaces and colons are stripped first, leaving a string
        shaped like `(i,j)value(i,j)value...`. `i` and `j` are parsed as ints,
        `value` as a float rounded to 2 decimals.

        Args:
            data: the raw text; its first character is assumed to be the
                opening parenthesis of the first entry and is skipped.

        Returns:
            A list of `[i, j, value]` lists; empty when fewer than two
            characters remain after stripping.

        Raises:
            ValueError: when a coordinate or value cannot be converted.
        """
        text = data
        for separator in ('\n', '\t', ' ', ':'):
            text = text.replace(separator, '')

        ratingData = []
        row = []
        mark = 1  # start of the token currently being read
        for i in range(1, len(text)):
            char = text[i]
            if char == '(':
                # A new entry starts: everything since `mark` is the previous value.
                row.append(round(float(text[mark:i]), 2))
                ratingData.append(row)
                row = []
                mark = i + 1
            elif char == ',' or char == ')':
                # End of the first / second coordinate.
                row.append(int(text[mark:i]))
                mark = i + 1
        if len(text) > 1:
            # The last value runs to the end of the text. Slice to the end so
            # its final character is kept (slicing up to the last index would
            # turn `0.57` into `0.5`).
            row.append(round(float(text[mark:]), 2))
            ratingData.append(row)
        return ratingData

    def cleanSimilarity(self, data):
        """Turn `[a, b, score]` rows into a nested dict `{a: {b: score}}`.

        Rows whose two ids are equal (an item compared with itself) are
        dropped, so an item that is only similar to itself gets no key at all.
        """
        similarity = {}
        for row in data:
            if row[0] != row[1]:
                similarity.setdefault(row[0], {})[row[1]] = row[2]
        return similarity

    def checkCustomerId(self, similarity, ratedByCus):
        """Keep only the rated items that have a row in `similarity`.

        Args:
            similarity: nested similarity dict as built by `cleanSimilarity`.
            ratedByCus: `{itemId: rating}` for one customer.

        Returns:
            A new dict with the entries of `ratedByCus` whose key is in `similarity`.
        """
        return {item: rating for item, rating in ratedByCus.items() if item in similarity}

    def predictRating(self, cusId, similarity, ratingData, neighbor):
        """Predict ratings for the items similar to what `cusId` already rated.

        For every candidate item (an item listed as similar to a rated item
        and not itself rated), its `neighbor` most similar items are taken.
        The prediction is `sum(rating * sim) / sum(|sim|)` rounded to
        2 decimals; the numerator only receives neighbours the customer rated,
        the denominator receives every selected neighbour.

        Args:
            cusId: customer id matched against index 1 of each rating row.
            similarity: nested similarity dict as built by `cleanSimilarity`.
            ratingData: list of `[itemId, customerId, rating]` rows.
            neighbor: cap on the neighbours per candidate; a non-positive
                value applies no cap (historical behaviour, kept as is).

        Returns:
            `[]` when the customer has no rated item present in `similarity`;
            otherwise a dict `{itemId: predictedRating}` ordered by predicted
            rating, highest first.
        """
        ratedByCus = {}
        for row in ratingData:
            if row[1] == cusId:
                ratedByCus[row[0]] = row[2]
        ratedByCus = self.checkCustomerId(similarity, ratedByCus)
        if len(ratedByCus) == 0:
            return []

        # Candidates: anything similar to a rated item that is not rated yet.
        potentialItem = set()
        for rated in ratedByCus:
            for other in similarity[rated]:
                if other not in ratedByCus:
                    potentialItem.add(other)

        predictedRating = {}
        for candidate in potentialItem:
            # A candidate can appear as a neighbour without having a row of its
            # own when the similarity data is not symmetric; nothing can be
            # predicted for it, so skip it instead of raising KeyError.
            neighbours = similarity.get(candidate)
            if not neighbours:
                continue
            ranked = sorted(neighbours.items(), key=lambda pair: pair[1], reverse=True)
            selected = ranked[:neighbor] if neighbor > 0 else ranked

            numerator = 0
            denominator = 0
            for other, score in selected:
                if other in ratedByCus:
                    numerator += ratedByCus[other] * score
                denominator += abs(score)
            if denominator == 0:
                continue
            predictedRating[candidate] = round(numerator / denominator, 2)
        return dict(sorted(predictedRating.items(), key=lambda pair: pair[1], reverse=True))

    def decodeProductId(self, alternativeId):
        """Translate alternative product ids into real product ids.

        Args:
            alternativeId: iterable of alternative ids.

        Returns:
            The real ids, in input order. Ids with no match in
            `self.productListId` are omitted; when an alternative id occurs
            several times in the table the first row wins.
        """
        realIdByAlternative = {}
        for row in self.productListId:
            realIdByAlternative.setdefault(row[1], row[0])
        return [realIdByAlternative[alt] for alt in alternativeId if alt in realIdByAlternative]

    def recommendation(self, predictedRating, topN = 5):
        """Return the real product ids of the first `topN` predicted items.

        `predictedRating` is expected in best-first order, as returned by
        `predictRating` (the empty list is accepted too).
        """
        return self.decodeProductId(list(predictedRating)[:topN])

In [2]:
# Customer ids 0 .. CUSTOMER_COUNT - 1 are evaluated.
CUSTOMER_COUNT = 46822

# Build one model and reuse its parsed rating / similarity data for every
# customer; constructing `system(i)` inside the loop would re-parse both text
# dumps on each iteration.
evalModel = system(0)

# test[customerId] = mean predicted rating of that customer, rounded to an int.
# Customers without any prediction are left out.
test = {}
for cusId in range(CUSTOMER_COUNT):
    predicted = evalModel.predictRating(
        cusId, evalModel.similarity, evalModel.ratingData, evalModel.neighbor)
    if len(predicted) == 0:
        continue
    test[cusId] = round(sum(predicted.values()) / len(predicted))

In [3]:
import math
# Accumulators read by the metrics cell:
#   sumTest -- sum of absolute errors (for MAE)
#   sumRMSE -- sum of squared errors (for MSE / RMSE)
sumTest = 0
sumRMSE = 0
a = system(0).ratingData

# Index the first rating row of every customer in one pass, instead of
# rescanning the whole rating list for each evaluated customer.
firstRating = {}
for row in a:
    firstRating.setdefault(row[1], row[2])

for i in test:
    if i not in firstRating:
        continue
    # Error between the customer's first recorded rating and the mean predicted
    # rating stored in `test`, rounded to an int as in the original evaluation.
    error = round(firstRating[i] - test[i])
    # MAE needs the magnitude of the error: summing signed errors lets over-
    # and under-estimates cancel out and yields the mean bias instead.
    sumTest += abs(error)
    sumRMSE += error ** 2

In [4]:
# Report the error metrics over the evaluated customers. `sumTest` and
# `sumRMSE` are the accumulators filled by the preceding evaluation cell.
evaluated = len(test)
mse = sumRMSE / evaluated
print('MAE:', sumTest / evaluated)
print('RMSE:', math.sqrt(mse))
print('MSE:', mse)

MAE: 1.3793103448275863
RMSE: 1.8382900600361158
MSE: 3.3793103448275863
