In [1]:
import random
import math
import operator
import pandas as pd

In [2]:
# Fail fast if the dataset is missing. The context manager closes the handle
# right away instead of leaking an open file for the lifetime of the kernel
# (nothing in the notebook reads from this handle).
with open('user_taggedbookmarks-timestamps.dat') as file:
    pass

In [3]:
# Load the tab-separated tagging log into a DataFrame.
data = pd.read_csv('user_taggedbookmarks-timestamps.dat', sep='\t')

In [8]:
# Preview the first five rows of the tagging log.
data.head(n=5)

Unnamed: 0,userID,bookmarkID,tagID,timestamp
0,8,1,1,1289255362000
1,8,2,1,1289255159000
2,8,7,1,1289238901000
3,8,7,6,1289238901000
4,8,7,7,1289238901000


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437593 entries, 0 to 437592
Data columns (total 4 columns):
userID        437593 non-null int64
bookmarkID    437593 non-null int64
tagID         437593 non-null int64
timestamp     437593 non-null int64
dtypes: int64(4)
memory usage: 13.4 MB


In [5]:
# NOTE(review): this repeats the `data.info()` call from the earlier cell and
# prints the same summary; the cell looks redundant.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437593 entries, 0 to 437592
Data columns (total 4 columns):
userID        437593 non-null int64
bookmarkID    437593 non-null int64
tagID         437593 non-null int64
timestamp     437593 non-null int64
dtypes: int64(4)
memory usage: 13.4 MB


In [9]:
class SimpleTagBased():
    """Simple tag-based top-N recommender with a built-in hold-out evaluation.

    Rows of (userID, bookmarkID, tagID) are read from a tab-separated file.
    Each (user, book) pair is randomly assigned to a train or a test set, and
    three count tables are built from the train set:

    * ``user_tags[user][tag]``  - how often the user applied the tag
    * ``tag_books[tag][book]``  - how often the tag was applied to the book
    * ``user_books[user][book]`` - how many tags the user put on the book

    The score of a book the user has not tagged yet is the sum, over the
    user's tags, of ``user_tags[user][tag] * tag_books[tag][book]``.
    """

    def __init__(self, filename, test_ratio=0.2, seed=2020):
        """Run the whole pipeline: load, split, build tables, print evaluation.

        Args:
            filename: tab-separated data file, handed to ``pd.read_csv``.
            test_ratio: probability that a (user, book) pair goes to the test set.
            seed: seed for the random train/test split.
        """
        self.filename = filename
        self.load_data()
        self.random_split_data(test_ratio, seed)
        self.init_state()
        self.test_recommend()

    def load_data(self):
        """Read the data file into ``self.records`` as ``{user: {book: [tag, ...]}}``."""
        print('loading data.....')
        data = pd.read_csv(self.filename, delimiter='\t')
        self.records = {}
        # Walk the three columns in lockstep instead of DataFrame.iterrows(),
        # which materialises a Series object for every row.
        for uid, bid, tid in zip(data['userID'], data['bookmarkID'], data['tagID']):
            self.records.setdefault(uid, {}).setdefault(bid, []).append(tid)
        print('Size of Data Set', data.shape[0])
        print('Number of person who sets tags', len(self.records))
        print('Finish Loading Data')

    def random_split_data(self, ratio, seed=2020):
        """Assign every (user, book) pair of ``self.records`` to test or train.

        A pair goes to ``self.test`` when ``random.random() < ratio`` and to
        ``self.train`` otherwise; all tags of the pair move together.

        Args:
            ratio: probability of a pair landing in the test set.
            seed: value passed to ``random.seed`` so the split is repeatable.
        """
        print('spliting data randomly.....')
        random.seed(seed)
        self.train = {}
        self.test = {}
        for user, books in self.records.items():
            for book, tags in books.items():
                target = self.test if random.random() < ratio else self.train
                # Copy the tag list so the split never aliases self.records.
                target.setdefault(user, {})[book] = list(tags)
        print('Size of train ', len(self.train), '\nSize of test ', len(self.test))

    def init_state(self):
        """Build ``user_tags``, ``tag_books`` and ``user_books`` from ``self.train``."""
        print('initing the mainly variable.......')
        self.user_tags = {}
        self.tag_books = {}
        self.user_books = {}
        for user, books in self.train.items():
            for book, tags in books.items():
                for tag in tags:
                    # user -> tag usage count
                    self._add_value2mat(self.user_tags, user, tag, 1)
                    # tag -> book usage count
                    self._add_value2mat(self.tag_books, tag, book, 1)
                    # user -> book count (one per tag the user put on the book)
                    self._add_value2mat(self.user_books, user, book, 1)
        print('user_tags, tag_books, user_books initialize finish.')
        print('size of user_tags is {}, size of tag_books is {}, size of user_books is {}.'.format(len(self.user_tags), len(self.tag_books), len(self.user_books)))

    def _add_value2mat(self, dic, key, item, value=1):
        """Add ``value`` to ``dic[key][item]``, creating missing levels on the way."""
        row = dic.setdefault(key, {})
        row[item] = row.get(item, 0) + value

    def precision_and_recall(self, N):
        """Evaluate top-``N`` recommendations against ``self.test``.

        Only test users that also appear in ``self.train`` are evaluated.

        Args:
            N: length of the recommendation list requested per user.

        Returns:
            ``(precision, recall)`` where precision is hits divided by
            ``N * evaluated users`` and recall is hits divided by the number of
            test books of the evaluated users. A ratio whose denominator is
            zero (no evaluable user, or ``N == 0``) is reported as ``0.0``
            instead of raising ``ZeroDivisionError``.
        """
        print('calcing the precision and recall......')
        hit = 0
        h_recall = 0
        h_precision = 0
        for user, books in self.test.items():
            # Without training data for the user there is nothing to rank.
            if user not in self.train:
                continue
            rank = self.recommend(user, N)
            hit += sum(1 for book, _ in rank if book in books)
            h_recall += len(books)
            h_precision += N
        precision = float(hit) / h_precision if h_precision else 0.0
        recall = float(hit) / h_recall if h_recall else 0.0
        return precision, recall

    # get Top N
    def recommend(self, user, N):
        """Return up to ``N`` ``(book, score)`` pairs for ``user``, best first.

        Books the user already tagged in the train set are excluded. A user
        without any training data gets an empty list rather than a ``KeyError``.
        """
        recommend_books = {}
        tagged_books = self.user_books.get(user, {})
        for tag, wut in self.user_tags.get(user, {}).items():
            for book, wti in self.tag_books[tag].items():
                if book in tagged_books:
                    continue
                # score(book) = sum over the user's tags of wut * wti
                recommend_books[book] = recommend_books.get(book, 0) + wti * wut
        return sorted(recommend_books.items(), key=operator.itemgetter(1), reverse=True)[0:N]

    def test_recommend(self, top_ns=(5, 10, 20, 40, 60, 80, 100)):
        """Print precision and recall for each list length in ``top_ns``."""
        print('result for result evaluation:')
        print("%3s %10s %10s" % ('N', "precision", 'recall'))
        for n in top_ns:
            precision, recall = self.precision_and_recall(n)
            print(precision, recall)
            print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [None]:
# Constructing the recommender runs the full pipeline and prints the evaluation table.
stb = SimpleTagBased(filename='user_taggedbookmarks-timestamps.dat')

loading data.....
Size of Data Set 437593
Number of person who sets tags 1867
Finish Loading Data
spliting data randomly.....
Size of train  1864 
Size of test  1775
initing the mainly variable.......
user_tags, tag_books, user_books initialize finish.
size of user_tags is 1864, size of tag_books is 36798, size of user_books is 1864.
result for result evaluation:
  N  precision     recall
calcing the precision and recall......
0.009932279909706547 0.00421072778601847
  5      0.993%      0.421%
calcing the precision and recall......


|                    | Actual Positive | Actual Negative |
|:------------------:|:---------------:|:---------------:|
| Predicted Positive | TP              | FP              |
| Predicted Negative | FN              | TN              |

$$Acc = \frac{TP+TN}{TP+FP+FN+TN}$$
$$Pre = \frac{TP}{TP+FP}$$
$$Recall = \frac{TP}{TP+FN}$$
$$r_{ui} = \sum_{t} w_{ut} \times w_{ti}$$