In [1]:
# usual imports
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import recsys.algorithm
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD


In [2]:
bookRatings = pd.read_csv("BX-Book-Ratings.csv", sep = ";", quotechar = "\"")

users = pd.read_csv("BX-Users.csv", sep = ";", quotechar = "\"", escapechar = "\\")

books = pd.read_csv("BX-Books.csv", sep = ";", quotechar = "\"", escapechar = "\\")


In [3]:
def lab14():
    movies = pd.read_table('movies.dat', sep = '::', names = [ 'ITEMID', 'Title', 'Genres'], index_col = 'ITEMID')

    if False:
        print movies.head()

    ratings = pd.read_table('ratings.dat', sep = '::', names = [ 'UserID', 'MovieID', 'Rating', 'Timestamp' ])

    if False:
        print ratings.head()

    if True:
        svd = SVD()
        svd.load_data(filename = 'ratings.dat', sep = '::', format = {'col' : 0, 'row' : 1, 'value' : 2, 'ids' : int})

        k = 100
        svd.compute(k = k, min_values = 10, pre_normalize = None, mean_center = True, post_normalize = True,  savefile = '/tmp/movielens')
    else:
        svd = SVD(filename = '/tmp/movielens')

    if True:
        if False:
            movies[movies.Title == "Toy Story (1995)"]
            movies[movies.Title == "Bug's Life, A (1998)"]
        ITEMID1 = 1    # Toy Story (1995)
        ITEMID2 = 2355 # A bug's life (1998)
        print "Similarity between %d and %d is %8.6f" % (ITEMID1, ITEMID2, svd.similarity(ITEMID1, ITEMID2))

    if True:
        print "Similar to %d" % (ITEMID1)
        for s in svd.similar(ITEMID1):
            print "\t%-5d\t%8.6f" % (s[0], s[1])

    if True:
        MIN_RATING = 0.0
        MAX_RATING = 5.0
        ITEMID = 1
        USERID = 1
        print "Predict : %5d" % (svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING))

        print "Get Matrix : %5d" % (svd.get_matrix().value(ITEMID, USERID))

        print "For User %d" % (USERID)
        for s in  svd.recommend(USERID, is_row = False):
            print "\t%-5d\t%8.6f" % (s[0], s[1])

        print "For Item %d" % (ITEMID)
        for s in  svd.recommend(ITEMID):
            print "\t%-5d\t%8.6f" % (s[0], s[1])


In [4]:
def bookISBN(br, isbn, k):
    n = 250
    top_n = br.ISBN.value_counts().index[ : n]
    df = br[br.ISBN.isin(top_n)]

    df_pivot = df.pivot("ISBN", "User-ID", "Book-Rating")
    df_pivot = df_pivot.fillna(0)

    if False:
        print df_pivot.info()
        print df_pivot.head()

    dists = cosine_similarity(df_pivot)

    dists = pd.DataFrame(dists)
    dists.columns = df_pivot.index
    dists.index = df_pivot.index

    if False:
        print dists.info()
        print dists.head()
        print dists[isbn].head()

    books_summed = dists[isbn].apply(lambda row: np.sum(row), axis = 1)
    books_summed = books_summed.order(ascending = False)

    ranked_books = books_summed.index[books_summed.index.isin(isbn) == False]

    ranked_books = ranked_books.tolist()

    print ranked_books[:k]


In [5]:
def bookUSER(br, user, k):
    n = 50
    top_n = br["User-ID"].value_counts().index[ : n]
    df = br[br["User-ID"].isin(top_n)]

    df_pivot = df.pivot("User-ID", "ISBN", "Book-Rating")
    df_pivot = df_pivot.fillna(0)

    if False:
        print df_pivot.info()
        print df_pivot.head()

    dists = cosine_similarity(df_pivot)

    dists = pd.DataFrame(dists)
    dists.columns = df_pivot.index
    dists.index = df_pivot.index

    if False:
        print dists.info()
        print dists.head()
        print dists[user].head()

    users_summed = dists[user].apply(lambda row: np.sum(row), axis = 1)
    users_summed = users_summed.order(ascending = False)

    ranked_users = users_summed.index[users_summed.index.isin(user) == False]

    ranked_users = ranked_users.tolist()

    print ranked_users[:k]


In [6]:
def disp():
    print "Book Ratings %d" % (len(bookRatings))
    print bookRatings.info()
    print "Users %d" % (len(users))
    print users.info()
    print "Books %d" % (len(books))
    print books.info()


In [7]:
def recommend(usr):
    svd = SVD()
    svd.load_data(filename = "nBX-Book-Ratings.csv", sep = ";", format = {'col' : 0, 'row' : 1, 'value' : 2, 'ids' : int})
    k = 100
    svd.compute(k = k, min_values = 10, pre_normalize = None, mean_center = True, post_normalize = True,  savefile = '/tmp/booklens')
    for s in svd.recommend(usr, is_row = False):
        bookframe = books[books.ISBN == s[0]]
        if (len(bookframe) == 0):
            ns = "0%s" % (s[0])
            bookframe = books[books.ISBN == ns]
            if (len(bookframe) == 0):
                continue;
        bookname = bookframe.to_string(columns = [1], index = False, header = False)
        print "%-15s %s" % (s[0], bookname)


In [8]:
class hwloc(object):
    def __init__(self, pname, pcount):
        self.name = pname
        self.count = pcount

def location():
    clist = {}
    uclist = []
    for looploc in users["Location"]:
        cname = looploc.split(',')
        if (len(cname) > 2):
            name = cname[2]
            if (name not in uclist):
                uclist.append(name)
                clist[name] = 1
            else:
                clist[name] += 1;

    locarr = [];
    for c in clist:
        lloc = hwloc(c, clist[c])
        locarr.append(lloc)

    slocarr = sorted(locarr, key = lambda hwloc: hwloc.count, reverse = True)

    for i in range(10):
        print "%-35s %20d" % (slocarr[i].name, slocarr[i].count)


In [9]:
lab14()

Loading ratings.dat
..........|
Creating matrix (1000209 tuples)
Matrix density is: 4.4684%
Updating matrix: squish to at least 10 values
Computing svd k=100, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True
Saving svd model to /tmp/movielens
Similarity between 1 and 2355 is 0.677069
Similar to 1
	1    	1.000000
	3114 	0.870604
	2355 	0.677069
	588  	0.580735
	595  	0.460318
	1907 	0.445894
	364  	0.429082
	2081 	0.425666
	3396 	0.424741
	2761 	0.404394
Predict :     5
Get Matrix :     5
For User 1
	2028 	5.401845
	527  	5.349814
	2905 	5.213385
	318  	5.205211
	1193 	5.194219
	3114 	5.175394
	1    	5.171426
	2019 	5.103744
	1178 	5.096276
	1207 	5.090305
For Item 1
	869  	6.821550
	4086 	6.266765
	549  	6.239406
	1343 	6.216308
	1586 	6.039894
	840  	5.961663
	1676 	5.896234
	4595 	5.889457
	2691 	5.873509
	2665 	5.849869




In [10]:
disp()

Book Ratings 1149780
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 35.1+ MB
None
Users 278858
<class 'pandas.core.frame.DataFrame'>
Int64Index: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 8.5+ MB
None
Books 271379
<class 'pandas.core.frame.DataFrame'>
Int64Index: 271379 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271379 non-null object
Book-Title             271379 non-null object
Book-Author            271379 non-null object
Year-Of-Publication    271379 non-null int64
Publisher              271377 non-null object
Image-URL-S            271379 non-null object
Ima

In [11]:
bookISBN(bookRatings, ["0060987103"], 5)

['0060987529', '0375706771', '1400031354', '0064407667', '0385335881']


In [12]:
bookUSER(bookRatings, [11676], 5)

ValueError: Index contains duplicate entries, cannot reshape

In [13]:
recommend(11676)

Loading nBX-Book-Ratings.csv
..........|.
Creating matrix (1149780 tuples)
Matrix density is: 0.0012%
Updating matrix: squish to at least 10 values
Computing svd k=100, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True
Saving svd model to /tmp/booklens
385504209        The Da Vinci Code
446611778        Last Man Standing
394900014        Cat in the Hat (I Can Read It All by Myself Be...
193156146X       The Time Traveler's Wife
394800184        Are You My Mother?
743454529        My Sister's Keeper : A Novel (Picoult, Jodi)
006440188X       The Secret Garden


In [14]:
location()

 usa                                              139188
 canada                                            21558
 united kingdom                                    18304
 germany                                           17052
 spain                                             13205
 australia                                         11724
 italy                                             11247
                                                    4608
 france                                             3474
 portugal                                           3371
