In [1]:
import pandas as pd
import numpy as np
Office = pd.read_pickle("office_products.pickle")
Office_reviews = pd.read_pickle("office_reviews.pickle")

# 1 AMAZON ITEM-TO-ITEM SIMILARITY RECOMMENENDER

In [2]:
# Extract the last category in the “Office.categories” as a new column “Categories”
Office["categories"] = [[i][0][0][-1] for i in Office.categories]
# drop useless columns.
Office = Office.dropna(subset = ["title","description"]).reset_index()[[1,2,3,4,5,6,9]]

# Merge "Office_reviews" with "Office" by "asin"
Office_reviews = Office_reviews.merge(Office, left_on = "asin", right_on='asin')
Office_reviews = Office_reviews[[0,2,5]]

## 1.1 Build similar items table
### 1.1.1 Retrive purchase histories for all customers, then calculate the purchase amount for each user

In [3]:
# Only keep the "asin" and "reviewerID"
U_to_I = Office_reviews[[0,2]]
# melt the dataframe and set the varibles as "reviewerID"
U_to_I = pd.melt(U_to_I, id_vars=['reviewerID'], value_vars=['asin'])
# Group by "reviewerID", find all the products he or she have reviewed, and append them to a list.
U_to_I = U_to_I.groupby(["reviewerID"])["value"].apply(lambda x: "[%s]" % ','.join(x))

# Turn the result into a data frame
user_item = pd.Series(U_to_I, name='asin')
user_item.index.name = 'user'
user_item = user_item.reset_index()
user_item["asin"] = user_item["asin"].str.extract("\[(.*)\]",expand=False)
user_item["asin"] = user_item["asin"].str.split(",")
# Use reviewed counts as purchase amount.
user_item["counts"] = [len(i) for  i in user_item.asin]
user_item = user_item.sort_values(by="counts",ascending=False).reset_index()[[1,2,3]]
user_item.head()

Unnamed: 0,user,asin,counts
0,A3OXHLG6DIBRW8,"[B00004Z6NA, B000050FZP, B00005C3YY, B00005C55...",93
1,A22I55P15NSAOX,"[B00004Z64U, B00004Z69W, B00006B7UW, B00006B8G...",80
2,A1Z16630QMH8Q6,"[B00004Z5QO, B00004Z5SM, B00006B8G0, B00006IC3...",76
3,A1ODOGXEYECQQ8,"[B00005UKAX, B00006ICF7, B00006IDV6, B00006IF7...",75
4,A22CW0ZHY3NJH8,"[B00004VVIX, B00004YV1W, B00004Z5QO, B00006IC7...",73


### 1.1.2 Calculate the sales of each product

In [4]:
# Count the sales of each product to get a "item_user" data frame.
I_to_U = Office_reviews[[0,2]]
I_to_U = pd.melt(I_to_U, id_vars=['asin'], value_vars=['reviewerID'])
I_to_U = I_to_U.groupby(["asin"])["value"].apply(lambda x: "[%s]" % ', '.join(x))

item_user = pd.Series(I_to_U, name='user')
item_user.index.name = 'asin'
item_user = item_user.reset_index()
item_user["user"] = item_user.user.str.extract("\[(.*)\]",expand=False)
item_user["user"] = item_user["user"].str.split(",")
item_user["sales"] = [len(i) for  i in item_user.user]
item_user = item_user.sort_values(by="sales",ascending=False).reset_index()[[1,2,3]]
item_user.head()

Unnamed: 0,asin,user,sales
0,B0010T3QT2,"[A3A1OA237FOZFK, A4QXTN3K865NR, A1HRYC60VTMY...",311
1,B0039N7ELS,"[A2E1EFNIZL2FVA, A2X396SN5F0H0S, A3EZEP0FX5B...",227
2,B0027CTFBO,"[A3A1OA237FOZFK, A2E1EFNIZL2FVA, A2BB4DGBRVG...",205
3,B0039N3QFQ,"[A2DYMDOAJURO5F, A27HJBFDSTFVFG, AUG0C084W05...",186
4,B000MFHX3U,"[A231FGX8G2Q5EW, A2OR4QUQSUMOW7, A2VIX3WXF4H...",185


### 1.1.3 For each (item to items) pair, count number of customers in common

In [5]:
#Compare all the reviewers of a product with each product's reviewers, include itself, to find their common customers.
common_user = []

# for all the reviewers of each product,
for p in item_user.user:
    # for all the reviewers of each product,
    for i in item_user.user:
        # Find common reviewers.
        intersect = list(set(p).intersection(i))
        # Count common reviewers number.
        count = len(intersect)
        # Append the result to the list.
        common_user.append(count)
        
# Save all results as an array.
common_user = np.array(common_user)
# Reshape the array with a same length and width of "Office".
common_users = common_user.reshape(len(Office),-1)

### 1.1.4 Compute commonality indexes

In [6]:
# Build another data frame with same value of common, but index and columns' name are the sales counts of that product.
common_index = pd.DataFrame(data = common_users,
                            index = item_user["sales"].values, 
                            columns = item_user["sales"].values)
# Count the commonality_indexes based on the formula.
# Multiply the sales of each pair of products and save the results as a matrix.
C_index = np.array([[i*j for j in common_index.columns] for i in common_index.index])
# Divide the prescription of the products by common values
C_index = common_users / np.sqrt(C_index)
C_index

array([[ 1.        ,  0.21829052,  0.14257566, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21829052,  1.        ,  0.14370499, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.14257566,  0.14370499,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [7]:
# Build a data frame with "a" as the data and match the index and columns' names with their "asin".
commonality_indexes = pd.DataFrame(data = C_index, 
                                   index = item_user["asin"].values, 
                                   columns = item_user["asin"].values)
commonality_indexes.iloc[0:5,0:5]

Unnamed: 0,B0010T3QT2,B0039N7ELS,B0027CTFBO,B0039N3QFQ,B000MFHX3U
B0010T3QT2,1.0,0.218291,0.142576,0.257783,0.125071
B0039N7ELS,0.218291,1.0,0.143705,0.1606,0.126875
B0027CTFBO,0.142576,0.143705,1.0,0.107544,0.128374
B0039N3QFQ,0.257783,0.1606,0.107544,1.0,0.086254
B000MFHX3U,0.125071,0.126875,0.128374,0.086254,1.0


## 1.2 Identify popular items in each category

In [8]:
popularity = item_user
# mark the category for each product for future uses.
popularity = popularity.merge(Office, left_on="asin", right_on="asin").set_index("asin").drop(["user","description"],1)
popularity = popularity[[5,1,4,0,3,2]]
popularity = popularity.dropna(subset=["title"])
popularity.head()

Unnamed: 0_level_0,title,brand,price,sales,imUrl,categories
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0010T3QT2,"Quality Park Reveal-N-Seal Envelope, #10, 4-1/...",Quality Park,7.9,311,http://ecx.images-amazon.com/images/I/41f9G1jL...,Envelopes
B0039N7ELS,"Scotch Adhesive Dot Roller, 0.31 x 49 Feet (6055)",Unknown,6.19,227,http://ecx.images-amazon.com/images/I/41n5MKtP...,Mounting Tape
B0027CTFBO,Bankers Box SmoothMove Moving and Storage Boxe...,Fellowes,21.79,205,http://ecx.images-amazon.com/images/I/416ttilp...,Box Mailers
B0039N3QFQ,Scotch(R) Pop-Up Tape Refillable Deskgrip Disp...,Scotch,3.99,186,http://ecx.images-amazon.com/images/I/41O9sCwe...,Transparent Tape
B000MFHX3U,"3M Whiteboard Eraser for Whiteboards, 2-Pack",3M,7.67,185,http://ecx.images-amazon.com/images/I/41%2Bjwd...,Dry Erase Boards


## 1.3 Define a funciton to get "also bought" items for a product

In [9]:
def get_also_bought(item):
    # get the column of this item in "commonality_indexes"
    all_items = commonality_indexes[item]
    # sum the value of whole column and to determine whether it is not 0.
    sum(all_items) != 0
    # if it is not, means that this item has its similar items with common customers.
    while True:
        # 
        similar_items = all_items[all_items != 0]
        # cause the first similar item must be itself, so if similar items are more than 11,
        if len(similar_items)>=11:
            # sort these items by their values.
            similar_items = similar_items.sort_values(ascending=False)[1:11]
            # get those item from "categries_items".
            recommenddations = popularity.ix[similar_items.index]
            recommenddations["common_index"] = similar_items.values
            return recommenddations
        #if less than 11,
        else:
            # get the category of this item from "Office".
            item_category = Office[Office.asin == item].Categories.values[0]
            # get the popular items that have a same category with this item.
            # only to fill the number of missing, not a complete replacement.
            popular = popularity[categries_items.Categories == item_category].head(10-len(similar_items))
            # append those popular items to fill up recommendations to 10.
            recommenddations = similar_items.append(popular)
            recommenddations["common_index"] = similar_items.values
            return recommenddations

# 2 CONTENT BASED RECOMMENDER
## 2.1 Build item similarity matric based on item content

In [10]:
reviews = pd.read_pickle("office_reviews.pickle")
product = pd.read_pickle("office_products.pickle")
# drop certain columns and extract categories for each product
product = product.dropna(subset=["title", "description"])
product["categories"] = [ (','.join(i[0])) for i in product.categories]

In [11]:
import re
import nltk
from nltk.stem import PorterStemmer
import string

# for the text of description, exclude numbers, meaningless words like "00asdf"(number+text) or "asdf00"(text+number)
summarylist=[]
for sent in product.description:
    punct_num_word= r='[^A-Za-z0-9 ]|[0-9]{1,10}[A-Za-z]{1,10}|[A-Za-z]{1,10}[0-9]{1,10}|[0-9]'
    sent=re.sub(punct_num_word,'',sent)
    sent=sent.split( )
    newsent=[]
    for word in sent:
        word=PorterStemmer().stem(word)
        newsent.append(word)
    sent=' '.join(newsent)
    summarylist.append(sent)
product['description2'] = summarylist

In [12]:
# do the same thing for product.title
summarylist = []
for sent in product.title:
    punct_num_word = r='[^A-Za-z0-9 ]|[0-9]{1,10}[A-Za-z]{1,10}|[A-Za-z]{1,10}[0-9]{1,10}|[0-9]'
    sent = re.sub(punct_num_word,'',sent)
    sent = sent.split( )
    newsent = []
    for word in sent:
        word=PorterStemmer().stem(word)
        newsent.append(word)
    sent=' '.join(newsent)
    summarylist.append(sent)
product['title2'] = summarylist

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# create dtm for content
tf = TfidfVectorizer(stop_words = 'english')
dtm = tf.fit_transform(product.description2 + product.title2+ product.categories)

In [14]:
# count similarity of items
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(dtm)
sim 

array([[ 1.        ,  0.16506951,  0.19255795, ...,  0.00196088,
         0.01676924,  0.02679743],
       [ 0.16506951,  1.        ,  0.30763651, ...,  0.00148237,
         0.0015545 ,  0.00949996],
       [ 0.19255795,  0.30763651,  1.        , ...,  0.01880499,
         0.02610376,  0.01324837],
       ..., 
       [ 0.00196088,  0.00148237,  0.01880499, ...,  1.        ,
         0.90569139,  0.00763796],
       [ 0.01676924,  0.0015545 ,  0.02610376, ...,  0.90569139,
         1.        ,  0.01742184],
       [ 0.02679743,  0.00949996,  0.01324837, ...,  0.00763796,
         0.01742184,  1.        ]])

In [15]:
# transform metric to dataframe
item_similarity = pd.DataFrame(data = sim, 
                               index = product.asin.values, 
                               columns = product.asin.values)
item_similarity.iloc[0:5,0:5]

Unnamed: 0,B00000JBLU,B00000JBNX,B00000JBO8,B00000JBLH,B00000JFNV
B00000JBLU,1.0,0.16507,0.192558,0.080616,0.001946
B00000JBNX,0.16507,1.0,0.307637,0.135672,0.019484
B00000JBO8,0.192558,0.307637,1.0,0.123718,0.001698
B00000JBLH,0.080616,0.135672,0.123718,1.0,0.01359
B00000JFNV,0.001946,0.019484,0.001698,0.01359,1.0


## 2.2 Define function to do a recommendation

In [16]:
def get_similar_item(asin, **kwargs):
    userid = kwargs.get('userid', None)
    if userid is not None:
        # get rating information of this user
        ratings = reviews[reviews.reviewerID == userid][["asin", "overall"]]
        # mean center the ratings
        ratings.overall -= ratings.overall.mean()
        # merge popular items with ratings
        product_ID = product.set_index("asin").join(ratings.set_index("asin"))
        # do the dot product with dtm
        profile = dtm.T.dot(product_ID.overall.fillna(0))
        # get prediction for this user
        pred = dtm.dot(profile)
        product_ID['pred'] = pred
        all_items = item_similarity[asin].sort_values(ascending=False)
        # extract items that has positive prediction from "all_items"
        pos_pred=product_ID.ix[all_items.index]
        pos_pred=pos_pred[pos_pred.pred>=0]
        pos_items=all_items.ix[pos_pred.index]
        # get the top 5 items that have the highest similarity for a certain item("asin")
        top_items=pos_items[pos_items.values < 1][0:5]
        recommendations = product_ID.ix[top_items.index]
        recommendations = recommendations[["title","brand","price","description","imUrl","pred"]]
        recommendations["similarity"] = top_items.values
        # return these items according to the "pred" value
        return recommendations.sort_values("similarity", ascending=False)
    else:
        # for the user doesn't sign in, recommend top 5 items that have the highest similarity for him/her
        all_items = item_similarity[asin].sort_values(ascending=False)
        all_items = all_items[all_items.values < 1][0:5]
        recommendations = popularity.ix[all_items.index]
        recommendations["similarity"] = all_items.values
        return recommendations

# 3 MODEL BASED RECOMMENDER

In [17]:
#split data
from sklearn.cross_validation import train_test_split
train,test = train_test_split(Office_reviews,test_size=0.25,random_state = 1)

#compute mean ratings for every users in train data and test data
means = train.groupby("reviewerID").overall.mean()#only use the information in train
means.name = "mean_rating"
train = pd.merge(train, pd.DataFrame(means), left_on="reviewerID", right_index=True)
train["mcrating"] = train.overall - train.mean_rating
test = pd.merge(test, pd.DataFrame(means), left_on="reviewerID", right_index=True)
test["mcrating"] = test.overall - test.mean_rating
test.mean_rating.fillna(train.overall.mean(), inplace=True)
test[np.isnan(test.mcrating)] #check if there are missing values in test.mcrating

Unnamed: 0,asin,overall,reviewerID,mean_rating,mcrating


In [18]:
#create user_item matrix by using train data
user_item_rating_train = pd.pivot_table(train,index = ["reviewerID","asin"],values=["mcrating"],aggfunc=np.mean)
user_item_rating_train = user_item_rating_train.unstack("asin").stack(0)
#create a varible which stores filled missing values
user_item_rating_train_filled = user_item_rating_train.fillna(0)
#create a varible which stores missing values
user_item_rating_train_nan = user_item_rating_train

In [19]:
#create user_item matrix by using test data
user_item_rating_test = pd.pivot_table(test,index = ["reviewerID","asin"],values=["mcrating"],aggfunc=np.mean)
user_item_rating_test = user_item_rating_test.unstack("asin").stack(0)

## 3.1 Run svds and compute RMSE

In [20]:
#convert dataframe to array
from scipy.sparse.linalg import svds
user_item_rating_train_filled = user_item_rating_train_filled.values
user_item_rating_train_nan = user_item_rating_train_nan.values

for i in range(1):

    print (u'num of loop: ' + str(i+1))    
    u, s, vt = svds(user_item_rating_train_filled, k = 20)
    predication = np.dot(np.dot(u,np.diag(s)),vt).round(2)
    #Backfilling mising values 
    user_item_rating_train_filled[np.isnan(user_item_rating_train_nan)] = predication[np.isnan(user_item_rating_train_nan)] 
    
    #next,compute rmse
    #create predication dataframe
    pred_df = pd.DataFrame(data = predication,
                           index=user_item_rating_train.index.get_level_values("reviewerID"),
                           columns=user_item_rating_train.columns.values)
    #create predication_test dataframe 
    pred_test = pred_df.loc[user_item_rating_test.index.get_level_values("reviewerID").values,
                            user_item_rating_test.columns.values]
    #compute rmse between predicted test data(pred_test) and test data(user_item_rating_test)
    error = (pred_test - user_item_rating_test).values
    error = error[~np.isnan(error)]
    rmse = np.sqrt((error**2).mean())

    print(rmse)
    print (u'---------------------' )

num of loop: 1
0.926649363224
---------------------


# 3.2 Define a funcation to get recommendations 

In [21]:
def user_recommend_item(userid):
    items = pred_df.loc[userid]
    #sort according to rating scores
    if len(items[items>0])>=10:
        items_10 = items.sort_values(ascending=False).head(10)
        recommenddations = popularity.ix[items_10.index.values]
        recommenddations["pred"] = items_10.values
        return recommenddations
    if len(items[items>0])<10 and len(items[items>0])>0:
        items_x = items[items>0].sort_values(ascending=False)
        items_10_x = popularity.head(10-len(items[items>0]))
        recommenddations = np.concatenate((items_x.index.values,items_10_x.index.values ))
        recommenddations = popularity.ix[recommenddations]
        return recommenddations
    else:
        recommenddations =  popularity.head(10)
        return recommenddations

# 4 Evaluation

## 4.1 Qualitative evaluation

Test all the functions that we built above with same user and product.

In [22]:
user1 = "A27BM0VZSVKXZD"
user2 = "A1JZFGZEZVWQPY"
item1 = item_user.asin[123]
item2 = item_user.asin[345]
user1,user2,item1,item2 

('A27BM0VZSVKXZD', 'A1JZFGZEZVWQPY', 'B0007L1VO6', 'B000J0E9PI')

In [23]:
popularity[popularity.index == item1].reset_index()

Unnamed: 0,asin,title,brand,price,sales,imUrl,categories
0,B0007L1VO6,BIC Mark-It Color Collection Permanent Markers...,BIC,7.16,75,http://ecx.images-amazon.com/images/I/61zc6AGL...,Permanent Markers


In [24]:
popularity[popularity.index == item2].reset_index()

Unnamed: 0,asin,title,brand,price,sales,imUrl,categories
0,B000J0E9PI,"Smead Recycled File Jackets, 2-Inch Expansion,...",Smead,29.38,45,http://ecx.images-amazon.com/images/I/31GMXm2L...,Top Tab Jackets & Pockets


### 4.1.1  item-to-item similarity recommender

In [25]:
get_also_bought(item1).head()

Unnamed: 0,title,brand,price,sales,imUrl,categories,common_index
B00347A8PI,BIC Mark-It Color Collection Ultra Fine Perman...,BIC,20.49,63,http://ecx.images-amazon.com/images/I/51mYUlaz...,Permanent Markers,0.145479
B0046HESE2,"Fellowes Titan 125 Laminator, 12.5 Inch with 1...",,,17,http://ecx.images-amazon.com/images/I/31dGn7ku...,Laminators,0.112022
B0006HX1HG,Duck Brand 393185 1.88 Inch by 22.2 Yard EZ St...,Shurtech Brands LLC,5.17,54,http://ecx.images-amazon.com/images/I/41hFF656...,Packing Tape,0.109994
B002M7Z338,"Sharpie Pen Fine Point Pen, 6 Colored Pens (17...",Sharpie,8.82,10,http://ecx.images-amazon.com/images/I/51QLoJQ0...,Porous-Point Pens,0.109545
B000WU4H5C,3M Gel Wrist Rest for Keyboard with Leathette ...,3M,19.11,176,http://ecx.images-amazon.com/images/I/31wudwyG...,Wrist Rests,0.104447


In [26]:
get_also_bought(item2).head()

Unnamed: 0,title,brand,price,sales,imUrl,categories,common_index
B0085IPYO6,"Smead MO File Box, Letter Size, 3 inch Expansi...",Smead,8.9,90,http://ecx.images-amazon.com/images/I/41JwJIO7...,All-Purpose Labels,0.172848
B008DF54N2,"Post-it Full Adhesive Roll, 1 x 400, Pink, 1-P...",Unknown,4.91,156,http://ecx.images-amazon.com/images/I/51PtlKvc...,Self-Stick Notes,0.119352
B0017GPTSY,"Smead Folder, Letter, 11 Point, 1/3 Cut Tab, C...",Smead,14.84,41,http://ecx.images-amazon.com/images/I/41qD9pcf...,Manila,0.116405
B000SHR71A,"Post-it Tabs, 2-Inches, Angled Lined, 4 Assort...",Post-It,4.94,49,http://ecx.images-amazon.com/images/I/51uVy0NO...,Index Tabs,0.106479
B00006IEAK,"Pentel R.S.V.P. Ballpoint Pen, 0.7mm Fine Tip,...",Pentel,7.68,8,http://ecx.images-amazon.com/images/I/41oeWJYK...,Ballpoint Pens,0.105409


### 4.1.2 Content based recommender

In [27]:
get_similar_item(item1, userid=user1)

Unnamed: 0,title,brand,price,description,imUrl,pred,similarity
B0003WN0CA,Sharpie 32893PP Ultra-Fine Point Permanent Mar...,Sharpie,19.98,Sharpie 32893PP Ultra-Fine Point Permanent Mar...,http://ecx.images-amazon.com/images/I/61pUYpZI...,0.014016,0.380864
B001E681S2,"Pentel Permanent Marker, White, Fine Point, 1-...",Pentel,5.49,"Quick-drying, permanent, white opaque marker w...",http://ecx.images-amazon.com/images/I/31j-LcWP...,0.018049,0.373153
B000I0YYZO,Sharpie Permanent Marker Fine Tip 8 Pack,Sharpie,2.45,Original Sharpie pen-style permanent marker ma...,http://ecx.images-amazon.com/images/I/51-UDFTD...,0.021759,0.346891
B001V9LQLG,Sharpie 1747388 Stainless Steel Fine Point Per...,Sharpie,5.25,Sharpie 1747388 Stainless Steel Fine Point Per...,http://ecx.images-amazon.com/images/I/21qDdPfl...,0.015425,0.339191
B000GOZYRO,"Sharpie 75846 Fine Point Permanent Marker, Ass...",Sharpie,12.43,"Sharpie 75846 Fine Point Permanent Marker, Ass...",http://ecx.images-amazon.com/images/I/51MP4u6p...,0.014191,0.30758


In [28]:
get_similar_item(item2, userid=user1)

Unnamed: 0,title,brand,price,description,imUrl,pred,similarity
B003OCQOLA,"Five Star 6-Pocket Expanding File, 13 x 9.38 I...",Five Star,7.19,,http://ecx.images-amazon.com/images/I/41M9xwsc...,0.05216,0.6286
B003EM2G6M,"Smead 70211 12-Pocket Tiered Stadium File, Let...",Smead,14.49,The 12-pocket Stadium&#xAE; File is ideal for ...,http://ecx.images-amazon.com/images/I/51VLRflJ...,0.024225,0.489496
B0052S3ACK,Five Star 7-Pocket Customizable Expanding File...,Five Star,12.88,Keep your most important documents safe and se...,http://ecx.images-amazon.com/images/I/41yEa5Nm...,0.078202,0.466442
B0002LCUZK,"Smead Frequency Expanding Files, Waterproof an...",Smead,8.5,Super durable poly. Waterproof and tear resist...,http://ecx.images-amazon.com/images/I/41Aegz4t...,0.077091,0.465719
B00006ICDC,Wilson Jones ColorLife Expanding File with Fla...,Wilson Jones,18.8,"Filing, organizing and storing loose papers is...",http://ecx.images-amazon.com/images/I/517XAQs0...,0.058851,0.451076


### 4.1.3 Model based recommender

In [29]:
user_recommend_item(user1).head()

Unnamed: 0_level_0,title,brand,price,sales,imUrl,categories,pred
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
B0085IPZGS,"Smead MO Records Filing Starter Kit, Letter Si...",Smead,11.45,90,http://ecx.images-amazon.com/images/I/41ltgWeE...,All-Purpose Labels,0.11
B005X9VZ70,DYMO LabelManager 160 Hand Held Label Maker,DYMO,15.55,92,http://ecx.images-amazon.com/images/I/41x99jCg...,Label Makers,0.09
B002ECFIDG,"Avery Removable Label Pad, 1 x 3 Inches, Assor...",Avery,7.85,163,http://ecx.images-amazon.com/images/I/51vPBlsg...,Removable Labels,0.06
B00FWR9NH2,Wilson Jones Ultra Duty Round Ring View Binder...,Wilson Jones,7.29,107,http://ecx.images-amazon.com/images/I/41VvaJxF...,Round Ring Binders,0.04
B002PU9T5A,Pendaflex I.Organize Project Management System...,,,54,http://ecx.images-amazon.com/images/I/51A5ht2v...,Binding Covers & Paper,0.04


In [30]:
user_recommend_item(user2).head()

Unnamed: 0_level_0,title,brand,price,sales,imUrl,categories,pred
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
B0010T3QT2,"Quality Park Reveal-N-Seal Envelope, #10, 4-1/...",Quality Park,7.9,311,http://ecx.images-amazon.com/images/I/41f9G1jL...,Envelopes,1.07
B002K9M6OW,Post-it Super Sticky Removable Color Coding La...,,,185,http://ecx.images-amazon.com/images/I/51n5vSVf...,Store Signs,0.19
B002K9XU0Q,Post-it&reg; Super Sticky Removable File Folde...,,,175,http://ecx.images-amazon.com/images/I/51WyeTP1...,File Folder Labels,0.17
B002K9IHJK,Post-it&reg; Super Sticky Removable Label Pads...,,4.99,180,http://ecx.images-amazon.com/images/I/51sdIEWS...,Mouse Pads,0.16
B002K9GOPE,"3M Permanent Adhesive Address Labels, 1 x 2.62...",3M,35.0,182,http://ecx.images-amazon.com/images/I/51cAE15g...,Address Labels,0.12


## 4.2 Quantitative evaluation

In [31]:
from sklearn.cross_validation import train_test_split
train,test = train_test_split(Office_reviews,test_size=0.25,random_state = 1)

means = train.groupby("reviewerID").overall.mean()
means.name = "mean_rating"
train = pd.merge(train, pd.DataFrame(means), left_on="reviewerID", right_index=True)
train["mcrating"] = train.overall - train.mean_rating 
# merge test data with means (training data's mean rating)
test = pd.merge(test, pd.DataFrame(means), left_on="reviewerID", right_index=True)
test["mcrating"] = test.overall - test.mean_rating 
test.mean_rating.fillna(train.overall.mean(), inplace=True)
test[np.isnan(test.mcrating)] #check if there are missing values in test.mcrating

Unnamed: 0,asin,overall,reviewerID,mean_rating,mcrating


### 4.2.1 item-to-item similarity recommender

In [32]:
df_list1=[]
for i in set(test.reviewerID):
    user_ratings = test[test.reviewerID == i][["asin","mcrating"]]
    # merge product data frame with user's ratings
    item_user = Office.reset_index()[["asin"]].set_index("asin").join(user_ratings.set_index("asin"))
    # add reviewerID column
    item_user['reviewerID'] = i
    # get profile of user "i"
    profile = C_index.T.dot(np.array(item_user.mcrating.fillna(0)))
    pred = C_index.dot(profile)
    # get predictions
    item_user['pred'] = pred
    item_user = item_user.drop("mcrating",axis=1).dropna()
    df_list1.append(item_user)
    
df_pred1 = pd.concat(df_list1)
df_pred1 = df_pred1.reset_index()

In [33]:
test_1 = pd.merge(test, df_pred1, left_on=["reviewerID","asin"],right_on=["reviewerID","asin"])
test_1['pred_abs'] = test_1.pred + test_1.mean_rating #overall_y is the mean rating
test_1.head()

Unnamed: 0,asin,overall,reviewerID,mean_rating,mcrating,pred,pred_abs
0,B000087KUA,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,-0.300602,4.612441
1,B00006JNMH,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,-0.499916,4.413127
2,B00006IFBG,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,-0.640811,4.272232
3,B00004SUJT,4.0,A2NOW4U7W3F7RI,4.913043,-0.913043,-2.171027,2.742017
4,B00125Q75Y,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,-0.168818,4.744226


In [34]:
import math
math.sqrt(((test_1.pred_abs - test_1.overall)**2).mean())

0.8905973477735801

### 4.2.1 Content based recommender

In [35]:
df_list2=[]
for i in set(test.reviewerID):
    user_ratings = test[test.reviewerID == i][["asin","mcrating"]]
    product_user = Office.reset_index()[["asin"]].set_index("asin").join(user_ratings.set_index("asin"))
    product_user['reviewerID']=i
    profile = dtm.T.dot(product_user.mcrating.fillna(0))
    pred = dtm.dot(profile)
    product_user[ 'pred' ] = pred
    product_user = product_user.drop("mcrating",axis=1).dropna()
    df_list2.append(product_user)
    
df_pred2=pd.concat(df_list2)
df_pred2=df_pred2.reset_index()

In [36]:
test_2 = pd.merge(test,df_pred2,left_on=["reviewerID","asin"],right_on=["reviewerID","asin"])
test_2['pred_abs'] = test_2.pred + test_2.mean_rating
test_2.head()

Unnamed: 0,asin,overall,reviewerID,mean_rating,mcrating,pred,pred_abs
0,B000087KUA,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,0.091831,5.004874
1,B00006JNMH,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,0.083583,4.996627
2,B00006IFBG,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,0.087913,5.000957
3,B00004SUJT,4.0,A2NOW4U7W3F7RI,4.913043,-0.913043,-0.899872,4.013172
4,B00125Q75Y,5.0,A2NOW4U7W3F7RI,4.913043,0.086957,0.094053,5.007097


In [37]:
math.sqrt(((test_2.pred_abs - test_2.overall)**2).mean())

0.24615493236677025

### 4.1.3 Model based recommender

In [38]:
rmse

0.9266493632239885

As is shown in code[20], the rmse is 0.926