In [2]:
from collections import Counter
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
from scipy.stats.stats import pearsonr

In [3]:
already_extracted=True
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

if already_extracted:
    df = pd.read_pickle('../reviews_Kindle_Store_10_training.h5')
    Training_DataFrame = pd.read_pickle('../reviews_Kindle_Store_10_training.h5')
    Testing_DataFrame = pd.read_pickle('../reviews_Kindle_Store_10_testing.h5')
else:
    df = getDF('../reviews_Kindle_Store_5.json.gz')
    df = shuffle(df)
    Training_DataFrame, Testing_DataFrame = train_test_split(df, test_size=0.2)


In [4]:
print Training_DataFrame.shape
print Testing_DataFrame.shape

(453086, 13)
(50343, 13)


In [5]:
print "Dataset details:"
Uniq_items =  len(set(df['asin'].values))
print "Number of item:", Uniq_items
Uniq_users =   len(set(df['reviewerID'].values))
print "Number of Users:", Uniq_users
Uniq_ratings = len(set(df['overall'].values))
print "Number of ratings:", Uniq_ratings
print "\n"

Dataset details:
Number of item: 22203
Number of Users: 26466
Number of ratings: 5




In [6]:
#Training_DataFrame=Training_DataFrame[:5000]
Training_DataFrame.columns
# Lists of unique items and users
item_list = list(set(df['asin'].values))
user_list = list(set(df['reviewerID'].values))

#Storing the indices of users to generate the Rating Memory Matrix
UserIndices={}
for i in range(len(user_list)):
    UserIndices[user_list[i]] = i

#Storing the indices of items to generate the Rating Memory Matrix
#These indices will be used to populate the UsersRatingMemory for Memory based Collaborative Filtering
itemIndices={}
for j in range(len(item_list)):
    itemIndices[item_list[j]] = j
    

# Collaborative Filtering without Sentiment Scores: 

In [16]:
#UsersRatingMemory stores the users and corresponding items ratings in Matrix form
UsersRatingMemory=np.zeros((Uniq_users,Uniq_items))

for i in range(0, len(Training_DataFrame)):
    item_id=Training_DataFrame.iloc[i]['asin']
    user_id=Training_DataFrame.iloc[i]['reviewerID']
    #get user and item indices
    user_ind=UserIndices[user_id]
    item_ind=itemIndices[item_id]
    rating=Training_DataFrame.iloc[i]['overall']
    #populate the matrix with rating at corresponding user and item index
    UsersRatingMemory[user_ind][item_ind]=float(rating)

AvgRatings = np.true_divide(UsersRatingMemory.sum(1),(UsersRatingMemory!=0).sum(1))

print "Average Ratings of the Users are:"
print AvgRatings

Average Ratings of the Users are:
[4.92307692 5.         3.76923077 ... 3.84615385 3.5        4.64912281]


In [17]:
from scipy.stats.stats import pearsonr

def PredictUserRating(user_id, item_id):
    '''This function predicts the rating based on user_id and item_id'''
    item_ind = itemIndices[item_id]
    user_ind = UserIndices[user_id]
    #ratedindices: it stores the indices of all other users that have rated the item
    ratedindices=[]
    i=0
    for rating in UsersRatingMemory[:,item_ind]:
        if i==user_ind:
            continue
        if rating !=0:
            ratedindices.append(i)
        i+=1
    #itemRatingsOthers: stores other users ratings
    itemRatingsOthers = UsersRatingMemory[ratedindices, item_ind]
    #Store the pearson coefficents
    PearsonCoeffs=[]
    for ind in ratedindices:
        PearsonCoeffs.append(pearsonr(UsersRatingMemory[ind],UsersRatingMemory[user_ind])[0])
    k=0.001
    # Calculate the prediction
    pred = AvgRatings[user_ind] + k * np.sum( PearsonCoeffs* (itemRatingsOthers - AvgRatings[ratedindices]))
    return pred

def Test(no_of_samples):
    '''This function returns the predictions for given no of samples'''
    Predictions=np.zeros(no_of_samples)
    for i in range(no_of_samples):
        item_id=Testing_DataFrame.iloc[i]['asin']
        user_id=Testing_DataFrame.iloc[i]['reviewerID']
        Predictions[i]=PredictUserRating(user_id,item_id)
    return Predictions

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
Predictions=Test(1000)
print "For 1000 Samples:"
print "The Mean Absolute Error is:", mean_absolute_error(Predictions, Testing_DataFrame['overall'].values[:1000])
print "Root Mean Square Error is:", np.sqrt(mean_squared_error(Predictions, Testing_DataFrame['overall'].values[:1000]))

For 1000 Samples:
The Mean Absolute Error is: 0.5430276579670891
Root Mean Square Error is: 0.7988501360441151


In [11]:
def get_other_closest_user(user_ind,k):
    PearsonCoeffs={}
    for ind in range(len(user_list)):
        if ind==user_ind :
            continue
        else:
            PearsonCoeffs[ind]=pearsonr(UsersRatingMemory[ind],UsersRatingMemory[user_ind])[0]
        
    PearsonCoeffs_sorted=sorted(PearsonCoeffs.items(), key=lambda kv: kv[1], reverse=True)
    return [x for x, y in PearsonCoeffs_sorted[:k]]

In [12]:
def recommender(user_ind):
    items_to_recommend=[]
    colsest_users=get_other_closest_user(user_ind,20)
    for user in colsest_users:
        for items in np.nonzero(UsersRatingMemory[user]):
            for item in items:
                if UsersRatingMemory[user_ind][item] == 0:
                    items_to_recommend.append(item_list[item])
    
    recommendations = {}
    for item in items_to_recommend:
        recommendations[item] = PredictUserRating(user_list[user_ind], item)
        
    sorted_recommendations = sorted(recommendations.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_recommendations

In [13]:
print "getting recommendations for user:", user_list[0]
print "Top 50 recommendations are For Collaborative Filtering without sentiment Scores ..."
recommendations=recommender(0)
print recommendations[:50]

getting recommendations for user: AO00I2TVC9LBB
Top 10 recommendations are For Collaborative Filtering without sentiment Scores ...
[('B00KIXIBYQ', 4.966666666666667), ('B00KBDWHKM', 4.966666666666667), ('B00BTPOFSI', 4.966666666666667), ('B00FR3ZCG2', 4.966666666666667), ('B00GW7NHXM', 4.966666666666667), ('B00IW3BOC0', 4.966666666666667), ('B00KKQACDO', 4.966666666666667), ('B00AMIH8Y6', 4.966666666666667), ('B00L2FQEVG', 4.966666666666667), ('B00CMXCWW2', 4.966666666666667)]


# Colloboarative Filtering with Sentiment Scores:

In [8]:
UsersRatingMemory_sentiment=np.zeros((Uniq_users,Uniq_items))

for i in range(0, len(Training_DataFrame)):
    item_id=Training_DataFrame.iloc[i]['asin']
    user_id=Training_DataFrame.iloc[i]['reviewerID']
    #get user and item indices
    user_ind=UserIndices[user_id]
    item_ind=itemIndices[item_id]
    rating=float(Training_DataFrame.iloc[i]['overall'])
    
    if Training_DataFrame.iloc[i]['reviewText']:
    #populate the matrix with rating at corresponding user and item index
        rating=rating + float(Training_DataFrame.iloc[i]['corrected_sent_score'])
    
    UsersRatingMemory_sentiment[user_ind][item_ind]=rating

AvgRating_sentiment = np.true_divide(UsersRatingMemory_sentiment.sum(1),(UsersRatingMemory_sentiment!=0).sum(1))

print "Average Ratings of the Users are:"
print AvgRating_sentiment

Average Ratings of the Users are:
[5.19210769 5.352625   4.11755385 ... 4.19432308 3.7305625  4.78263509]


In [9]:

def PredictUserRatingsentiment(user_id, item_id):
    '''This function predicts the rating based on user_id and item_id'''
    item_ind = itemIndices[item_id]
    user_ind = UserIndices[user_id]
    #ratedindices: it stores the indices of all other users that have rated the item
    ratedindices=[]
    i=0
    for rating in UsersRatingMemory_sentiment[:,item_ind]:
        if i==user_ind:
            continue
        if rating !=0:
            ratedindices.append(i)
        i+=1
    #itemRatingsOthers: stores other users ratings
    itemRatingsOthers = UsersRatingMemory_sentiment[ratedindices, item_ind]
    #Store the pearson coefficents
    PearsonCoeffs=[]
    for ind in ratedindices:
        PearsonCoeffs.append(pearsonr(UsersRatingMemory_sentiment[ind],UsersRatingMemory_sentiment[user_ind])[0])
    k=0.001
    # Calculate the prediction
    pred = AvgRating_sentiment[user_ind] + k * np.sum( PearsonCoeffs* (itemRatingsOthers - AvgRating_sentiment[ratedindices]))
    return pred

def Testsentiment(no_of_samples):
    '''This function returns the predictions for given no of samples'''
    Predictions=np.zeros(no_of_samples)
    for i in range(no_of_samples):
        item_id=Testing_DataFrame.iloc[i]['asin']
        user_id=Testing_DataFrame.iloc[i]['reviewerID']
        Predictions[i]=PredictUserRatingsentiment(user_id,item_id)
    return Predictions

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
Predictions=Testsentiment(1000)
print "For 1000 Samples:"
print "The Mean Absolute Error is:", mean_absolute_error(Predictions, Testing_DataFrame['overall'].values[:1000])
print "Root Mean Square Error is:", np.sqrt(mean_squared_error(Predictions, Testing_DataFrame['overall'].values[:1000]))

For 1000 Samples:
The Mean Absolute Error is: 0.5915488267025667
Root Mean Square Error is: 0.8618822029935236


In [19]:
def get_other_closest_user_sentiment(user_ind,k):
    PearsonCoeffs={}
    for ind in range(len(user_list)):
        if ind==user_ind :
            continue
        else:
            PearsonCoeffs[ind]=pearsonr(UsersRatingMemory_sentiment[ind],UsersRatingMemory_sentiment[user_ind])[0]
        
    PearsonCoeffs_sorted=sorted(PearsonCoeffs.items(), key=lambda kv: kv[1], reverse=True)
    return [x for x, y in PearsonCoeffs_sorted[:k]]

In [20]:
def recommender_sentiments(user_ind):
    items_to_recommend=[]
    colsest_users=get_other_closest_user_sentiment(user_ind,20)
    for user in colsest_users:
        for items in np.nonzero(UsersRatingMemory_sentiment[user]):
            for item in items:
                if UsersRatingMemory_sentiment[user_ind][item] == 0:
                    items_to_recommend.append(item_list[item])
    
    recommendations = {}
    for item in items_to_recommend:
        recommendations[item] = PredictUserRatingsentiment(user_list[user_ind], item)
        
    sorted_recommendations = sorted(recommendations.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_recommendations

In [21]:
print "getting recommendations for user:", user_list[0]
print "Top 50 recommendations are For Collaborative Filtering with sentiment Scores ..."
recommendations_sentiment=recommender_sentiments(0)


getting recommendations for user: AO00I2TVC9LBB
Top 50 recommendations are For Collaborative Filtering with sentiment Scores ...


In [15]:
print recommendations_sentiment[:50]

[('B00GO6VI50', 5.192107692307692), ('B00B3KPBSW', 5.192107692307692), ('B009P59SFY', 5.192107692307692), ('B00HLX9Z3C', 5.192107692307692), ('B00JHNAWQS', 5.192107692307692), ('B00KA0AGJK', 5.192107692307692), ('B00GW9H8XA', 5.192107692307692), ('B002HUU0A6', 5.192107692307692), ('B0091V9BPE', 5.192107692307692), ('B00KXIIZKG', 5.192107692307692), ('B00L7DRQME', 5.192107692307692), ('B002FK3U00', 5.192107692307692), ('B00KF0URBM', 5.192107692307692), ('B00KKQACDO', 5.192107692307692), ('B008EXK208', 5.192107692307692), ('B00KBA6PKS', 5.192107692307692), ('B00BN1E6SW', 5.192107692307692), ('B00EUSSY3S', 5.192107692307692), ('B00EZ8PNK0', 5.192107692307692), ('B00CMXCWW2', 5.192107692307692), ('B000R93D4Y', 5.192107692307692), ('B00KY7X0OC', 5.192107692307692), ('B00CI95PS8', 5.192107692307692), ('B00GKKU0OK', 5.192107692307692), ('B00JWET7TU', 5.192107692307692), ('B00GMSI5BA', 5.192107692307692), ('B0064EIC9C', 5.192107692307692), ('B005NKEEYK', 5.192107692307692), ('B00HBY8SIY', 5.19