In [26]:
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

# Add Dataset

In [27]:
data = pd.read_csv('updated_reviews.csv')
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment
0,US,1797882.0,R3I2DHQBR577SS,B001ANOOOE,2102612.0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5.0,0.0,0.0,N,Y,Five Stars,love excel sun block,2015-08-31,5
1,US,18381298.0,R1QNE9NQFJC2Y4,B0016J22EQ,106393691.0,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5.0,0.0,0.0,N,Y,Thank you Alba Bontanica!,great thing cream doesnt smell weird like chem...,2015-08-31,5
2,US,19242472.0,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471.0,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5.0,0.0,0.0,N,Y,Five Stars,great product im year old claim,2015-08-31,5
3,US,19551372.0,R3KSZHPAEVPEAL,B002HWS7RM,255651889.0,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5.0,0.0,0.0,N,Y,GOOD DEAL!,use shower cap condit cap like theyr bulk save...,2015-08-31,4
4,US,14802407.0,RAI2OIG50KZ43,B00SM99KWU,116158747.0,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5.0,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,goto daili sunblock leav white cast clean plea...,2015-08-31,5


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   marketplace          30000 non-null  object 
 1   customer_id          30000 non-null  float64
 2   review_id            30000 non-null  object 
 3   product_id           30000 non-null  object 
 4   product_parent       30000 non-null  float64
 5   product_title        30000 non-null  object 
 6   product_category     30000 non-null  object 
 7   star_rating          30000 non-null  float64
 8   helpful_votes        30000 non-null  float64
 9   total_votes          30000 non-null  float64
 10  vine                 30000 non-null  object 
 11  verified_purchase    30000 non-null  object 
 12  review_headline      30000 non-null  object 
 13  review_body          29962 non-null  object 
 14  review_date          30000 non-null  object 
 15  predicted_sentiment  30000 non-null 

## Amazingmeter

In [29]:
# sample dataframe
df = pd.DataFrame(data)

# create "Amazingmeter" column
df["Amazingmeter"] = ""
for i, row in df.iterrows():
    pid = row["product_id"]
    rating = row["star_rating"]
    votes = row["helpful_votes"]
    sentiment = row["predicted_sentiment"]
    
    if votes != 0:
        df.at[i, "Amazingmeter"] = rating
    else:
        df.at[i, "Amazingmeter"] = sentiment

In [30]:
df=df.loc[:,['customer_id','product_id','Amazingmeter']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customer_id   30000 non-null  float64
 1   product_id    30000 non-null  object 
 2   Amazingmeter  30000 non-null  object 
dtypes: float64(1), object(2)
memory usage: 703.2+ KB


# Find Unique

In [31]:
# Number of users
print('The dataset has', df['customer_id'].nunique(), 'unique users')

# Number of products
print('The dataset has', df['product_id'].nunique(), 'unique movies')

# Number of ratings
print('The dataset has', df['Amazingmeter'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(df['Amazingmeter'].unique()))

The dataset has 24391 unique users
The dataset has 21209 unique movies
The dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]


# EDA

In [32]:
# Aggregate by product_id
agg_ratings = df.groupby('product_id').agg(mean_rating = ('Amazingmeter', 'mean'),
                                                number_of_ratings = ('Amazingmeter', 'count')).reset_index()

# Keep the review with over 100 ratings
agg_ratings_GT100 = agg_ratings[agg_ratings['number_of_ratings']>20]

# Check the information of the dataframe
agg_ratings_GT100.head()       

Unnamed: 0,product_id,mean_rating,number_of_ratings
1495,B000JLNBW4,3.734375,64
1645,B000MPMHMW,3.407407,27
4311,B002MPLYEW,3.680851,47
4501,B002UYSHMM,4.26087,23
6302,B004MX8XO6,4.0,32


In [33]:
# Check top 5 products by number of reviews
agg_ratings_GT100.sort_values(by='number_of_ratings', ascending=False).head()

Unnamed: 0,product_id,mean_rating,number_of_ratings
14881,B00KJ07SEM,4.166667,222
14101,B00J46XO9U,3.816901,213
15332,B00LAG4HN4,4.06,150
17552,B00QERR5CY,3.453846,130
18256,B00SNKU7VQ,4.023256,86


In [34]:
# Merge data
df_GT100 = pd.merge(df, agg_ratings_GT100[['product_id']], on='product_id', how='inner')
df_GT100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2056 entries, 0 to 2055
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customer_id   2056 non-null   float64
 1   product_id    2056 non-null   object 
 2   Amazingmeter  2056 non-null   object 
dtypes: float64(1), object(2)
memory usage: 64.2+ KB


In [35]:
# Number of users
print('The dataset has', df_GT100['customer_id'].nunique(), 'unique users')

# Number of products with more than 20 ratings
print('The dataset has', df_GT100['product_id'].nunique(), 'unique products')

# Number of ratings
print('The dataset has', df_GT100['Amazingmeter'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(df_GT100['Amazingmeter'].unique()))

The dataset has 2045 unique users
The dataset has 40 unique products
The dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]


# Create user-item matrix

In [36]:
# Create user-item matrix
matrix = df_GT100.pivot_table(index='product_id', columns='customer_id', values='Amazingmeter')
matrix.head()

customer_id,16511.0,97309.0,111008.0,111897.0,111932.0,113520.0,113582.0,114614.0,115345.0,118282.0,...,52978754.0,52983721.0,52994052.0,53002148.0,53005424.0,53006695.0,53035324.0,53060947.0,53067301.0,53089047.0
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B000JLNBW4,,,,3.0,,,5.0,,,,...,,,,,4.0,,,,,
B000MPMHMW,,,,,,,,,,,...,,,,,,,,,,
B002MPLYEW,,,,,,,,,,5.0,...,,,,,,,,,,
B002UYSHMM,,,,,,,,,,,...,,,,,,,,,,
B004MX8XO6,,,,,,4.0,,,,,...,,,,,,,,,,


# Data Normalization

In [37]:
# Normalize user-item matrix
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 0)
matrix_norm.head()

customer_id,16511.0,97309.0,111008.0,111897.0,111932.0,113520.0,113582.0,114614.0,115345.0,118282.0,...,52978754.0,52983721.0,52994052.0,53002148.0,53005424.0,53006695.0,53035324.0,53060947.0,53067301.0,53089047.0
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B000JLNBW4,,,,-0.734375,,,1.265625,,,,...,,,,,0.265625,,,,,
B000MPMHMW,,,,,,,,,,,...,,,,,,,,,,
B002MPLYEW,,,,,,,,,,1.319149,...,,,,,,,,,,
B002UYSHMM,,,,,,,,,,,...,,,,,,,,,,
B004MX8XO6,,,,,,0.0,,,,,...,,,,,,,,,,


# Calculate Similarity Score

In [38]:
# Item similarity matrix using Pearson correlation
item_similarity = matrix_norm.T.corr()
item_similarity.head()

product_id,B000JLNBW4,B000MPMHMW,B002MPLYEW,B002UYSHMM,B004MX8XO6,B004SUXZ5U,B0055UBB4O,B005GM942C,B007V7G5TU,B0087PJE20,...,B00SNKU7VQ,B00TQYCU0Y,B00U05MVPM,B00U7XE03I,B00UF8SADQ,B00UYB67LG,B00VU7PGPQ,B00WE48O44,B00XJJU4SC,B00ZKLLZAI
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B000JLNBW4,1.0,,,,,,,,,,...,,,,,,,,,,
B000MPMHMW,,1.0,,,,,,,,,...,,,,,,,,,,
B002MPLYEW,,,1.0,,,,,,,,...,,,,,,,,,,
B002UYSHMM,,,,1.0,,,,,,,...,,,,,,,,,,
B004MX8XO6,,,,,1.0,,,,,,...,,,,,,,,,,


In [39]:
# Item similarity matrix using cosine similarity
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
item_similarity_cosine

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [40]:
# Pick a customer_ID
picked_customer_id = 113582.0

# Pick a product_id
picked_product_id = 'B000JLNBW4'

# Products that customer has bought
picked_customer_id_bought = pd.DataFrame(matrix_norm[picked_customer_id].dropna(axis=0, how='all').sort_values(ascending=False)).reset_index().rename(columns={1:'rating'})

picked_customer_id_bought.head()

Unnamed: 0,product_id,113582.0
0,B000JLNBW4,1.265625


In [42]:
# Item-based recommendation function
def item_based_rec(picked_customer_id=1, number_of_similar_items=5, number_of_recommendations =3):
    import operator
    # products that the target user has not bought
    picked_customer_id_unbought = pd.DataFrame(matrix_norm[picked_customer_id].isna()).reset_index()
    picked_customer_id_unbought = picked_customer_id_unbought[picked_customer_id_unbought[16511.0]==True]['product_id'].values.tolist()
    # products that the target user has bought
    picked_customer_id_bought1 = pd.DataFrame(matrix_norm[picked_customer_id].dropna(axis=0, how='all')\
                          .sort_values(ascending=False))\
                          .reset_index()
    picked_customer_id_bought=picked_customer_id_bought1.rename(columns={16511.0: "Amazingmeter"})
    picked_customer_id_bought.head()
    # List to save the recommended products
    recommended_products = []

    # Loop through products not bought          
    for picked_product in picked_customer_id_unbought: 
    # Calculate the similarity score of the picked product with other products
        picked_product_similarity_score = item_similarity[[picked_product]].reset_index().rename(columns={picked_product:'similarity_score'})
        # Rank the similarities between the products bought by picked customer and the picked product that was not bought.
        picked_customer_id_bought_similarity = pd.merge(left=picked_customer_id_bought, 
                                                right=picked_product_similarity_score, 
                                                on='product_id', 
                                                how='inner')\
                                        .sort_values('similarity_score', ascending=False)[:number_of_similar_items]
        # Calculate the predicted rating using weighted average of similarity scores and the ratings from customer
        predicted_rating = round(np.average(picked_customer_id_bought_similarity['Amazingmeter'], 
                                            weights=picked_customer_id_bought_similarity['similarity_score']), 6)
        # Add the recommended product to the list
        recommended_products.append(picked_product)
    # Return the top recommended products
    return recommended_products[:number_of_recommendations]

# Get recommendations
recommended_product = item_based_rec(picked_customer_id=16511.0, number_of_similar_items=20, number_of_recommendations =5)
recommended_product


['B000JLNBW4', 'B000MPMHMW', 'B002MPLYEW', 'B002UYSHMM', 'B004MX8XO6']