In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Loading Data

In [3]:
# Load the product metadata table.
# `salesRank` is read as a generic object column. The original passed the
# built-in `dict` class here, which is not a dtype specifier; `object` is the
# explicit way to request an untyped column.
# NOTE(review): presumably salesRank holds dict-like text -- confirm against the CSV.
baby_meta_df = pd.read_csv('baby_meta.csv', dtype={'salesRank': object})

In [4]:
# Keep only the product id and its assigned cluster from the clustering output.
baby_cluster_df = pd.read_csv('baby_4000_cluster_df.csv').loc[:, ['asin', 'clusterId']]

In [5]:
# Number of products (non-null asins) per cluster.
# The cluster labels are converted to strings before being turned back into a column.
products_count_df = (
    baby_cluster_df
    .groupby('clusterId')
    .count()
    .rename(index=str, columns={"asin": "product_count"})
    .reset_index()
)

# Finding Cluster Attributes

## Product Summary

In [13]:
# Per-product rating summary: keep the id, the number of ratings and the mean rating.
baby_product_rating_summary_df = (
    pd.read_csv('baby_product_rating_summary.csv')
    .loc[:, ['asin', 'rating_count', 'avg_rating']]
)

In [16]:
# One row per rated product: rating summary, then metadata, then cluster assignment.
# Both joins are left joins keyed on 'asin', so products missing from the metadata
# or the cluster table keep their row with NaN in the added columns.
baby_product_summary_df = (
    baby_product_rating_summary_df
    .join(baby_meta_df.set_index('asin'), on='asin')
    .join(baby_cluster_df.set_index('asin'), on='asin')
)

## Cluster rating attributes

In [17]:
def theil_index(array):
    """Compute the Theil T index of a collection of numeric values.

    The index is ``(1/N) * sum((x_i / mean) * ln(x_i / mean))`` where ``N`` and
    ``mean`` are taken over the non-missing values only.

    Parameters
    ----------
    array : array-like
        Numeric values (list, NumPy array or pandas Series). NaN entries are
        ignored.

    Returns
    -------
    float
        The Theil index. Returns ``0.0`` when there are no non-missing values,
        which is what the previous Series-based implementation produced for
        all-NaN groups.
    """
    # Work on a plain float array so lists, ndarrays and Series behave the same.
    values = np.asarray(array, dtype=float).ravel()
    values = values[~np.isnan(values)]
    count = values.size
    if count == 0:
        return 0.0

    mean = values.mean()
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = values / mean
        terms = ratio * np.log(ratio)
        # A zero ratio contributes 0 (the limit of r*ln(r) as r -> 0), and
        # undefined terms (non-positive ratios) are dropped, matching the
        # NaN-skipping sum the Series version relied on.
        terms = np.where(ratio > 0, terms, 0.0)
    return float(np.sum(terms / count))

In [18]:
def create_cluster_avg_rating_df(cluster_x_rating_summary_df):
    """Return the mean of `avg_rating` per cluster.

    The result is a one-column frame (`cluster_avg_rating`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    per_cluster_mean = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating'].mean()
    mean_frame = per_cluster_mean.to_frame()
    return mean_frame.rename(index=str, columns={"avg_rating": "cluster_avg_rating"})

In [19]:
def create_cluster_rating_var_df(cluster_x_rating_summary_df):
    """Return the variance of `avg_rating` per cluster.

    The result is a one-column frame (`cluster_avg_rating_var`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    per_cluster_var = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating'].var()
    var_frame = per_cluster_var.to_frame()
    return var_frame.rename(index=str, columns={"avg_rating": "cluster_avg_rating_var"})

In [20]:
def create_cluster_rating_theil_index_df(cluster_x_rating_summary_df):
    """Return `theil_index` applied to `avg_rating` within each cluster.

    The result is a one-column frame (`cluster_rating_theil_index`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    ratings_by_cluster = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating']
    theil_frame = ratings_by_cluster.apply(theil_index).to_frame()
    return theil_frame.rename(index=str, columns={"avg_rating": "cluster_rating_theil_index"})

In [21]:
def create_cluster_rating_count_df(cluster_x_rating_summary_df):
    """Return the sum of `rating_count` per cluster.

    The result is a one-column frame (`cluster_rating_count`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    per_cluster_total = cluster_x_rating_summary_df.groupby('clusterId')['rating_count'].sum()
    total_frame = per_cluster_total.to_frame()
    return total_frame.rename(index=str, columns={"rating_count": "cluster_rating_count"})

In [22]:
def create_cluster_rating_summary_df(cluster_df, product_rating_summary_df):
    """Build the per-cluster rating summary table.

    `cluster_df` is left-joined with `product_rating_summary_df` on 'asin', then
    the per-cluster mean, total rating count, variance and Theil index frames are
    computed and joined side by side (in that column order). `clusterId` is
    returned as a regular column.
    """
    ratings_by_asin = product_rating_summary_df.set_index('asin')
    joined = cluster_df.join(ratings_by_asin, on='asin')

    builders = (
        create_cluster_avg_rating_df,
        create_cluster_rating_count_df,
        create_cluster_rating_var_df,
        create_cluster_rating_theil_index_df,
    )
    parts = [build(joined) for build in builders]

    summary = parts[0]
    for part in parts[1:]:
        summary = summary.join(part)
    return summary.reset_index()



In [23]:
# Per-cluster rating statistics for the baby-products clustering.
baby_cluster_rating_summary_df = create_cluster_rating_summary_df(
    cluster_df=baby_cluster_df,
    product_rating_summary_df=baby_product_rating_summary_df,
)

## Cluster price attributes

In [24]:
def create_cluster_price_var_df(cluster_x_meta_df):
    """Return the variance of `price` per cluster.

    The result is a one-column frame (`cluster_price_var`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    per_cluster_var = cluster_x_meta_df.groupby('clusterId')['price'].var()
    var_frame = per_cluster_var.to_frame()
    return var_frame.rename(index=str, columns={"price": "cluster_price_var"})

In [25]:
def create_cluster_price_theil_index_df(cluster_x_meta_df):
    """Return `theil_index` applied to `price` within each cluster.

    The result is a one-column frame (`cluster_price_theil_index`) indexed by
    `clusterId`, with the index labels converted to strings.
    """
    prices_by_cluster = cluster_x_meta_df.groupby('clusterId')['price']
    theil_frame = prices_by_cluster.apply(theil_index).to_frame()
    return theil_frame.rename(index=str, columns={"price": "cluster_price_theil_index"})

In [26]:
def create_cluster_price_summary_df(cluster_df, meta_df):
    """Build the per-cluster price summary table.

    `cluster_df` is left-joined with `meta_df` on 'asin'; the per-cluster price
    variance and price Theil index are then joined side by side and `clusterId`
    is returned as a regular column.
    """
    meta_by_asin = meta_df.set_index('asin')
    joined = cluster_df.join(meta_by_asin, on='asin')

    variance_part = create_cluster_price_var_df(joined)
    theil_part = create_cluster_price_theil_index_df(joined)

    summary = variance_part.join(theil_part)
    return summary.reset_index()

In [27]:
# Per-cluster price statistics for the baby-products clustering.
baby_cluster_price_summary_df = create_cluster_price_summary_df(
    cluster_df=baby_cluster_df,
    meta_df=baby_meta_df,
)

## Combine attributes

In [28]:
# Display limits for the summary tables shown below.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 800)

In [29]:
# One row per cluster: rating statistics, then price statistics, then product counts.
# All three tables carry clusterId as strings, so the keys line up without casting.
#baby_cluster_summary_df['clusterId'] = baby_cluster_summary_df['clusterId'].astype(int)
baby_cluster_summary_df = (
    baby_cluster_rating_summary_df
    .join(baby_cluster_price_summary_df.set_index('clusterId'), on='clusterId')
    .join(products_count_df.set_index('clusterId'), on='clusterId')
)

# Average number of ratings each product in the cluster received.
baby_cluster_summary_df['ratings per product'] = (
    baby_cluster_summary_df['cluster_rating_count']
    / baby_cluster_summary_df['product_count']
)

In [30]:
# Clusters ordered from lowest to highest average rating.
baby_cluster_summary_df.sort_values('cluster_avg_rating', ascending=True)

Unnamed: 0,clusterId,cluster_avg_rating,cluster_rating_count,cluster_avg_rating_var,cluster_rating_theil_index,cluster_price_var,cluster_price_theil_index,product_count,ratings per product
1017,1017,1.000000,5.0,,0.000000,,0.000000e+00,1,5.000000
3697,3697,1.000000,6.0,,0.000000,,0.000000e+00,1,6.000000
1191,1191,1.000000,5.0,,0.000000,,0.000000e+00,1,5.000000
3117,3117,1.266667,15.0,,0.000000,,0.000000e+00,1,15.000000
1372,1372,1.307692,13.0,,0.000000,,0.000000e+00,1,13.000000
3963,3963,1.333333,6.0,,0.000000,,0.000000e+00,1,6.000000
1622,1622,1.375000,8.0,,0.000000,,0.000000e+00,1,8.000000
1414,1414,1.400000,5.0,,0.000000,,0.000000e+00,1,5.000000
1856,1856,1.666667,6.0,,0.000000,,0.000000e+00,1,6.000000
3313,3313,1.666667,6.0,,0.000000,,0.000000e+00,1,6.000000


In [31]:
#baby_cluster_df.loc[baby_cluster_df['clusterId'].isin(filtered['clusterId'].values.astype(int))]\
#.to_csv('filtered_baby_cluster_df.csv')

# Viewing Clusters and Products of interest

In [33]:
# Widen the notebook container and let wide frames and long text cells render in full.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

display(HTML("<style>.container { width:80% !important; }</style>"))
pd.set_option('display.expand_frame_repr', False)
# Use the fully-qualified option name instead of relying on pattern matching of 'max_colwidth'.
pd.set_option('display.max_colwidth', 300)

## View single product ('asin')

In [171]:
# Show the summary row(s) for one product, selected by its asin string.
baby_product_summary_df[baby_product_summary_df['asin'].eq('0980027500')]

Unnamed: 0.1,asin,rating_count,avg_rating,Unnamed: 0,categories,description,title,price,imUrl,brand,related,salesRank
10,980027500,12.0,4.583333,10,[['Baby']],This calendar provides busy parents with a simple way to note both big and little moments from baby's first year. A ...,Nature's Lullabies First Year Sticker Calendar,9.99,http://ecx.images-amazon.com/images/I/31PYzNf0RBL._SY300_.jpg,,"{'also_bought': ['0980027594', '0980027586', '0307342301', 'B003NMTJGS', 'B004LE8TAE', '0307461971', '1593596103', '...",


## View products by cluster ('clusterId')

In [34]:
# Show every product assigned to one cluster.
baby_product_summary_df[baby_product_summary_df['clusterId'].eq(1928)]

Unnamed: 0.1,asin,rating_count,avg_rating,Unnamed: 0,categories,description,title,price,imUrl,brand,related,salesRank,clusterId
462,B00005BU51,99.0,4.525253,462,[['Baby']],"""Size It"" Closet Organizers: These size dividers make it simple to keep the whole wardrobe neat and orderly. Five blank plastic size dividers with removable size labels from Newborn to Size 8. Made in USA.","Baby Buddy Size-It Closet Organizers, Blue, 5 Pack",6.55,http://ecx.images-amazon.com/images/I/51zjAMMZXPL._SY300_.jpg,Baby Buddy,"{'also_bought': ['B0056KOIFW', 'B003Y737DS', 'B00FXNACTI', 'B00IB7DPII', 'B002UD65XQ', 'B0084DQFPE', 'B007ZZCRJ0', 'B009EDSWJA', 'B005MI648C', 'B00295MQLU', 'B0037KMSVO', 'B0045I6IA4', 'B007OBW9N4', 'B00DGN23UI', 'B0009JB3A8', 'B004L5ZW6W', 'B004HM4648', 'B000XV7ORI', 'B000RI8Y30', 'B0038JDVCE',...",,1928.0
3514,B000E1PTSI,10.0,4.8,3514,[['Baby']],mud pie&#xBF; Baby Prince First Tooth and Curl Treasure Box Set. Ceramic hinged keepsake boxes with fired silver accents.,Mud Pie Baby Prince First Tooth and Curl Treasure Box Set,19.95,http://ecx.images-amazon.com/images/I/31GKTk1tmAL._SX300_.jpg,Mud Pie,"{'also_bought': ['B000E1W8I2', 'B000STQ6Y2', 'B000J3IFIW', 'B0036ZBG7W', 'B006WSYZ8W', 'B000SHTK9C', 'B009M87G3A', 'B008OPCRJU', 'B00BJW9QV2', 'B001TQPC7K', 'B000SHX09I', 'B001CZESWI', 'B00BGF4JY6', 'B00AWAF9QS', 'B006Z6E8AG', 'B0038KNAIS', 'B0055PIQH4', 'B000H612Y0', 'B000CSBP3G', 'B000KFWWLU',...",,1928.0
3516,B000E1W8I2,5.0,4.6,3516,[['Baby']],mud pie&#xBF; baby Princess First Tooth and Curl Treasure Box Set. Ceramic hinged keepsake boxes with fired silver accents.,Mud Pie Baby Princess First Tooth and Curl Treasure Box Set,20.77,http://ecx.images-amazon.com/images/I/41ThLcVW2-L._SY300_.jpg,Mud Pie,"{'also_bought': ['B000E1PTSI', 'B001TQT4ME', 'B00BJW9QV2', 'B000STQ6Y2', 'B000J3IFIW', 'B001CZESWI', 'B00F0HNMVM', 'B0036ZBG7W', 'B00B4I5E9Y', 'B00005NK1F', 'B00EWSV1D6', 'B009M87G3A', 'B000TQ0XIO', 'B008OPCRJU', 'B00AWAF9QS', 'B001F04GQS', 'B0032LVZ8K', 'B00740HN7W', 'B00205JY78', 'B000SHTK6U',...",,1928.0
8575,B000SHTK6U,5.0,5.0,8575,[['Baby']],mud pie&#xBF; Baby Princess First Brush and Toothpaste Treasure Box Set. Ceramic with fired silver accents on these adorable hinged treasure boxes for Princess' first tooth and curl.,Mud Pie Baby Princess First Brush and Toothpaste Treasure Box Set,21.96,http://ecx.images-amazon.com/images/I/31cLhSiMW0L._SX300_.jpg,Mud Pie,"{'also_bought': ['B000SHX09I', 'B001TQT4ME', 'B000STQ6Y2', 'B000J3IFIW', 'B000TQ0XIO', 'B000E1W8I2', 'B0018791CC', 'B002W5HN0G', 'B00CGJ30FK', 'B006FSK0PQ', 'B0002AZ080', 'B003CH8W80', 'B0036ZBG7W', 'B000E1PTSI', 'B006VT0MJS', 'B004LVP0N6', 'B008KR0QZE', 'B009G6TARI', 'B000GZJJ38', 'B000WIPN66']...",,1928.0
8583,B000SHX09I,5.0,4.2,8583,[['Baby']],mud pie&#xBF; Baby Prince First Brush and Toothpaste Treasure Box Set. Ceramic with fired silver accents on these adorable hinged treasure boxes for Prince's first tooth and curl.,Mud Pie Baby Prince First Brush and Toothpaste Treasure Box Set,17.05,http://ecx.images-amazon.com/images/I/31MDWPaWNiL._SX300_.jpg,Mud Pie,"{'also_bought': ['B000SHTK6U', 'B000E1PTSI', 'B0036ZBG7W', 'B000STQ6Y2', 'B000E1W8I2', 'B008CP293G', 'B001TQPC7K', 'B000KFWWLU', 'B000066665', 'B001579YRM', 'B003CT36NE'], 'also_viewed': ['B000E1PTSI', 'B000E1W8I2', 'B00BJW9QV2', 'B0055PIQH4', 'B000STQ6Y2', 'B00F0HNMVM', 'B003CH8W80', 'B00AWAF9Q...",,1928.0
30145,B003Y737DS,50.0,4.7,30145,[['Baby']],"""Size It"" Closet Organizers: These size dividers make it simple to keep the whole wardrobe neat and orderly. Five blank plastic size dividers with removable size labels from Newborn to Size 8. Made in USA.","Baby Buddy &quot;Size-It&quot; Closet Organizers, Pink",4.55,http://ecx.images-amazon.com/images/I/51anXMThppL._SY300_.jpg,Baby Buddy,"{'also_bought': ['B00005BU51', 'B0056KOIFW', 'B0076PHZJQ', 'B0037KMSVE', 'B00B8ZWNBU', 'B0045I6IA4', 'B00FQ5NT02', 'B004W7VUOM', 'B00IB7DPII', 'B00FXNAD6U', 'B003YB09R6', 'B00ILITQ7G', 'B00A40IOOA', 'B0037KMSUU', 'B009EDSWJA', 'B005MI648C', 'B00BDQSWCS', 'B004L5ZW6W', 'B002UD65XQ', 'B0009JB3A8',...",,1928.0
38518,B00516GACU,24.0,4.333333,38518,[['Baby']],"Treasure your little one's special moments and firsts in this Baby Memory Book that has everything you need to easily track your baby's development and story. Pages are labeled for everything you want to remember. The cover is green with dots, stripes and baby animals in every color. Give a gift...","C.R. Gibson First 5 Years Keepsake Baby Memory Book, Tiny Bundle",24.89,http://ecx.images-amazon.com/images/I/41bRBJyrDZL._SY300_.jpg,,"{'also_bought': ['B0053HSL9C', 'B009EDSWJA'], 'buy_after_viewing': ['B001AL7GOQ', 'B0053HSL9C', 'B004LE8TWM', 'B0034G5UZC']}",,1928.0
38533,B00516G8EA,9.0,4.555556,38533,[['Baby']],Treasure your little one's special moments and firsts in this Baby Memory Book that has everything you need to easily track your baby's development and story. Pages are labeled for everything you want to remember. Give a gift that will become a favorite keepsake.,"Carter's First 5 Years Keepsake Baby Memory Book, Flitter",24.89,http://ecx.images-amazon.com/images/I/41Wa-luyjcL._SY300_.jpg,,"{'also_viewed': ['B001AL7GOQ', 'B00AYZFM86', 'B0034G5UZC', 'B00AYZFGZA', 'B004LE8T9U', 'B004E79JVG', 'B000G82HAW', 'B00AYZFKME', 'B007K5LZ6Q', 'B00EIJ4U6Y', 'B002W5HN0G', 'B002AX5IR0', 'B00516GAW0', 'B00IK1TYTE', 'B00AYZFFQU', 'B00DCFZFNM', 'B0083QCAXS', 'B004LE8TWM', 'B004LE8TA4', '030746542X',...",,1928.0
53724,B0095KA2LI,8.0,4.75,53724,[['Baby']],"AboutBaby Buddy Size-It Closet Organizers - SAGE (2 pack)""Size It"" Closet Organizers make it simple to keep the whole wardrobe neat and orderly. 2 packs of five blank plastic size dividers with removable size labels from Newborn to Size 8.Made in USA.",Baby Buddy &quot;Size-It&quot; Closet Organizers - SAGE (10 count),10.16,http://ecx.images-amazon.com/images/I/41goxcClG7L._SY300_.jpg,,"{'also_bought': ['B0094U9S46', 'B0056KOIFW', 'B003Y737DS', 'B00005BU51', 'B005BUF8GK', 'B0037KMSVO', 'B00A40IOOA', 'B004L5ZW6W', 'B009BBD1LE', 'B002UD65XQ', 'B000138GNY', 'B002UHJDAY', 'B0009JB3A8', 'B000RO3FI8', 'B0040QVFCM', 'B00I0NKG2Q', 'B00D4LFDX6', '0345492595', 'B00HT6E6GW', 'B00295MQLU',...",,1928.0


# Output csv

In [175]:
# Persist both summary tables. With the default settings the frame index is
# written as the first, unnamed column of each file.
baby_cluster_summary_df.to_csv(path_or_buf='baby_cluster_summary.csv')
baby_product_summary_df.to_csv(path_or_buf='baby_product_summary.csv')