In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Loading Data

In [2]:
# Load the product metadata table.
# NOTE(review): `dict` is passed as the dtype for salesRank; pandas presumably
# treats it as a generic object column rather than parsing dicts — confirm.
baby_meta_df = pd.read_csv(
    'baby_meta.csv',
    dtype={'salesRank': dict},
)

In [3]:
# Load the ratings; column names are supplied explicitly, so the first row of
# the file is read as data rather than as a header.
baby_rating_df = pd.read_csv(
    'ratings_Baby.csv',
    names=['reviewerID', 'asin', 'overall', 'unixReviewTime'],
)

In [64]:
# Load the clustering result and keep only the product id and its cluster id.
baby_cluster_df = pd.read_csv('baby_4000_cluster_df.csv').loc[:, ['asin', 'clusterId']]

In [65]:
# Number of products in each cluster. `rename(index=str)` converts the
# clusterId index labels to strings before they become a column again.
products_count_df = (
    baby_cluster_df
    .groupby('clusterId')
    .count()
    .rename(index=str, columns={"asin": "product_count"})
    .reset_index()
)

# Finding Cluster Attributes

## Product Ratings

In [68]:
def create_product_rating_count_df(rating_x_meta_df):
    """Count the non-null `overall` ratings of every product.

    Parameters
    ----------
    rating_x_meta_df : pandas.DataFrame
        One row per rating; must contain the columns `asin` and `overall`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `asin` (labels converted to str) with a single column
        `rating_count`.
    """
    counts_per_product = rating_x_meta_df.groupby('asin')['overall'].count()
    return counts_per_product.to_frame().rename(
        index=str, columns={"overall": "rating_count"}
    )
    

In [69]:
def create_product_avg_rating_df(rating_x_meta_df):
    """Compute the mean `overall` rating of every product.

    Parameters
    ----------
    rating_x_meta_df : pandas.DataFrame
        One row per rating; must contain the columns `asin` and `overall`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `asin` (labels converted to str) with a single column
        `avg_rating`.
    """
    mean_per_product = rating_x_meta_df.groupby('asin')['overall'].mean()
    return mean_per_product.to_frame().rename(
        index=str, columns={"overall": "avg_rating"}
    )

In [70]:
def create_product_rating_summary_df(rating_df, meta_df):
    """Build one row per product in `meta_df` with its rating count and mean.

    The statistics only need the `asin` and `overall` columns of `rating_df`,
    so they are aggregated directly from the ratings. The previous version
    first joined the complete metadata frame onto every rating row, which was
    wasted work, inflated counts when `meta_df` held duplicate asins, and
    raised when a metadata column name collided with a rating column.

    Parameters
    ----------
    rating_df : pandas.DataFrame
        One row per rating; must contain the columns `asin` and `overall`.
    meta_df : pandas.DataFrame
        Product metadata; must contain the column `asin`.

    Returns
    -------
    pandas.DataFrame
        Columns `asin`, `rating_count`, `avg_rating`, with the same rows and
        index as `meta_df`. Products without any rating get NaN in both
        statistic columns (left join).
    """
    # Single pass over the ratings: count and mean of `overall` per product.
    per_product_stats = (
        rating_df.groupby('asin')['overall']
        .agg(['count', 'mean'])
        .rename(index=str, columns={'count': 'rating_count', 'mean': 'avg_rating'})
    )
    return meta_df[['asin']].join(per_product_stats, on='asin')

In [71]:
# Per-product rating count and average for every product in the metadata.
baby_product_rating_summary_df = create_product_rating_summary_df(
    rating_df=baby_rating_df,
    meta_df=baby_meta_df,
)

## Cluster rating attributes

In [72]:
def theil_index(array):
    """Return the Theil T index of `array`, ignoring NaN entries.

    T = (1/N) * sum((x_i / mean) * ln(x_i / mean)), where N and the mean are
    taken over the non-NaN values only.

    The previous version relied on pandas' NaN-skipping reductions, so it
    returned NaN for a plain numpy array containing NaN, failed on lists, and
    returned 0.0 ("perfect equality") for a pandas input with no valid values.

    Parameters
    ----------
    array : array-like of numbers
        Values to measure (pandas Series, numpy array, list, ...). NaN entries
        are dropped before the computation.

    Returns
    -------
    float
        The Theil T index, or NaN when there is no non-NaN observation.
        Terms that are undefined (for example 0 * log(0), or the log of a
        negative ratio) contribute nothing to the sum.
    """
    values = np.asarray(array, dtype=float)
    values = values[~np.isnan(values)]
    count = values.size
    if count == 0:
        # Nothing to measure: report "undefined" instead of a misleading 0.
        return np.nan
    mean = values.mean()
    # 0 * log(0) and similar undefined terms evaluate to NaN here; silence the
    # numpy warnings and let nansum drop them (limit x*log(x) -> 0 as x -> 0).
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = values / mean
        terms = (ratio * np.log(ratio)) / count
    return np.nansum(terms)

In [73]:
def create_cluster_avg_rating_df(cluster_x_rating_summary_df):
    """Average the per-product `avg_rating` within every cluster.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Must contain the columns `clusterId` and `avg_rating`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_avg_rating`.
    """
    mean_by_cluster = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating'].mean()
    return pd.DataFrame(mean_by_cluster).rename(
        index=str, columns={"avg_rating": "cluster_avg_rating"}
    )

In [75]:
def create_cluster_rating_var_df(cluster_x_rating_summary_df):
    """Compute the variance of the per-product `avg_rating` within every cluster.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Must contain the columns `clusterId` and `avg_rating`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_avg_rating_var`.
    """
    var_by_cluster = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating'].var()
    return var_by_cluster.to_frame().rename(
        index=str, columns={"avg_rating": "cluster_avg_rating_var"}
    )

In [76]:
def create_cluster_rating_theil_index_df(cluster_x_rating_summary_df):
    """Apply `theil_index` to the per-product `avg_rating` of every cluster.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Must contain the columns `clusterId` and `avg_rating`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_rating_theil_index`.
    """
    ratings_by_cluster = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating']
    theil_by_cluster = ratings_by_cluster.apply(theil_index)
    return theil_by_cluster.to_frame().rename(
        index=str, columns={"avg_rating": "cluster_rating_theil_index"}
    )

In [77]:
def create_cluster_rating_count_df(cluster_x_rating_summary_df):
    """Sum the per-product `rating_count` within every cluster.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Must contain the columns `clusterId` and `rating_count`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_rating_count`.
    """
    total_by_cluster = cluster_x_rating_summary_df.groupby('clusterId')['rating_count'].sum()
    return total_by_cluster.to_frame().rename(
        index=str, columns={"rating_count": "cluster_rating_count"}
    )

In [78]:
def create_cluster_rating_summary_df(cluster_df, product_rating_summary_df):
    """Assemble the rating attributes of every cluster into one frame.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Product-to-cluster assignment; must contain `asin` and `clusterId`.
    product_rating_summary_df : pandas.DataFrame
        Per-product rating statistics; must contain `asin`, `rating_count`
        and `avg_rating`.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with `clusterId` as a regular column, followed by
        the average, count, variance and Theil-index columns produced by the
        individual helper functions, in that order.
    """
    ratings_by_asin = product_rating_summary_df.set_index('asin')
    clustered_ratings = cluster_df.join(ratings_by_asin, on='asin')

    # Same evaluation and join order as the column order documented above.
    attribute_builders = (
        create_cluster_avg_rating_df,
        create_cluster_rating_count_df,
        create_cluster_rating_var_df,
        create_cluster_rating_theil_index_df,
    )
    summary, *remaining = [build(clustered_ratings) for build in attribute_builders]
    for attribute_df in remaining:
        summary = summary.join(attribute_df)
    return summary.reset_index()



In [79]:
# Rating attributes (average, count, variance, Theil index) for every cluster.
baby_cluster_rating_summary_df = create_cluster_rating_summary_df(
    cluster_df=baby_cluster_df,
    product_rating_summary_df=baby_product_rating_summary_df,
)

## Cluster price attributes

In [80]:
def create_cluster_price_var_df(cluster_x_meta_df):
    """Compute the variance of the product `price` within every cluster.

    Parameters
    ----------
    cluster_x_meta_df : pandas.DataFrame
        Must contain the columns `clusterId` and `price`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_price_var`.
    """
    price_var_by_cluster = cluster_x_meta_df.groupby('clusterId')['price'].var()
    return price_var_by_cluster.to_frame().rename(
        index=str, columns={"price": "cluster_price_var"}
    )

In [81]:
def create_cluster_price_theil_index_df(cluster_x_meta_df):
    """Apply `theil_index` to the product `price` values of every cluster.

    Parameters
    ----------
    cluster_x_meta_df : pandas.DataFrame
        Must contain the columns `clusterId` and `price`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `clusterId` (labels converted to str) with a single column
        `cluster_price_theil_index`.
    """
    prices_by_cluster = cluster_x_meta_df.groupby('clusterId')['price']
    theil_by_cluster = prices_by_cluster.apply(theil_index)
    return theil_by_cluster.to_frame().rename(
        index=str, columns={"price": "cluster_price_theil_index"}
    )

In [82]:
def create_cluster_price_summary_df(cluster_df, meta_df):
    """Assemble the price attributes of every cluster into one frame.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Product-to-cluster assignment; must contain `asin` and `clusterId`.
    meta_df : pandas.DataFrame
        Product metadata; must contain `asin` and `price`.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with `clusterId` as a regular column, followed by
        `cluster_price_var` and `cluster_price_theil_index`.
    """
    meta_by_asin = meta_df.set_index('asin')
    clustered_meta = cluster_df.join(meta_by_asin, on='asin')

    price_variance = create_cluster_price_var_df(clustered_meta)
    price_theil = create_cluster_price_theil_index_df(clustered_meta)

    combined = price_variance.join(price_theil)
    return combined.reset_index()

In [83]:
# Price attributes (variance, Theil index) for every cluster.
baby_cluster_price_summary_df = create_cluster_price_summary_df(
    cluster_df=baby_cluster_df,
    meta_df=baby_meta_df,
)

## Combine attributes

In [84]:
# Attach the price attributes and the product count to the rating attributes,
# matching rows on clusterId (left joins onto the rating summary).
baby_cluster_summary_df = (
    baby_cluster_rating_summary_df
    .join(baby_cluster_price_summary_df.set_index('clusterId'), on='clusterId')
    .join(products_count_df.set_index('clusterId'), on='clusterId')
)

In [85]:
# Show the combined per-cluster summary (rating, price and product-count attributes).
baby_cluster_summary_df

Unnamed: 0,clusterId,cluster_avg_rating,cluster_rating_count,cluster_avg_rating_var,cluster_rating_theil_index,cluster_price_var,cluster_price_theil_index,product_count
0,0,4.140543,662.0,0.051883,0.001392,9.415512,0.007669,10
1,1,4.442981,482.0,0.214724,0.004376,27.535138,0.251288,10
2,2,4.060606,13.0,1.011019,0.020963,3427.920000,0.278346,3
3,3,4.290602,837.0,0.405358,0.008425,368.793600,0.263914,6
4,4,4.099010,509.0,0.814364,0.025960,7012.123626,0.689178,22
5,5,3.940119,213.0,1.278179,0.040738,841.174432,0.501709,12
6,6,4.220201,376.0,0.697694,0.023239,203.473132,0.223083,47
7,7,3.710921,128.0,1.931169,0.080136,1970.271911,0.453278,27
8,8,4.250154,83.0,1.111293,0.036756,7769.784706,1.088962,18
9,9,3.858080,822.0,1.495098,0.056556,3700.001652,0.434921,33


In [95]:
# Notebook display tweaks: widen the page container and keep wide frames on
# one line with up to 80 characters per cell.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
pd.set_option('display.expand_frame_repr', False)
# Full option key instead of the abbreviated 'max_colwidth' pattern.
pd.set_option('display.max_colwidth', 80)

In [87]:
# Attach the cluster id to every metadata row (left join on asin, so products
# without a cluster assignment get NaN in clusterId).
baby_meta_cluster_df = baby_meta_df.join(
    baby_cluster_df.set_index('asin'),
    on='asin',
)

In [101]:
# Look up one product by asin to see which cluster it was assigned to.
baby_meta_cluster_df[baby_meta_cluster_df['asin'].eq('0980027500')]

Unnamed: 0.1,Unnamed: 0,asin,categories,description,title,price,imUrl,brand,related,salesRank,clusterId
10,10,980027500,[['Baby']],This calendar provides busy parents with a simple way to note both big and l...,Nature's Lullabies First Year Sticker Calendar,9.99,http://ecx.images-amazon.com/images/I/31PYzNf0RBL._SY300_.jpg,,"{'also_bought': ['0980027594', '0980027586', '0307342301', 'B003NMTJGS', 'B0...",,2894.0


In [102]:
# List every product that belongs to cluster 2894.
baby_meta_cluster_df[baby_meta_cluster_df['clusterId'].eq(2894)]

Unnamed: 0.1,Unnamed: 0,asin,categories,description,title,price,imUrl,brand,related,salesRank,clusterId
10,10,0980027500,[['Baby']],This calendar provides busy parents with a simple way to note both big and l...,Nature's Lullabies First Year Sticker Calendar,9.99,http://ecx.images-amazon.com/images/I/31PYzNf0RBL._SY300_.jpg,,"{'also_bought': ['0980027594', '0980027586', '0307342301', 'B003NMTJGS', 'B0...",,2894.0
68,68,6151527682,[['Baby']],A girl's First Holy Communion is one of the most important events in their l...,Set of Girls First Holy Communion Veil Wreath and White Satin Purse with Cha...,,http://ecx.images-amazon.com/images/I/410ZcCJHM2L._SX300_.jpg,,"{'also_viewed': ['B0075LS9PK', 'B0071BD210', 'B000AL9CGM', 'B0039OFLKS', 'B0...",,2894.0
1289,1289,B00021D2UW,[['Baby']],"Musti Eau de Soin Spray is a soothing, refreshing scent that softens the ski...",Musti Eau De Soin 3.38 fl oz,18.99,http://ecx.images-amazon.com/images/I/318A4774J2L._SY300_.jpg,Mustela,"{'also_bought': ['B002JXJ9J4', 'B00021EA9Y', 'B0072CVIQE', 'B0011GD5CM', 'B0...",,2894.0
4795,4795,B000GK7BVA,[['Baby']],"Hug & Feel, Shake & Chew, Clip and attach linko shapes together to create an...",Kushies Zolo Linko,19.83,http://ecx.images-amazon.com/images/I/41AjPYzqraL._SY300_.jpg,Kushies,"{'also_bought': ['B000I66ALO', 'B001CZ8O64', 'B000GK5ZCM', 'B0001NEAB6', 'B0...",,2894.0
9448,9448,B000VYRJMC,[['Baby']],"Disney Pooh ""Delightful Day"" Wall Hanging 3 Piece Set features that silly ol...",Disney Pooh Wall Hanging Delightful Day,18.83,http://ecx.images-amazon.com/images/I/41lndnzXkJL._SY300_.jpg,,"{'also_bought': ['B000VYX28M', 'B000VYX28C', 'B000W7O7DM', 'B008VPA77M'], 'b...",,2894.0
9520,9520,B000W7MIZQ,[['Baby']],"The Disney Pooh ""Delightful Day"" Wallpaper Border is a double roll of 30 fee...",Disney Pooh Wall Border Delightful Day,19.99,http://ecx.images-amazon.com/images/I/41ouKLzHe9L._SY300_.jpg,,"{'also_bought': ['B000VYX28M', 'B008VPA77M', 'B000VYX28C', 'B000W7O7DM'], 'b...",,2894.0
10824,10824,B00115S410,[['Baby']],Bright Starts Petal Pusher Carrier Toy Bar is a whimsical pink friend which ...,Bright Starts Petal Pusher Carrier Toy Bar,16.0,http://ecx.images-amazon.com/images/I/41UpvaQFjbL._SX300_.jpg,Bright Starts,"{'also_bought': ['B008J1QOI4', 'B00C78G0ZC', 'B005BW1RL8', 'B0088EUA82', 'B0...",,2894.0
19132,19132,B0029985J8,[['Baby']],This adorable Baby Lulu purple and orange floral print sundress is perfect f...,Baby Lulu Spring/Summer Girls Purple and Orange Floral Print Sundress-24 Months,36.0,http://ecx.images-amazon.com/images/I/412sGO11seL._SY300_.jpg,,"{'also_viewed': ['B0089ZUMSI', 'B00CBWTPWY', 'B00AG0WAP2', 'B00CBWTQ9Q', 'B0...",,2894.0
28601,28601,B003N64Z86,[['Baby']],The Jeep car seat cover brings fashion as well as practicality. It is design...,Jeep Car Seat Cover,9.97,http://ecx.images-amazon.com/images/I/51c3h352mbL._SY300_.jpg,,"{'also_bought': ['B00586AMP4'], 'also_viewed': ['B004JU0CFU'], 'buy_after_vi...",,2894.0
29100,29100,B003S0NTDE,[['Baby']],Bib Care Bears Interlock Assorted Case Pack 144Please note: If there is a co...,New - Bib Care Bears Interlock Assorted Case Pack 144 - 490512,9.99,http://ecx.images-amazon.com/images/I/415WmcqMXaL._SY300_.jpg,,"{'also_viewed': ['B002A21AAK', 'B00EYCHIPU', 'B0012G7QBC', 'B005CWDNDM', 'B0...",,2894.0


# Output CSV

In [None]:
# Persist the per-cluster summary; the DataFrame index is written as the
# first (unnamed) column because `index` is left at its default.
baby_cluster_summary_df.to_csv(path_or_buf='baby_cluster_summary.csv')