In [373]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Loading Data

In [374]:
# Load the metadata table from disk.
# NOTE(review): `dict` is not a regular pandas dtype; presumably it is
# coerced to a generic object column -- confirm this is what is intended.
baby_meta_df = pd.read_csv(
    'baby_meta.csv',
    dtype={'salesRank': dict},
)

In [375]:
# Load the ratings file; column names are supplied explicitly via `names`.
baby_rating_df = pd.read_csv(
    'ratings_Baby.csv',
    names=['reviewerID', 'asin', 'overall', 'unixReviewTime'],
)

In [376]:
# Load the cluster assignments and keep only the asin -> clusterId mapping.
baby_cluster_df = pd.read_csv('fake_baby_cluster.csv')[
    ['asin', 'clusterId']
]

# Finding Cluster Attributes

## Product Ratings

In [377]:
def create_product_rating_count_df(rating_x_meta_df):
    """Count the non-null ``overall`` values for each ``asin``.

    Parameters
    ----------
    rating_x_meta_df : pandas.DataFrame
        Frame with at least the columns ``asin`` and ``overall``.

    Returns
    -------
    pandas.DataFrame
        Single column ``rating_count`` indexed by ``asin``; the index
        labels are converted to ``str``.
    """
    counts_by_asin = rating_x_meta_df.groupby('asin')['overall'].count()
    return counts_by_asin.to_frame(name='rating_count').rename(index=str)
    

In [378]:
def create_product_avg_rating_df(rating_x_meta_df):
    """Average the ``overall`` values for each ``asin``.

    Parameters
    ----------
    rating_x_meta_df : pandas.DataFrame
        Frame with at least the columns ``asin`` and ``overall``.

    Returns
    -------
    pandas.DataFrame
        Single column ``avg_rating`` indexed by ``asin``; the index
        labels are converted to ``str``.
    """
    mean_by_asin = rating_x_meta_df.groupby('asin')['overall'].mean()
    return mean_by_asin.to_frame(name='avg_rating').rename(index=str)

In [379]:
def create_product_rating_summary_df(rating_df, meta_df):
    """Build a rating summary with one row per row of ``meta_df``.

    Parameters
    ----------
    rating_df : pandas.DataFrame
        Ratings with at least the columns ``asin`` and ``overall``.
    meta_df : pandas.DataFrame
        Metadata with an ``asin`` column; it determines the output rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``asin``, ``rating_count`` and ``avg_rating``. Rows of
        ``meta_df`` whose ``asin`` has no ratings get NaN in both
        aggregate columns (left join).
    """
    # The aggregates only read `asin` and `overall`, so aggregate the ratings
    # directly. Joining every metadata column onto every rating row first is
    # costly, would multiply rating rows (inflating the counts) if `meta_df`
    # held a duplicated asin, and would raise if the two frames shared a
    # column name other than the join key.
    ratings = rating_df[['asin', 'overall']]
    count_df = create_product_rating_count_df(ratings)
    avg_df = create_product_avg_rating_df(ratings)
    return meta_df[['asin']].join(count_df, on='asin').join(avg_df, on='asin')

In [380]:
# Per-asin rating count and average, aligned to the metadata rows.
baby_product_rating_summary_df = create_product_rating_summary_df(
    baby_rating_df,
    baby_meta_df,
)

## Cluster Rating Attributes

In [381]:
def theil_index(array):
    """Compute the Theil T index of the values in ``array``.

    The index is ``(1 / N) * sum((x_i / mean) * log(x_i / mean))`` taken
    over the non-NaN entries, where ``mean`` is their arithmetic mean and
    ``N`` their count.

    Parameters
    ----------
    array : array-like of numbers
        A pandas Series, numpy array or plain sequence. NaN entries are
        ignored.

    Returns
    -------
    float
        The Theil index, or NaN when it is undefined (no non-NaN values,
        or a mean of zero).
    """
    values = np.asarray(array, dtype=float)
    # Drop missing values explicitly so numpy arrays behave like pandas
    # Series (whose reductions skip NaN implicitly).
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan

    mean = values.mean()
    if mean == 0:
        # Every ratio would be a division by zero.
        return np.nan

    ratio = values / mean
    # log(0) is -inf and 0 * -inf is NaN; silence the warnings here and let
    # nansum treat those terms as 0, the limit of x*log(x) as x -> 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = ratio * np.log(ratio)
    return np.nansum(terms / values.size)

In [382]:
def create_cluster_avg_rating_df(cluster_x_rating_summary_df):
    """Average the ``avg_rating`` column within each ``clusterId``.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``avg_rating``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_avg_rating`` indexed by ``clusterId``; the
        index labels are converted to ``str``.
    """
    grouped_rating = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating']
    cluster_mean = grouped_rating.mean().rename('cluster_avg_rating')
    return pd.DataFrame(cluster_mean).rename(index=str)

In [383]:
def create_cluster_rating_var_df(cluster_x_rating_summary_df):
    """Compute the variance of ``avg_rating`` within each ``clusterId``.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``avg_rating``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_avg_rating_var`` indexed by ``clusterId``;
        the index labels are converted to ``str``.
    """
    grouped_rating = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating']
    variance = grouped_rating.var()
    return variance.to_frame(name='cluster_avg_rating_var').rename(index=str)

In [384]:
def create_cluster_rating_theil_index_df(cluster_x_rating_summary_df):
    """Apply ``theil_index`` to the ``avg_rating`` values of each ``clusterId``.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``avg_rating``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_rating_theil_index`` indexed by
        ``clusterId``; the index labels are converted to ``str``.
    """
    grouped_rating = cluster_x_rating_summary_df.groupby('clusterId')['avg_rating']
    theil_by_cluster = grouped_rating.apply(theil_index)
    renamed = theil_by_cluster.to_frame().rename(
        columns={"avg_rating": "cluster_rating_theil_index"}
    )
    return renamed.rename(index=str)

In [385]:
def create_cluster_rating_count_df(cluster_x_rating_summary_df):
    """Sum the ``rating_count`` column within each ``clusterId``.

    Parameters
    ----------
    cluster_x_rating_summary_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``rating_count``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_rating_count`` indexed by ``clusterId``;
        the index labels are converted to ``str``.
    """
    grouped_count = cluster_x_rating_summary_df.groupby('clusterId')['rating_count']
    totals = grouped_count.sum()
    return totals.to_frame(name='cluster_rating_count').rename(index=str)

In [386]:
def create_cluster_rating_summary_df(cluster_df, product_rating_summary_df):
    """Combine the per-cluster rating statistics into one frame.

    Attaches ``clusterId`` to each row of ``product_rating_summary_df`` by
    joining on ``asin``, then computes the per-cluster average, total
    rating count, variance and Theil index.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Frame with the columns ``asin`` and ``clusterId``.
    product_rating_summary_df : pandas.DataFrame
        Frame with the columns ``asin``, ``rating_count`` and ``avg_rating``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with ``clusterId`` as a regular column,
        followed by the average, count, variance and Theil-index columns.
    """
    cluster_by_asin = cluster_df.set_index('asin')
    clustered = product_rating_summary_df.join(cluster_by_asin, on='asin')

    # Order here fixes the column order of the result.
    builders = (
        create_cluster_avg_rating_df,
        create_cluster_rating_count_df,
        create_cluster_rating_var_df,
        create_cluster_rating_theil_index_df,
    )
    summary, *remaining = [build(clustered) for build in builders]
    for part in remaining:
        summary = summary.join(part)
    return summary.reset_index()



In [387]:
# Per-cluster rating statistics.
baby_cluster_rating_summary_df = create_cluster_rating_summary_df(
    baby_cluster_df,
    baby_product_rating_summary_df,
)

## Cluster Price Attributes

In [397]:
def create_cluster_price_var_df(cluster_x_meta_df):
    """Compute the variance of ``price`` within each ``clusterId``.

    Parameters
    ----------
    cluster_x_meta_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``price``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_price_var`` indexed by ``clusterId``; the
        index labels are converted to ``str``.
    """
    grouped_price = cluster_x_meta_df.groupby('clusterId')['price']
    variance = grouped_price.var()
    return variance.to_frame(name='cluster_price_var').rename(index=str)

In [398]:
def create_cluster_price_theil_index_df(cluster_x_meta_df):
    """Apply ``theil_index`` to the ``price`` values of each ``clusterId``.

    Parameters
    ----------
    cluster_x_meta_df : pandas.DataFrame
        Frame with at least the columns ``clusterId`` and ``price``.

    Returns
    -------
    pandas.DataFrame
        Single column ``cluster_price_theil_index`` indexed by
        ``clusterId``; the index labels are converted to ``str``.
    """
    grouped_price = cluster_x_meta_df.groupby('clusterId')['price']
    theil_by_cluster = grouped_price.apply(theil_index)
    renamed = theil_by_cluster.to_frame().rename(
        columns={"price": "cluster_price_theil_index"}
    )
    return renamed.rename(index=str)

In [399]:
def create_cluster_price_summary_df(cluster_df, meta_df):
    """Combine the per-cluster price statistics into one frame.

    Attaches ``clusterId`` to each row of ``meta_df`` by joining on
    ``asin``, then computes the per-cluster price variance and Theil index.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Frame with the columns ``asin`` and ``clusterId``.
    meta_df : pandas.DataFrame
        Frame with at least the columns ``asin`` and ``price``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with ``clusterId`` as a regular column,
        followed by ``cluster_price_var`` and ``cluster_price_theil_index``.
    """
    cluster_by_asin = cluster_df.set_index('asin')
    clustered_meta = meta_df.join(cluster_by_asin, on='asin')

    price_variance = create_cluster_price_var_df(clustered_meta)
    price_theil = create_cluster_price_theil_index_df(clustered_meta)

    combined = price_variance.join(price_theil)
    return combined.reset_index()

In [400]:
# Per-cluster price statistics.
baby_cluster_price_summary_df = create_cluster_price_summary_df(
    baby_cluster_df,
    baby_meta_df,
)

## Combine Attributes

In [401]:
# Attach the price statistics to the rating statistics, matching rows on
# clusterId (left join from the rating summary).
baby_cluster_summary_df = baby_cluster_rating_summary_df.join(
    baby_cluster_price_summary_df.set_index('clusterId'),
    on='clusterId',
)

In [402]:
# Preview the first five rows of the combined summary.
baby_cluster_summary_df.head(n=5)

Unnamed: 0,clusterId,cluster_avg_rating,cluster_rating_count,cluster_avg_rating_var,cluster_rating_theil_index,cluster_price_var,cluster_price_theil_index
0,0,4.074215,8899.0,0.960449,0.033978,5778.215544,0.749301
1,1,4.027458,8235.0,1.092041,0.03995,7037.698323,0.788013
2,2,4.068809,7313.0,1.016189,0.03625,5599.692775,0.732535
3,3,4.126346,7476.0,0.946893,0.03298,7322.857408,0.784256
4,4,4.090006,7937.0,1.002177,0.035138,4900.027766,0.714846


# Output CSV

In [None]:
# Write the combined summary to disk. The row index is written as the first
# (unnamed) column, which is the `to_csv` default made explicit here.
baby_cluster_summary_df.to_csv(
    'baby_cluster_summary.csv',
    index=True,
)