# Finding a relationship between the category of products and reviews of customers

In [99]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Load the four Olist CSV tables used below (paths are relative to the notebook).
df_reviews, df_orders, df_items, df_products = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/olist_order_reviews_dataset.csv',
        'data/olist_orders_dataset.csv',
        'data/olist_order_items_dataset.csv',
        'data/olist_products_dataset.csv',
    )
)

In [28]:
# Show the dtype of every column in the reviews table.
df_reviews.dtypes

review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object

In [29]:
# Show the dtype of every column in the orders table.
df_orders.dtypes

order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

In [30]:
# Show the dtype of every column in the order-items table.
df_items.dtypes

order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

In [31]:
# Show the dtype of every column in the products table.
df_products.dtypes

product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

In [45]:
# Attach a product category to each review score by chaining left joins:
# review -> order -> order item -> product.
# The order-items join is one-to-many, so a review score is repeated once per
# item row of its order (rows 0 and 1 of the preview are such a repeat).
# NOTE(review): the join with df_orders[['order_id']] contributes no new
# columns; it is kept here so the result is unchanged.
df = (
    df_reviews[['order_id', 'review_score']]
    .merge(df_orders[['order_id']], on='order_id', how='left')
    .merge(df_items[['order_id', 'product_id']], on='order_id', how='left')
    .merge(
        df_products[['product_id', 'product_category_name']],
        on='product_id',
        how='left',
    )
)
df.head()

Unnamed: 0,order_id,review_score,product_id,product_category_name
0,73fc7af87114b39712e6da79b0a377eb,4,fd25ab760bfbba13c198fa3b4f1a0cd3,esporte_lazer
1,73fc7af87114b39712e6da79b0a377eb,4,fd25ab760bfbba13c198fa3b4f1a0cd3,esporte_lazer
2,a548910a1c6147796b98fdf73dbeba33,5,be0dbdc3d67d55727a65d4cd696ca73c,informatica_acessorios
3,f9e4b658b201a9f2ecdecbb34bed034b,5,d1c427060a0f73f6b889a5c7c61f2ac4,informatica_acessorios
4,658677c97b385a9be170737859d3511b,5,52c80cedd4e90108bf4fa6a206ef6b03,ferramentas_jardim


In [71]:
# Keep only the two columns the analysis needs: the score and the category.
analysis_columns = ['review_score', 'product_category_name']
dfnew = df[analysis_columns]
dfnew.head()

Unnamed: 0,review_score,product_category_name
0,4,esporte_lazer
1,4,esporte_lazer
2,5,informatica_acessorios
3,5,informatica_acessorios
4,5,ferramentas_jardim


In [75]:
# Drop every row that has a missing value in any column; after the left joins
# above, these are the rows for which no product category could be attached.
# (dropna defaults to axis=0, how='any'.)
dfnew = dfnew.dropna()

In [79]:
# Sort data by category
# Move the category into the index only while it is still a column, so that
# re-running this cell does not raise a KeyError.
if 'product_category_name' in dfnew.columns:
    dfnew = dfnew.set_index('product_category_name')
# sort_values returns a new frame rather than sorting in place; assign it back,
# otherwise dfnew stays in its original (unsorted) order.
dfnew = dfnew.sort_values(by='product_category_name')
dfnew

Unnamed: 0_level_0,review_score
product_category_name,Unnamed: 1_level_1
agro_industria_e_comercio,1
agro_industria_e_comercio,5
agro_industria_e_comercio,4
agro_industria_e_comercio,3
agro_industria_e_comercio,5
...,...
utilidades_domesticas,4
utilidades_domesticas,5
utilidades_domesticas,5
utilidades_domesticas,3


In [80]:
# Tally how many rows carry each review score. review_score is the only column
# of dfnew at this point (the category lives in the index), so the counts are
# per score value.
dfnew.value_counts()

review_score
5               62782
4               21080
1               14488
3                9442
2                3918
dtype: int64

List the categories in alphabetical order, together with the number of rows (reviewed order items) in each

In [81]:
# Number of rows in dfnew for each product category.
category_sizes = dfnew.pivot_table(index='product_category_name', aggfunc='size')
category_sizes

product_category_name
agro_industria_e_comercio     212
alimentos                     510
alimentos_bebidas             280
artes                         209
artes_e_artesanato             24
                             ... 
sinalizacao_e_seguranca       199
tablets_impressao_imagem       83
telefonia                    4550
telefonia_fixa                265
utilidades_domesticas        6989
Length: 73, dtype: int64

In [112]:
# Mean review score per product category (one output row per category).
avgscore = dfnew.groupby('product_category_name').mean()
avgscore.head()

Unnamed: 0_level_0,review_score
product_category_name,Unnamed: 1_level_1
agro_industria_e_comercio,4.0
alimentos,4.145098
alimentos_bebidas,4.303571
artes,3.91866
artes_e_artesanato,4.125


In [114]:
# Rank the categories from the best to the worst average review score.
avgscore = avgscore.sort_values('review_score', ascending=False)
avgscore

Unnamed: 0_level_0,review_score
product_category_name,Unnamed: 1_level_1
cds_dvds_musicais,4.642857
fashion_roupa_infanto_juvenil,4.500000
livros_interesse_geral,4.439421
livros_importados,4.400000
construcao_ferramentas_ferramentas,4.359223
...,...
casa_conforto_2,3.366667
pc_gamer,3.333333
portateis_cozinha_e_preparadores_de_alimentos,3.266667
fraldas_higiene,3.256410


In [122]:
# Report the best- and worst-rated categories.
# The extremes are looked up by value (idxmax / idxmin) rather than by position,
# so the result does not depend on whether, or in which direction, an earlier
# cell sorted avgscore.
mean_scores = avgscore['review_score']
print("Highest average", avgscore.loc[mean_scores.idxmax()])
print()
print("Lowest average", avgscore.loc[mean_scores.idxmin()])

Highest average review_score    4.642857
Name: cds_dvds_musicais, dtype: float64

Lowest average review_score    2.5
Name: seguros_e_servicos, dtype: float64


According to Google Translate, these are "CDs, DVDs, 