# DATA ANALYSIS - AMAZON REVIEWS
____
by Vitor Flisch Cavalanti<br>
May 2021

<b>Case study Sr. Business Analyst</b>

<hr><h2>1. Concatenating Reviews</h2><hr>

In [1]:
import pandas as pd
import numpy as np 
import glob 

In [2]:
# loading all reviews*.csv
# Forward slashes work on every OS (and match the other paths used in this
# notebook); the pattern is built directly so no `os` import is required.
path = "../exports"
# glob returns matches in arbitrary order, so sort to keep the concatenated
# row order reproducible between runs and machines.
all_files = sorted(glob.glob(path + "/reviews*.csv"))
if not all_files:
    # fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("no reviews*.csv files found in " + path)
# lazily read each file; pd.concat consumes the generator
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)

In [3]:
# print a summary of the concatenated frame: row count, columns, dtypes, memory
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6781787 entries, 0 to 6781786
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Unnamed: 0     int64  
 1   reviewerID     object 
 2   asin           object 
 3   helpful        object 
 4   overall        float64
 5   reviewTime     object 
 6   helpful_score  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 362.2+ MB


In [4]:
# non-null values per column; a count below the row total means missing data
# (in the recorded run only helpful_score is incomplete)
concatenated_df.count()

Unnamed: 0       6781787
reviewerID       6781787
asin             6781787
helpful          6781787
overall          6781787
reviewTime       6781787
helpful_score    3992217
dtype: int64

In [5]:
# persist the combined reviews in the final folder
# (index=True is the to_csv default, spelled out here: the row index is
# written as an extra leading unnamed column)
concatenated_df.to_csv("../exports/final/reviews_concatenated.csv", index=True)

In [6]:
# keep only the product id column, still as a one-column DataFrame
asin_df = concatenated_df.loc[:, ['asin']]

In [7]:
# one row per product id: keep the first occurrence of every asin
asin_df = asin_df.drop_duplicates(subset=['asin'], keep='first')

In [8]:
# checking number of unique products with reviews
# (asin_df holds one row per distinct asin at this point, so the non-null
# count of the `asin` column is the number of reviewed products)
asin_df.count()

asin    113052
dtype: int64

In [9]:
# load the full product metadata export
metadata = pd.read_csv(filepath_or_buffer="../exports/metadata_full.csv")

In [10]:
# inner join on asin: keep metadata only for products that have reviews
merge = asin_df.merge(metadata, how='inner', on='asin')

In [11]:
# non-null values per column of the joined metadata
# (in the recorded run the asin count equals the number of unique reviewed
# products, i.e. the inner join neither dropped nor duplicated products)
merge.count()

asin                  113052
Unnamed: 0            113052
title                  91486
price                 106852
brand                  36499
salesrank_category    113052
salesrank_value       113052
main_category         113052
dtype: int64

In [13]:
# write the reviewed-products metadata to the final folder
# (index=True is the to_csv default, spelled out here: the row index is
# written as an extra leading unnamed column)
merge.to_csv("../exports/final/metadata_final.csv", index=True)
# unbind the intermediate frames so their memory can be reclaimed
del merge, metadata, asin_df

<hr><h2>2. Average Overall Rating by Year for Products Bought Together</h2><hr>

In [14]:
# derive a year column for the yearly aggregation: the first four characters
# of reviewTime (assumes reviewTime starts with the 4-digit year - TODO confirm)
concatenated_df['reviewYear'] = concatenated_df['reviewTime'].str[:4]

In [15]:
# mean `overall` rating per product and year, returned as a flat frame with
# columns asin / reviewYear / overall
df_agg = concatenated_df.groupby(['asin', 'reviewYear'], as_index=False)['overall'].mean()

In [16]:
# preview the first rows of the per-product / per-year averages
df_agg.head()

Unnamed: 0,asin,reviewYear,overall
0,5019281,2000,4.0
1,5019281,2001,4.0
2,5019281,2002,5.0
3,5019281,2003,5.0
4,5019281,2004,3.0


In [17]:
# load the related-products metadata export
metadata_related = pd.read_csv(filepath_or_buffer="../exports/metadata_related_full.csv")

In [18]:
# attach each product's yearly average rating to its related-products rows
# (inner join on asin)
metadata_join = metadata_related.merge(df_agg, how='inner', on='asin')

In [19]:
# prepare for the second join: the product's own columns get a `_1` suffix,
# and `bought_together` takes over the name `asin` so it can be used as the
# join key against df_agg. All three labels are mapped from their current
# names in one pass.
metadata_join.rename(
    columns={
        'asin': 'asin_1',
        'overall': 'overall_1',
        'bought_together': 'asin',
    },
    inplace=True,
)

In [21]:
# attach the bought-together product's average rating for the same year
# (inner join on asin + reviewYear)
metadata_join2 = metadata_join.merge(df_agg, how='inner', on=['asin', 'reviewYear'])

In [23]:
# give the final columns descriptive names, then save the result
final_column_names = {
    'asin_1': 'asin_prod',
    'asin': 'asin_bought_together',
    'overall_1': 'overall_prod',
    'overall': 'overall_bought_together',
}
metadata_join2.rename(columns=final_column_names, inplace=True)
# index=True is the to_csv default, spelled out here: the row index is written
# as an extra leading unnamed column
metadata_join2.to_csv("../exports/final/metadata_related.csv", index=True)
# unbind the large frames so their memory can be reclaimed
del final_column_names
del concatenated_df, df_agg, metadata_join, metadata_join2, metadata_related