# DATA ANALYSIS - AMAZON REVIEWS
____
by Vitor Flisch Cavalanti<br>
May 2021

<b>Case study Sr. Business Analyst</b>

<hr><h2>1. Concatenating Reviews</h2><hr>

In [2]:
import glob
import os

import numpy as np
import pandas as pd

In [3]:
# Load every reviews*.csv export and stack them into one DataFrame.
# The directory is assembled with os.path.join so the cell resolves the same
# folder on Windows and POSIX (a literal '..\exports' only works on Windows).
path = os.path.join('..', 'exports')
# glob returns matches in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible between runs.
all_files = sorted(glob.glob(os.path.join(path, "reviews*.csv")))
if not all_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"no reviews*.csv files found in {path!r}")
# Generator: each file is read only when pd.concat consumes it.
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df   = pd.concat(df_from_each_file, ignore_index=True)

In [4]:
# print a summary of the concatenated frame: index range, column dtypes and memory usage
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12284762 entries, 0 to 12284761
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Unnamed: 0     int64  
 1   reviewerID     object 
 2   asin           object 
 3   helpful        object 
 4   overall        float64
 5   reviewTime     object 
 6   file           object 
 7   helpful_score  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 749.8+ MB


In [5]:
# number of non-null values in each column
concatenated_df.count(axis=0)

Unnamed: 0       12284762
reviewerID       12284762
asin             12284762
helpful          12284762
overall          12284762
reviewTime       12284762
file             12284762
helpful_score     6571550
dtype: int64

In [6]:
# keep only the 'asin' and 'file' columns in a separate frame
asin_df = concatenated_df.loc[:, ['asin', 'file']]

In [7]:
# 'file' now lives in asin_df, so remove it from the reviews frame in place
concatenated_df.drop(columns=['file'], inplace=True)

In [8]:
# saving concatenated reviews to final folder
# index=False: the RangeIndex carries no information, and writing it adds an
# extra unnamed column (read back as "Unnamed: 0") to the exported file.
concatenated_df.to_csv("../exports/final/reviews_concatenated.csv", index=False)

In [9]:
# keep the first row of every distinct (asin, file) pair
asin_df = asin_df.drop_duplicates(subset=['asin', 'file'], keep='first')

In [10]:
# row count after de-duplication: one row per distinct (asin, file) pair, which
# equals the number of unique products only if no asin occurs in more than one file
asin_df.count(axis=0)

asin    481035
file    481035
dtype: int64

In [11]:
# load the full product metadata export
metadata = pd.read_csv(filepath_or_buffer="../exports/metadata_full.csv")

In [12]:
# inner join on 'asin': keep metadata only for the products listed in asin_df
merge = asin_df.merge(metadata, how='inner', on=['asin'])

In [14]:
# non-null values per column of the joined metadata
merge.count(axis=0)

asin                  481035
file                  481035
Unnamed: 0            481035
title                 377790
price                 459076
brand                  36517
salesrank_category    481035
salesrank_value       481035
main_category         481035
dtype: int64

In [15]:
# saving metadata to final folder
# index=False: the positional index carries no information, and writing it adds
# another unnamed column next to the "Unnamed: 0" the frame already contains.
merge.to_csv("../exports/final/metadata_final.csv", index=False)
# drop the names of the intermediate frames; they are not used again below
del merge, metadata, asin_df

<hr><h2>2. Average Overall Review by Year for Products Bought Together</h2><hr>

In [16]:
# year column for the later aggregation: the first four characters of reviewTime
concatenated_df['reviewYear'] = concatenated_df['reviewTime'].str[:4]

In [17]:
# mean 'overall' rating per (asin, reviewYear); as_index=False keeps the two
# grouping keys as ordinary columns
df_agg = concatenated_df.groupby(['asin', 'reviewYear'], as_index=False)['overall'].mean()

In [18]:
# preview the first five aggregated rows
df_agg.head(n=5)

Unnamed: 0,asin,reviewYear,overall
0,000100039X,2000,4.727273
1,000100039X,2001,5.0
2,000100039X,2002,4.5
3,000100039X,2003,5.0
4,000100039X,2004,4.7


In [19]:
# load the related-products metadata export
metadata_related = pd.read_csv(filepath_or_buffer="../exports/metadata_related_full.csv")

In [20]:
# attach the yearly average rating of each product to its related-products rows
metadata_join = metadata_related.merge(df_agg, how='inner', on=['asin'])

In [21]:
# free up the 'asin' / 'overall' names for the next join: the product's own
# columns get a '_1' suffix and 'bought_together' becomes the new 'asin' key
metadata_join.rename(
    columns={
        'asin': 'asin_1',
        'overall': 'overall_1',
        'bought_together': 'asin',
    },
    inplace=True,
)

In [22]:
# second join: match the bought-together product's average rating for the same year
metadata_join2 = metadata_join.merge(df_agg, how='inner', on=['asin', 'reviewYear'])

In [23]:
# give the columns of both sides of the pairing explicit names, then save
metadata_join2.rename(
    columns={
        'asin_1': 'asin_prod',
        'asin': 'asin_bought_together',
        'overall_1': 'overall_prod',
        'overall': 'overall_bought_together',
    },
    inplace=True,
)
# index=False: the positional index carries no information, and writing it adds
# an extra unnamed column (read back as "Unnamed: 0") to the exported file.
metadata_join2.to_csv("../exports/final/metadata_related.csv", index=False)
# drop the names of the large frames; nothing below uses them
del concatenated_df, df_agg, metadata_join, metadata_join2, metadata_related