In [21]:
# Import libraries
import datetime
import io
import os
from urllib.parse import quote

import dask.dataframe as dd
import pandas as pd
import psycopg2
from dask.distributed import Client
from dotenv import load_dotenv
from googletrans import Translator
from sqlalchemy import create_engine

In [2]:
# Read credentials
# load_dotenv() loads the entries of a .env file into the process environment,
# from which the five connection settings below are read.
# NOTE(review): python-dotenv does not override variables that are already set by
# default, so generic names such as `user`/`host` could be shadowed by OS-level
# values - confirm on the target machine.
load_dotenv()

user = os.getenv('user')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')
db = os.getenv('db')

# os.getenv returns None for an unset variable; without this check the None values
# would be formatted into the connection URL as the literal text "None".
# Only the names of the missing settings are reported, never their values.
_missing = [
    name
    for name, value in (
        ('user', user),
        ('password', password),
        ('host', host),
        ('port', port),
        ('db', db),
    )
    if value is None
]
if _missing:
    raise RuntimeError(
        f"Missing database settings in the environment/.env file: {', '.join(_missing)}"
    )

In [3]:
# Connect to PostgreSQL database
# The user name and password are percent-encoded (safe='' also encodes '/') so that
# reserved characters such as '@', ':' or '/' inside them cannot break the URL.
# Credentials come from the environment only; no literal fallback is kept here.
engine = create_engine(
    f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{db}"
)

In [4]:
# Import CSVs
# Directory that holds the Olist CSV files. It can be overridden through the
# DATA_DIR environment variable (e.g. in .env); the default is the location
# this notebook was originally run from.
DATA_DIR = os.getenv('DATA_DIR', "C:/Users/Clinton/Desktop/Final Project")

# Timestamp columns are parsed while reading so they load as datetime64 rather than text.
df_reviews = pd.read_csv(
    os.path.join(DATA_DIR, "olist_order_reviews_dataset.csv"),
    parse_dates=["review_creation_date", "review_answer_timestamp"],
)
df_orders = pd.read_csv(
    os.path.join(DATA_DIR, "olist_orders_dataset.csv"),
    parse_dates=[
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
)
df_items = pd.read_csv(
    os.path.join(DATA_DIR, "olist_order_items_dataset.csv"),
    parse_dates=['shipping_limit_date'],
)
df_products = pd.read_csv(os.path.join(DATA_DIR, "olist_products_dataset.csv"))
df_payments = pd.read_csv(os.path.join(DATA_DIR, "olist_order_payments_dataset.csv"))
df_customers = pd.read_csv(os.path.join(DATA_DIR, "olist_customers_dataset.csv"))

In [5]:
# Inspect first entries: print the leading rows of the reviews table to see
# which columns it has and what typical values look like.
print(df_reviews.head())

                          review_id                          order_id  \
0  7bc2406110b926393aa56f80a40eba40  73fc7af87114b39712e6da79b0a377eb   
1  80e641a11e56f04c1ad469d5645fdfde  a548910a1c6147796b98fdf73dbeba33   
2  228ce5500dc1d8e020d8d1322874b6f0  f9e4b658b201a9f2ecdecbb34bed034b   
3  e64fb393e7b32834bb789ff8bb30750e  658677c97b385a9be170737859d3511b   
4  f7c4243c7fe1938f181bec41a392bdeb  8e6bfb81e283fa7e4f11123a3fb894f1   

   review_score review_comment_title  \
0             4                  NaN   
1             5                  NaN   
2             5                  NaN   
3             5                  NaN   
4             5                  NaN   

                              review_comment_message review_creation_date  \
0                                                NaN           2018-01-18   
1                                                NaN           2018-03-10   
2                                                NaN           2018-02-17   
3           

In [6]:
# Inspect data: column dtypes and non-null counts.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   review_id                100000 non-null  object        
 1   order_id                 100000 non-null  object        
 2   review_score             100000 non-null  int64         
 3   review_comment_title     11715 non-null   object        
 4   review_comment_message   41753 non-null   object        
 5   review_creation_date     100000 non-null  datetime64[ns]
 6   review_answer_timestamp  100000 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB
None


In [7]:
# Check for missing data: number of NaN/NaT values in each column.
print(df_reviews.isna().sum())

review_id                      0
order_id                       0
review_score                   0
review_comment_title       88285
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64


In [8]:
# Check uniqueness: count the distinct values in each column.
# Each review and each order is expected to appear exactly once, so a distinct
# count of review_id or order_id below the row count means there are duplicates.
print(df_reviews.nunique())

review_id                  99173
order_id                   99441
review_score                   5
review_comment_title        4600
review_comment_message     36921
review_creation_date         637
review_answer_timestamp    99010
dtype: int64


In [9]:
# Function to identify duplicates in df subsetted on a list, sorting by the first column in the list.
def id_dup(df: "pd.DataFrame", cols: "list[str] | str") -> "pd.DataFrame":
    """Return every row of ``df`` whose values in ``cols`` occur more than once.

    All members of each duplicate group are returned (``keep=False``), not just
    the repeats, and the result is sorted by the first column in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search; it is not modified.
    cols : list of str, or str
        Column name(s) that together define a duplicate. A single column may be
        given as a plain string.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows (original index preserved); empty if there are none.

    Raises
    ------
    ValueError
        If ``cols`` is empty.
    """
    # A bare string would otherwise be indexed per character by cols[0] below.
    cols = [cols] if isinstance(cols, str) else list(cols)
    if not cols:
        raise ValueError("cols must name at least one column")

    mask_duplicates = df.duplicated(subset=cols, keep=False)
    return df[mask_duplicates].sort_values(cols[0])

In [10]:
# Collect the reviews whose order_id occurs more than once. Each duplicated
# order_id corresponds to a distinct review_id with different timestamps.
# We keep the later review as it's likely the user updated the review
# (different comments/ratings).
df_reviews_dup = id_dup(df_reviews, cols=['order_id'])

print(df_reviews_dup)
# Spot-check one duplicated order.
print(df_reviews_dup.loc[df_reviews_dup['order_id'].eq('df56136b8031ecd28e200bb18e6ddb2e')])

                              review_id                          order_id  \
25802  89a02c45c340aeeb1354a24e7d4b2c1e  0035246a40f520710769010f752e7507   
22585  2a74b0559eb58fc1ff842ecc999594cb  0035246a40f520710769010f752e7507   
22946  ab30810c29da5da8045216f0f62652a2  013056cfe49763c6f66bda03396c5ee3   
69191  73413b847f63e02bc752b364f6d05ee9  013056cfe49763c6f66bda03396c5ee3   
83893  d8e8c42271c8fb67b9dad95d98c8ff80  0176a6846bcb3b0d3aa3116a9a768597   
...                                 ...                               ...   
27663  5e78482ee783451be6026e5cf0c72de1  ff763b73e473d03c321bcd5a053316e8   
41665  39de8ad3a1a494fc68cc2d5382f052f4  ff850ba359507b996e8b2fbb26df8d03   
18916  80f25f32c00540d49d57796fb6658535  ff850ba359507b996e8b2fbb26df8d03   
54388  5476dd0eaee7c4e2725cafb011aa758c  ffaabba06c9d293a3c614e0515ddbabc   
92957  870d856a4873d3a67252b0c51d79b950  ffaabba06c9d293a3c614e0515ddbabc   

       review_score review_comment_title  \
25802             5            

In [11]:
# Look for reviews that share both order_id and review_answer_timestamp.
# None are found, so sorting on the timestamp always separates the reviews
# of a duplicated order_id.
df_reviews_dup = id_dup(
    df_reviews,
    cols=['order_id', 'review_answer_timestamp'],
)

print(df_reviews_dup)

Empty DataFrame
Columns: [review_id, order_id, review_score, review_comment_title, review_comment_message, review_creation_date, review_answer_timestamp]
Index: []


In [12]:
# Deduplicate on order_id, retaining the later review: after an ascending sort
# on review_answer_timestamp, keep='last' selects the latest row of each order.
df_reviews = (
    df_reviews
    .sort_values('review_answer_timestamp')
    .drop_duplicates(subset='order_id', keep='last')
)
print(df_reviews)
print(df_reviews.nunique())

                              review_id                          order_id  \
37828  6916ca4502d6d3bfd39818759d55d536  bfbd0f9bdef84302105ad712db648a6c   
5538   49f695dffa457eaba90d388a5c37e942  e5215415bb6f76fe3b7cb68103a0d1c0   
60918  743d98b1a4782f0646898fc915ef002a  e2144124f98f3bf46939bc5183104041   
28280  53752edb26544dd41c1209f582c9c589  b8b9d7046c083150cb5360b83a8ebb51   
41350  b2d5d8db2a841d27a72e4c06c6212368  9aa3197e4887919fde0307fc23601d7a   
...                                 ...                               ...   
47274  6c50d16eb583d5db7e841b77e89b7045  0b223d92c27432930dfe407c6aea3041   
14794  abf08328d2f1f0e8a33eee9f52f502f6  99e4a1f9377bf28f08f54a2eecccbf0d   
93532  6c883909cf53725a13caf477a70f00a4  7ce4e38f4eadd993bb5b2e60bb7f7bec   
50008  728d5cfdc7283cfd0a8061d7581a19f8  7e8072dc0f35ebb0c1b2a4743e0f179a   
81228  13548d0f62b03a0d3cb6efc00877fa13  30a2f24dd6770c91faa6b3481319204b   

       review_score review_comment_title  \
37828             1            

In [13]:
# Collect the reviews whose review_id occurs more than once, restricted to those
# that carry a comment message. Each duplicate review corresponds to a different
# order_id while all other data stay the same.
# Duplicate review_ids are found to be due to multiple order_ids/customer_ids being
# generated for multiple products, or copies of the same product, in the same order.
df_reviews_dup = (
    id_dup(df_reviews, cols=['review_id'])
    .dropna(subset=['review_comment_message'])
)

print(df_reviews_dup)
print(df_reviews_dup.nunique())

                              review_id                          order_id  \
47045  00130cbe1f9d422698c812ed8ded1919  dfcdfc43867d1c1381bfaf62d6b9c195   
30062  00130cbe1f9d422698c812ed8ded1919  04a28263e085d399c97ae49e0b477efa   
93607  0174caf0ee5964646040cd94e15ac95e  f93a732712407c02dce5dd5088d0f47b   
57727  0174caf0ee5964646040cd94e15ac95e  74db91e33b4e1fd865356c89a61abf1f   
96833  0254bd905dc677a6078990aad3331a36  331b367bdd766f3d1cf518777317b5d9   
...                                 ...                               ...   
62661  f6a856dbc72d2a8bb09d860da9215545  2b0e07b3ff6d3a2db9405022edc865c2   
29415  fbb7c6f69326ad5bf986c099bedefdb4  4608a27e9e60c05b054631def491e7af   
68173  fbb7c6f69326ad5bf986c099bedefdb4  d351ad232c9e018edafa9f34f5d03e63   
31349  fe5c833752953fed3209646f1f63b53c  4863e15fa53273cc7219c58f5ffda4fb   
40677  fe5c833752953fed3209646f1f63b53c  d3775e436e60258e62e678a0f68a0f8d   

       review_score review_comment_title  \
47045             1            

In [None]:
# Write DataFrame to SQL: store the reviews frame in the `reviews` table of the
# database behind `engine`. if_exists='replace' overwrites an existing table of
# that name, and index=False leaves the DataFrame index out of the table.
df_reviews.to_sql('reviews', con=engine, if_exists='replace', index=False)

In [None]:
# df_reviews = pd.read_csv("C:/Users/Clinton/Desktop/Final Project/olist_order_reviews_dataset.csv", parse_dates = ["review_creation_date", "review_answer_timestamp"])

In [None]:
# # Merge all
# df_merge = pd.merge(df_reviews, df_orders, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_items, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_products, on='product_id', how='inner')
# df_merge = pd.merge(df_merge, df_payments, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_customers, on='customer_id', how='inner')
# df_merge = df_merge.drop_duplicates(subset=['order_id', 'product_id', 'review_id'])
# df_merge = df_merge.dropna(subset=['review_id', 'order_id', 'product_id'])

# # Identify duplicates.
# df_merge_dup = id_dup(df_merge, ['order_id', 'review_id'])

# print(df_merge_dup['order_id'].value_counts())
# print(df_merge_dup['review_id'].value_counts())

In [None]:
# # One order_id to two review_ids, multiple product_ids. 
# # Will be better to drop reviews as review cannot be assigned to product.
# # If only one product, keep later timestamp.
# print(df_merge.loc[df_merge['order_id']=='5a3b1c29a49756e75f1ef513383c0c12', ['product_id', 'review_id']])
# print(df_merge[df_merge['order_id']=='5a3b1c29a49756e75f1ef513383c0c12'])

In [None]:
# # Same order/review_ids for eight product_ids
# print(df_merge.loc[df_merge['review_id']=='b8017a9ca639f71a9a4a745985f4a729', 'product_id'])
# print(df_merge[df_merge['review_id']=='b8017a9ca639f71a9a4a745985f4a729'])

In [None]:
# # Merge all
# df_merge = pd.merge(df_reviews, df_orders, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_items, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_products, on='product_id', how='inner')
# df_merge = pd.merge(df_merge, df_payments, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_customers, on='customer_id', how='inner')
# df_merge = df_merge.drop_duplicates(subset=['order_id', 'product_id', 'review_id'])
# df_merge = df_merge.dropna(subset=['review_id', 'order_id', 'product_id'])

# # Identify duplicates. Identified multiple order_ids/customer_ids from the same transaction
# # belonging to single customer_unique_id as the source of duplicate review_ids
# df_merge_dup = id_dup(df_merge, ['review_id','product_id'])
# #df_merge_dup = df_merge[mask_duplicates].sort_values('review_id').drop_duplicates(subset=['customer_unique_id'])

# print(df_merge_dup['order_id'].value_counts().head())
# print(df_merge_dup['review_id'].value_counts().head())

In [None]:
# Summarise the merged frame. `df_merge` is only built by the "Merge all" cells,
# which are currently commented out, so on a fresh kernel this cell used to stop
# Run All with a NameError. Skip the summary when the frame does not exist.
if 'df_merge' in globals():
    df_merge.info()
else:
    print("df_merge is not defined - uncomment and run a 'Merge all' cell first.")

In [None]:
# df_merge.groupby('order_item_id')['order_id'].value_counts()

In [None]:
# # One review_id to multiple order_id for one product_id. Each order_id is associated with its own customer_id
# # but all customer_ids belong to the same customer_unique_id. This is due to multiple orders of the same product.
# print(df_merge.loc[df_merge['review_id']=='4219a80ab469e3fc9901437b73da3f75', 'product_id'])
# print(df_merge[df_merge['review_id']=='4219a80ab469e3fc9901437b73da3f75'])

In [None]:
# # Merge all
# df_merge = pd.merge(df_reviews, df_orders, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_items, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_products, on='product_id', how='inner')
# df_merge = pd.merge(df_merge, df_payments, on='order_id', how='inner')
# df_merge = pd.merge(df_merge, df_customers, on='customer_id', how='inner')
# df_merge = df_merge.drop_duplicates(subset=['order_id', 'product_id', 'review_id'])
# df_merge = df_merge.dropna(subset=['review_id', 'order_id', 'product_id'])

# # Identify duplicates. Identified multiple order_ids/customer_ids from the same transaction
# # belonging to single customer_unique_id as the source of duplicate review_ids
# mask_duplicates = df_merge.duplicated(subset=['review_id'], keep=False)
# df_merge_dup = df_merge[mask_duplicates].sort_values('review_id').drop_duplicates(subset=['customer_unique_id'])

# print(df_merge_dup.nunique())
# print(df_merge_dup['order_id'].value_counts().head())
# print(df_merge_dup['review_id'].value_counts().head())