In [1]:
"""
From this dataset we found that one order might have multiple comment. 
We need to get the earlist comment in every order which might better 
represent the user experience from the shopping website compared with 
the latest one. Latest comment might focus more on the product rather 
than the website platform. As described in the requirements, We aim 
to improve the positive reviews rate of the website platform. 
Therefore, we choose to keep the earlist review of the order.
"""

import pandas as pd
import numpy as np

reviews_dataset = pd.read_csv('olist_order_reviews_dataset.csv')
reviews_dataset.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,97ca439bc427b48bc1cd7177abe71365,00010242fe8c5a6d1ba2dd792cb16214,5,,"Perfeito, produto entregue antes do combinado.",21/09/2017 00:00,22/09/2017 10:57
1,7b07bacd811c4117b742569b04ce3580,00018f77f2f0320c557190d7a144bdd3,4,,,13/05/2017 00:00,15/05/2017 11:34
2,0c5b33dea94867d1ac402749e5438e8b,000229ec398224ef6ca0657da4fc703e,5,,Chegou antes do prazo previsto e o produto sur...,23/01/2018 00:00,23/01/2018 16:06
3,f4028d019cb58564807486a6aaf33817,00024acbcdf0a6daa1e931b038114c75,4,,,15/08/2018 00:00,15/08/2018 16:39
4,940144190dcba6351888cafa43f3a3a5,00042b26cf59d7ce69dfabb4e55b4fd9,5,,Gostei pois veio no prazo determinado .,02/03/2017 00:00,03/03/2017 10:54


In [2]:
len(reviews_dataset['order_id'])

89999

In [3]:
reviews_dataset= reviews_dataset.drop(['review_id','review_comment_title','review_comment_message','review_creation_date'],axis=1)
reviews_dataset.head()

Unnamed: 0,order_id,review_score,review_answer_timestamp
0,00010242fe8c5a6d1ba2dd792cb16214,5,22/09/2017 10:57
1,00018f77f2f0320c557190d7a144bdd3,4,15/05/2017 11:34
2,000229ec398224ef6ca0657da4fc703e,5,23/01/2018 16:06
3,00024acbcdf0a6daa1e931b038114c75,4,15/08/2018 16:39
4,00042b26cf59d7ce69dfabb4e55b4fd9,5,03/03/2017 10:54


In [4]:
#order_id is de-duplicated; only retain 'Min' time
reviews_duplication = reviews_dataset.groupby(['order_id'])['review_answer_timestamp'].min().reset_index()
reviews_duplication 

Unnamed: 0,order_id,review_answer_timestamp
0,00010242fe8c5a6d1ba2dd792cb16214,22/09/2017 10:57
1,00018f77f2f0320c557190d7a144bdd3,15/05/2017 11:34
2,000229ec398224ef6ca0657da4fc703e,23/01/2018 16:06
3,00024acbcdf0a6daa1e931b038114c75,15/08/2018 16:39
4,00042b26cf59d7ce69dfabb4e55b4fd9,03/03/2017 10:54
...,...,...
89547,fffc94f6ce00a00581880bf54a75a037,14/05/2018 12:53
89548,fffcd46ef2263f404302a634eb57f7eb,25/07/2018 09:25
89549,fffce4705a9662cd70adb13d4a31832d,29/10/2017 21:33
89550,fffe18544ffabc95dfada21779c9644f,18/08/2017 12:24


In [5]:
#Determine if the earliest comment is ‘Min’ time
reviews_duplication[reviews_duplication.order_id == 'df56136b8031ecd28e200bb18e6ddb2e']

Unnamed: 0,order_id,review_answer_timestamp
78235,df56136b8031ecd28e200bb18e6ddb2e,09/02/2017 09:07


In [6]:
#Determine if the earliest comment is ‘Min’ time
reviews_dataset[reviews_dataset.order_id == 'df56136b8031ecd28e200bb18e6ddb2e']

Unnamed: 0,order_id,review_score,review_answer_timestamp
78630,df56136b8031ecd28e200bb18e6ddb2e,5,10/02/2017 10:46
78631,df56136b8031ecd28e200bb18e6ddb2e,5,09/02/2017 09:07
78632,df56136b8031ecd28e200bb18e6ddb2e,5,14/02/2017 13:58


In [7]:
reviews_duplication

Unnamed: 0,order_id,review_answer_timestamp
0,00010242fe8c5a6d1ba2dd792cb16214,22/09/2017 10:57
1,00018f77f2f0320c557190d7a144bdd3,15/05/2017 11:34
2,000229ec398224ef6ca0657da4fc703e,23/01/2018 16:06
3,00024acbcdf0a6daa1e931b038114c75,15/08/2018 16:39
4,00042b26cf59d7ce69dfabb4e55b4fd9,03/03/2017 10:54
...,...,...
89547,fffc94f6ce00a00581880bf54a75a037,14/05/2018 12:53
89548,fffcd46ef2263f404302a634eb57f7eb,25/07/2018 09:25
89549,fffce4705a9662cd70adb13d4a31832d,29/10/2017 21:33
89550,fffe18544ffabc95dfada21779c9644f,18/08/2017 12:24


In [8]:
#Merge the order_id de-duplicated table with the table with score
cleaned_reviews = pd.merge(reviews_duplication,reviews_dataset)
cleaned_reviews

Unnamed: 0,order_id,review_answer_timestamp,review_score
0,00010242fe8c5a6d1ba2dd792cb16214,22/09/2017 10:57,5
1,00018f77f2f0320c557190d7a144bdd3,15/05/2017 11:34,4
2,000229ec398224ef6ca0657da4fc703e,23/01/2018 16:06,5
3,00024acbcdf0a6daa1e931b038114c75,15/08/2018 16:39,4
4,00042b26cf59d7ce69dfabb4e55b4fd9,03/03/2017 10:54,5
...,...,...,...
89578,fffc94f6ce00a00581880bf54a75a037,14/05/2018 12:53,5
89579,fffcd46ef2263f404302a634eb57f7eb,25/07/2018 09:25,5
89580,fffce4705a9662cd70adb13d4a31832d,29/10/2017 21:33,5
89581,fffe18544ffabc95dfada21779c9644f,18/08/2017 12:24,5


In [9]:
cleaned_reviews.to_csv('/Users/hongrunqiao/desktop/minfinal_reviews_dataset.csv',index=False)