# Categorizando reviews

In [1]:
# Imports

import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import unicodedata

In [2]:
def load_data(file_path):
    return pd.read_parquet(file_path)

file_path = './olist_order_reviews_dataset.parquet'
df = load_data(file_path)

def skimming_data(data):
    skimmed_data = pd.DataFrame({
        'feature': data.columns.values,
        'data_type': data.dtypes.values,
        'null_value(%)': data.isna().mean().values * 100,
        'neg_value(%)': [len(data[col][data[col] < 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],
        '0_value(%)': [len(data[col][data[col] == 0]) / len(data) * 100 if col in data.select_dtypes(include=[np.number]).columns else 0 for col in data.columns],
        'duplicate': data.duplicated().sum(),
        'n_unique': data.nunique().values,
    })

    return skimmed_data.round(3)

def remove_nulls_from_column(df, column_name):
    if column_name not in df.columns:
        raise ValueError(f"A coluna '{column_name}' não existe no DataFrame.")
    df_cleaned = df.dropna(subset=[column_name])

    return df_cleaned

In [3]:
#Main
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
df_resumo = skimming_data(df)
df_limpo = df.drop(columns=['review_comment_title'])
df_limpo = remove_nulls_from_column(df_limpo,'review_comment_message')

display(df_limpo)
display(skimming_data(df_limpo))

Unnamed: 0,review_id,order_id,review_score,review_comment_message,review_creation_date,review_answer_timestamp
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47
12,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",2018-02-16 00:00:00,2018-02-20 10:52:22
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01
...,...,...,...,...,...,...
99205,98fffa80dc9acbde7388bef1600f3b15,d398e9c82363c12527f71801bf0e6100,4,para este produto recebi de acordo com a compr...,2017-11-29 00:00:00,2017-11-30 15:52:51
99208,df5fae90e85354241d5d64a8955b2b09,509b86c65fe4e2ad5b96408cfef9755e,5,Entregou dentro do prazo. O produto chegou em ...,2018-02-07 00:00:00,2018-02-19 19:47:23
99215,a709d176f59bc3af77f4149c96bae357,d5cb12269711bd1eaf7eed8fd32a7c95,3,"O produto não foi enviado com NF, não existe v...",2018-05-19 00:00:00,2018-05-20 21:51:06
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,"Excelente mochila, entrega super rápida. Super...",2018-03-22 00:00:00,2018-03-23 09:10:43


Unnamed: 0,feature,data_type,null_value(%),neg_value(%),0_value(%),duplicate,n_unique
0,review_id,object,0.0,0.0,0.0,0,40668
1,order_id,object,0.0,0.0,0.0,0,40836
2,review_score,int64,0.0,0.0,0.0,0,5
3,review_comment_message,object,0.0,0.0,0.0,0,36159
4,review_creation_date,object,0.0,0.0,0.0,0,622
5,review_answer_timestamp,object,0.0,0.0,0.0,0,40642


In [4]:

def preprocess_text(text):
  text = text.lower()
  text = re.sub(r'[.,;:]', '', text)
  text = ''.join([c for c in text if not unicodedata.category(c).startswith('P')])
  return text

df_limpo['review_comment_message'] = df_limpo['review_comment_message'].apply(preprocess_text)
display(df_limpo)


Unnamed: 0,review_id,order_id,review_score,review_comment_message,review_creation_date,review_answer_timestamp
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,recebi bem antes do prazo estipulado,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,parabéns lojas lannister adorei comprar pela i...,2018-03-01 00:00:00,2018-03-02 10:26:53
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,aparelho eficiente no site a marca do aparelho...,2018-05-22 00:00:00,2018-05-23 16:45:47
12,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,mas um pouco travandopelo valor ta boa\r\n,2018-02-16 00:00:00,2018-02-20 10:52:22
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,vendedor confiável produto ok e entrega antes ...,2018-05-23 00:00:00,2018-05-24 03:00:01
...,...,...,...,...,...,...
99205,98fffa80dc9acbde7388bef1600f3b15,d398e9c82363c12527f71801bf0e6100,4,para este produto recebi de acordo com a compr...,2017-11-29 00:00:00,2017-11-30 15:52:51
99208,df5fae90e85354241d5d64a8955b2b09,509b86c65fe4e2ad5b96408cfef9755e,5,entregou dentro do prazo o produto chegou em c...,2018-02-07 00:00:00,2018-02-19 19:47:23
99215,a709d176f59bc3af77f4149c96bae357,d5cb12269711bd1eaf7eed8fd32a7c95,3,o produto não foi enviado com nf não existe ve...,2018-05-19 00:00:00,2018-05-20 21:51:06
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,excelente mochila entrega super rápida super r...,2018-03-22 00:00:00,2018-03-23 09:10:43


