# Cleaning Review data

This notebook processes review text to prepare text modality

In [74]:
import pandas as pd
import re

In [75]:
REVIEW_FILE_PATH = "../../data/review_item.csv"
TRAINING_FILE_PATH = "../../data/train_ratings_seen.csv"

OUTPUT_FILE_PATH = "../../data/text-modality.txt"

In [76]:
review_df = pd.read_csv(REVIEW_FILE_PATH)

review_df.head()

Unnamed: 0,review_text,item_id
0,,4026015
1,,2057563
2,,1374478
3,"\n""Smoky Tobacco \n""\n",1135067
4,"\n""Easy to drink Amarone.\nNot really bold for...",7103


In [77]:
def process_text(text: str):
    processed = text.strip()
    processed = re.sub('["]', '', processed)
    processed = re.sub('[\s]', ' ', processed)
    return processed.lower()

processed_df = review_df.dropna()
processed_df["processed_text"] = processed_df["review_text"].apply(lambda x: process_text(x))
processed_df = processed_df[processed_df["processed_text"] != ""]
processed_df = processed_df[~processed_df["processed_text"].str.contains("^[^A-Za-z0-9\s,.?!@#$%^&*]+$", na=False)]
processed_df = processed_df[["processed_text", "item_id"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [78]:
print("Before processing: ", len(review_df))
print("After processing: ", len(processed_df))

Before processing:  169342
After processing:  72454


In [79]:
processed_df

Unnamed: 0,processed_text,item_id
3,smoky tobacco,1135067
4,easy to drink amarone. not really bold for ama...,7103
6,chocolate. tobacco. cherry.,5927106
8,prune...,18739
11,light simple amarone.,77149
...,...,...
169317,not bad at all,6920887
169318,good,1456365
169319,ok,1910521
169320,good,1136728


In [80]:
training_df = pd.read_csv(TRAINING_FILE_PATH)

In [81]:
dic = {}

for _, r in training_df.iterrows():
    item_id = r["item_id"]
    dic[item_id] = "this is wine."

for _, r in processed_df.iterrows():
    item_id = r["item_id"]
    # Skip non-training items
    if item_id not in dic:
        continue
    dic[item_id] = dic[item_id] + " " + r["processed_text"]

In [82]:
with open(OUTPUT_FILE_PATH, "w") as f:
    for item_id, review in dic.items():
        f.write(f"{review.strip()}::{int(item_id)}\n")

In [83]:
print("Sanity check...")
print("Number of unique items: ", len(training_df["item_id"].unique()))
print("Number of item reviews: ", len(dic))

Sanity check...
Number of unique items:  39520
Number of item reviews:  39520
