Dataset link: https://amazon-reviews-2023.github.io/

In [None]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm

# Register tqdm with pandas so that Series gain `progress_apply`,
# which the filtering steps further down use to show a progress bar.
tqdm.pandas()

# Path to the raw reviews file; it is parsed line by line as JSON by read_jsonl.
file = 'data/Sports_and_Outdoors.jsonl'

def read_jsonl(file_path, nrows=None):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the file to read.
        nrows: Maximum number of lines to consume from the start of the
            file. ``None`` (the default) reads the whole file; ``0`` reads
            nothing and yields an empty DataFrame.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line_no, line in enumerate(handle):
            # Compare against None explicitly: a plain truthiness test would
            # treat nrows=0 as "no limit" and read the entire file.
            if nrows is not None and line_no >= nrows:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Read a sample of the dataset
df = read_jsonl(file, nrows=100000)

# Keep only the columns used by the rest of the notebook. The explicit
# .copy() makes `df` an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
relevant_columns = ['timestamp', 'rating', 'helpful_vote', 'title', 'text', 'asin', 'verified_purchase', 'user_id']
df = df[relevant_columns].copy()

# Convert the timestamp (interpreted as milliseconds since the epoch) to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Preview the first rows of the sample
print(df.head())

# Pretty print the first review as a column -> value mapping
first_review = df.iloc[0].to_dict()
print("\nFirst review:")
pprint(first_review)


In [None]:
from langdetect import detect

# Language Detection
def detect_language(review):
    """Return the language code that ``detect`` assigns to a review.

    Args:
        review: The review text to classify.

    Returns:
        Whatever ``detect(review)`` returns, or the string ``'unknown'``
        if detection raises an exception.
    """
    try:
        return detect(review)
    except Exception:
        # Deliberately best-effort: any detection failure maps to 'unknown'.
        # Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running
        # apply over the whole dataset can still be interrupted.
        return 'unknown'

# Apply language detection
df['language'] = df['text'].progress_apply(detect_language)
# Keep only reviews detected as English. Copy the filtered slice so later
# column assignments on `df` do not raise SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()

In [None]:
import nltk
from nltk.corpus import words
import re

# Gibberish Detection
nltk.download('words')
# Lower-case the vocabulary: gibberish() lower-cases the review before the
# lookup, so capitalised corpus entries could otherwise never match.
word_set = {word.lower() for word in words.words()}

def gibberish(review, vocabulary=None, threshold=0.5):
    """Decide whether a review consists mostly of unrecognised words.

    The review is lower-cased and split into ``\\w+`` tokens; the share of
    tokens that are missing from the vocabulary is compared to ``threshold``.

    Args:
        review: The review text to inspect.
        vocabulary: Container of known (lower-case) words. Defaults to the
            module-level ``word_set``.
        threshold: Fraction of unknown tokens above which the review is
            considered gibberish. Defaults to 0.5.

    Returns:
        ``True`` if the review has no tokens at all or if the fraction of
        unknown tokens is strictly greater than ``threshold``; otherwise
        ``False``.
    """
    if vocabulary is None:
        vocabulary = word_set
    tokens = re.findall(r'\b\w+\b', review.lower())
    # A review without any word tokens (empty, punctuation only) is
    # treated as gibberish.
    if not tokens:
        return True
    unknown_count = sum(1 for token in tokens if token not in vocabulary)
    return unknown_count / len(tokens) > threshold

# Apply gibberish detection
df['gibberish'] = df['text'].progress_apply(gibberish)
# Keep the rows that were NOT flagged. Copy the filtered slice so later
# column assignments on `df` do not raise SettingWithCopyWarning.
df = df[~df['gibberish']].copy()

In [None]:
from better_profanity import profanity

# Profanity filtering with better-profanity
profanity.load_censor_words()

# NOTE: despite its name, the 'profanity' column is True for reviews in
# which contains_profanity() found nothing, i.e. the rows we want to keep.
df['profanity'] = df['text'].progress_apply(
    lambda review_text: not profanity.contains_profanity(review_text)
)

# Keep the clean rows and discard the helper columns in one step
df = df[df['profanity']].drop(columns=['gibberish', 'profanity'])

In [None]:
# Take the first remaining review by position. df['text'][0] is a
# label-based lookup and raises KeyError when the row labelled 0 was
# removed by one of the filters above.
text = df['text'].iloc[0]
censored_text = profanity.censor(text)
print(censored_text)

In [None]:
# Write the filtered reviews to CSV, leaving out the DataFrame index
cleaned_file_path = 'data/Cleaned_Sports_and_Outdoors.csv'
df.to_csv(path_or_buf=cleaned_file_path, index=False)

In [None]:
# Preview the top rows of the cleaned DataFrame
print(df.head(n=5))

# Show the first remaining review as a column -> value mapping
first_review = df.iloc[0].to_dict()
print("\nFirst review:")
pprint(first_review)