Dataset link: https://amazon-reviews-2023.github.io/

In [33]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm

# Use tqdm to track progress
tqdm.pandas()

file = 'data/Sports_and_Outdoors.jsonl'

def read_jsonl(file_path, nrows=None):
    data = []
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            if nrows and i >= nrows:
                break
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Read a sample of the dataset
df = read_jsonl(file, nrows=100000)

# Select and rename relevant columns for better readability
relevant_columns = ['timestamp', 'rating', 'helpful_vote', 'title', 'text', 'asin', 'verified_purchase', 'user_id']
df = df[relevant_columns]

# Convert timestamp to readable date format
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Display the DataFrame with enhanced readability
print(df.head())

# Pretty print the first review for better visualization
first_review = df.iloc[0].to_dict()
print("\nFirst review:")
pprint(first_review)


                timestamp  rating  helpful_vote               title  \
0 2023-02-25 10:30:53.520     5.0             8        Crazy comfy!   
1 2018-04-07 09:36:11.676     5.0             0          Excellent!   
2 2022-05-26 01:01:59.105     5.0             0    Best saddle pads   
3 2021-07-26 20:21:51.189     5.0             0  Perfect repair kit   
4 2021-04-07 21:43:31.976     5.0             0         Works great   

                                                text        asin  \
0  Not gonna lie- they are not much to look at. L...  B07F3BDT8T   
1                                 I love it. Pretty!  B00NXQLFQQ   
2  Huge fan of B Vertigo and this dressage pad do...  B08SVPR266   
3  I have a great Weaver halter. Recently, the Ch...  B00IET8S80   
4  This was great for a slightly too-short girth!...  B00DV0MKUY   

   verified_purchase                       user_id  
0               True  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  
1               True  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  
2    

In [4]:
import json
from pprint import pprint

file = 'data/meta_Sports_and_Outdoors.jsonl'
count = 0
limit = 1

with open(file, 'r') as fp:
    for line in fp:
        if count >= limit:
            break
        pprint(json.loads(line.strip()))
        count += 1


{'average_rating': 4.5,
 'bought_together': None,
 'categories': ['Sports & Outdoors',
                'Sports',
                'Skates, Skateboards & Scooters',
                'Skateboarding',
                'Skateboard Parts',
                'Wheels'],
 'description': ['All Zombie wheels are made in the USA. Zombie wheels feature '
                 'anodized aluminum hubs for maximum durability and precise '
                 'feel while maintaining rock solid stability. This allows our '
                 'unique urethane compounds to deliver all your power to the '
                 'floor. Choose the Zombie combination that fits your skating '
                 'style and surface. Zombie Aluminum Core – Designed in house '
                 'and manufactured using state of the art machining '
                 'technology. What makes this different from other aluminum '
                 'cores? The Zombie core is machined from a solid billet of '
                 'aluminum, using th

In [34]:
from langdetect import detect

# Language Detection
def detect_language(review):
    try:
        return detect(review)
    except:
        return 'unknown'

# Apply language detection
df['language'] = df['text'].progress_apply(detect_language)
df = df[df['language'] == 'en']

In [37]:
import nltk
from nltk.corpus import words
import re

# Gibberish Detection
nltk.download('words')
word_set = set(words.words())

def gibberish(review):
    words_in_review = re.findall(r'\b\w+\b', review.lower())
    if len(words_in_review) == 0:
        return True
    gibberish_score = sum(1 for word in words_in_review if word not in word_set) / len(words_in_review)
    return gibberish_score > 0.5

# Apply gibberish detection
df['gibberish'] = df['text'].progress_apply(gibberish)
df = df[df['gibberish'] == False]

[nltk_data] Downloading package words to
[nltk_data]     /home/wlodzimierrr/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [46]:
from better_profanity import profanity

# Profanity Detection using better-profanity
profanity.load_censor_words()

# Apply profanity detection
df['profanity'] = df['text'].progress_apply(lambda x: not profanity.contains_profanity(x))
df = df[df['profanity']]

# Drop intermediate columns used for preprocessing
df = df.drop(columns=['gibberish', 'profanity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['profanity'] = list(executor.map(check_profanity, df['text']))


In [48]:
text = df['text'][0]
censored_text = profanity.censor(text)
print(censored_text)

Not gonna lie- they are not much to look at. Lol. Luckily I’m one of those ppl that values things for function over looks & these function well so far. They are seriously one of the most comfortable pairs of socks I’ve owned in 5 decades.  I have not tried to wash them yet, so fingers crossed on that rn.  They feel very cushiony.  I wear them in my winter boots & just on my feet shoeless around my home.  I wish they came in more colors.  I’m one of those ppl that absolutely cannot stand toe seams on socks, but these have not bothered me at all.  I have super high arches so the only change I would make to the socks would be some compression there.  However, the socks fit perfectly as-is which really surprised me given my arches.  I just like having compression at my arches bc it feels good on them.  I wear a ladies 10-1/2 shoe- mens 8-1/2 and I bought the medium socks. They fit perfectly.  That’s never happened.  I had honestly expected to have to get them wet on my feet & let them shri

In [50]:
# Save the cleaned data
cleaned_file_path = 'data/Cleaned_Sports_and_Outdoors.csv'
df.to_csv(cleaned_file_path, index=False)

In [49]:
# Display the first few rows of the cleaned dataframe
print(df.head())

first_review = df.iloc[0].to_dict()
print("\nFirst review:")
pprint(first_review)

                timestamp  rating  helpful_vote  \
0 2023-02-25 10:30:53.520     5.0             8   
1 2018-04-07 09:36:11.676     5.0             0   
2 2022-05-26 01:01:59.105     5.0             0   
4 2021-04-07 21:43:31.976     5.0             0   
5 2021-02-10 14:02:26.894     5.0             0   

                               title  \
0                       Crazy comfy!   
1                         Excellent!   
2                   Best saddle pads   
4                        Works great   
5  Great stirrups with bar none grip   

                                                text        asin  \
0  Not gonna lie- they are not much to look at. L...  B07F3BDT8T   
1                                 I love it. Pretty!  B00NXQLFQQ   
2  Huge fan of B Vertigo and this dressage pad do...  B08SVPR266   
4  This was great for a slightly too-short girth!...  B00DV0MKUY   
5  I have to say, the grip on these are pretty gr...  B002HPNBMU   

   verified_purchase                       