In [1]:
import os
import pandas as pd
import kagglehub

from kaggle_data_loader import KaggleDataLoader
from data_generator import DataGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Kaggle dataset slug handed to KaggleDataLoader.
kaggle_path = "kritanjalijain/amazon-reviews"

# Column names passed to load_data, in label / title / review order
# (presumably assigned to the CSV columns -- confirm in KaggleDataLoader).
data_column_names = [
    DataGenerator.POLARITY_COLUMN_NAME,
    DataGenerator.TITLE_COLUMN_NAME,
    DataGenerator.REVIEW_COLUMN_NAME,
]

kaggle_data_loader = KaggleDataLoader(kaggle_path)
# load_data returns the directory that holds the dataset files.
path = kaggle_data_loader.load_data(data_column_names)

# Report the dataset location once, then list what it contains.
print("Path to dataset files:", path)
print(os.listdir(path))

train_df = kaggle_data_loader.get_train_df()
test_df = kaggle_data_loader.get_test_df()

print(train_df.head())

# Review strings of each split as plain Python lists.
train_texts = train_df[DataGenerator.REVIEW_COLUMN_NAME].tolist()
test_texts = test_df[DataGenerator.REVIEW_COLUMN_NAME].tolist()

# Shift the polarity labels down by one. The label counts printed in this
# notebook show raw values 1 and 2, so the shifted labels are 0 and 1.
train_labels = (train_df[DataGenerator.POLARITY_COLUMN_NAME] - 1).tolist()
test_labels = (test_df[DataGenerator.POLARITY_COLUMN_NAME] - 1).tolist()

print("Train shape:", train_df.shape)
print("Test shape: ", test_df.shape)

Path to dataset files: C:\Users\hOm3b\.cache\kagglehub\datasets\kritanjalijain\amazon-reviews\versions\2
C:\Users\hOm3b\.cache\kagglehub\datasets\kritanjalijain\amazon-reviews\versions\2
['amazon_review_polarity_csv.tgz', 'test.csv', 'train.csv']
Path to dataset files: C:\Users\hOm3b\.cache\kagglehub\datasets\kritanjalijain\amazon-reviews\versions\2
C:\Users\hOm3b\.cache\kagglehub\datasets\kritanjalijain\amazon-reviews\versions\2
['amazon_review_polarity_csv.tgz', 'test.csv', 'train.csv']
   Label                                              Title  \
0      2                     Stuning even for the non-gamer   
1      2              The best soundtrack ever to anything.   
2      2                                           Amazing!   
3      2                               Excellent Soundtrack   
4      2  Remember, Pull Your Jaw Off The Floor After He...   

                                              Review  
0  This sound track was beautiful! It paints the ...  
1  I'm reading a 

In [3]:
# Class balance of the polarity label in each split (train first, then test).
print("Unique label counts")
print(
    train_df[DataGenerator.POLARITY_COLUMN_NAME].value_counts(),
    test_df[DataGenerator.POLARITY_COLUMN_NAME].value_counts(),
    sep="\n",
)

Unique label counts
Label
2    1800000
1    1800000
Name: count, dtype: int64
Label
2    200000
1    200000
Name: count, dtype: int64


In [4]:
# Length of every review in the training split, followed by the summary
# statistics of those lengths rounded to two decimals.
train_review_lengths = train_df[DataGenerator.REVIEW_COLUMN_NAME].str.len()
reformatted_desc = train_review_lengths.describe().round(2)

print("Train dataset, Review column's length:\n")

# Render each statistic as a fixed two-decimal string so large values are
# not displayed in scientific notation.
print(reformatted_desc.apply("{:.2f}".format))

Train dataset, Review column's length:

count    3600000.00
mean         405.14
std          234.27
min            4.00
25%          207.00
50%          356.00
75%          566.00
max         1010.00
Name: Review, dtype: object


In [5]:
# Length of every review in the test split, followed by the summary
# statistics of those lengths rounded to two decimals.
test_review_lengths = test_df[DataGenerator.REVIEW_COLUMN_NAME].str.len()
reformatted_desc = test_review_lengths.describe().round(2)

print("Test dataset, Review column's length:\n")

# Render each statistic as a fixed two-decimal string so large values are
# not displayed in scientific notation.
print(reformatted_desc.apply("{:.2f}".format))

Test dataset, Review column's length:

count    400000.00
mean        404.90
std         234.11
min          15.00
25%         207.00
50%         356.00
75%         565.00
max        1009.00
Name: Review, dtype: object


In [None]:
###
### This may take a bit of time to run
###

from itertools import chain

def get_unique_words(series):
    """Return the distinct lowercase, whitespace-separated tokens in ``series``.

    Args:
        series: Iterable of review texts (for example a pandas Series).
            Entries that are not ``str`` (such as NaN or None) are skipped.

    Returns:
        set: Every distinct token obtained by lowercasing each string and
        splitting it with ``str.split()``. Punctuation is not stripped, so
        "great" and "great!" are counted as different words.
    """
    # Lazily lowercase and tokenize each string entry.
    token_lists = (
        review.lower().split() for review in series if isinstance(review, str)
    )
    # chain.from_iterable pulls one token list at a time from the generator;
    # chain(*token_lists) would first unpack every list into a single
    # argument tuple, holding all of them in memory at once.
    return set(chain.from_iterable(token_lists))


# Vocabulary of the training split's review column; the test split is not
# included in this count.
train_review_series = train_df[DataGenerator.REVIEW_COLUMN_NAME]
unique_words = get_unique_words(train_review_series)
num_unique_words = len(unique_words)

print(f"Number of unique words in all the reviews: {num_unique_words}")