https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/data


In [None]:
import kagglehub

In [None]:
# Resolve the local directory that holds the latest version of the dataset
DATASET_HANDLE = "snap/amazon-fine-food-reviews"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/amazon-fine-food-reviews


In [None]:
!ls -alh {path}

total 643M
drwxr-sr-x 2 1000 1000    0 May 18 09:43 .
drwxr-xr-x 3 root root 4.0K May 18 10:04 ..
-rw-r--r-- 1 1000 1000 356M May 18 09:43 database.sqlite
-rw-r--r-- 1 1000 1000  277 May 18 09:42 hashes.txt
-rw-r--r-- 1 1000 1000 287M May 18 09:43 Reviews.csv


In [None]:
import pandas as pd
import numpy as np
import re
import string
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read only the first 100,000 reviews. `nrows` stops parsing early, so the
# remainder of the CSV is never loaded just to be thrown away by `.head()`.
df = pd.read_csv(path + "/Reviews.csv", nrows=100000)
print(df.head())  # Display the first few rows

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [None]:
df['Text'].head()

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...


In [None]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw review into lowercase, stopword-free words.

    Steps: strip HTML markup, lowercase, replace every character that is not
    ``a-z`` or whitespace with a space, then drop the words in ``stop_words``.

    Args:
        text: Raw review text. Non-string values (e.g. NaN for a missing
            review) are treated as empty text.

    Returns:
        str: The remaining words joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        return ''

    # 1. Remove HTML tags. The separator keeps the words on either side of a
    #    tag (e.g. "good<br />taste") from being glued together.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # 2. Lowercase
    text = text.lower()

    # 3. Replace special characters and numbers with a space instead of
    #    deleting them, so "peanuts...the" becomes "peanuts the" rather than
    #    the single bogus token "peanutsthe".
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 4. Remove stopwords (split() also collapses the extra whitespace)
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text

In [None]:
df['Cleaned_Text'] = df['Text'].apply(clean_text)

In [None]:
df[['Text', 'Cleaned_Text']].head()

Unnamed: 0,Text,Cleaned_Text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


# Lemmatization

In [None]:
pip install nltk



In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # For WordNet lemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text`` with WordNet.

    Returns the lemmas joined back together with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

In [None]:
!pip install -qq spacy
!python -m spacy download en_core_web_sm

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.[0m[31m
[0mCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")


def lemmatize_with_spacy(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    Args:
        text: A single document as a string.

    Returns:
        str: The token lemmas joined by single spaces.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc)


# Lemmatize the whole column in batches with nlp.pipe instead of calling
# nlp() once per row through .apply(). The dependency parser and NER are
# switched off for this pass because only token.lemma_ is read here.
# NOTE(review): confirm lemmas are unchanged with these components disabled
# for the installed model version.
with nlp.select_pipes(disable=["parser", "ner"]):
    df['lemmatized_text'] = [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(df['Cleaned_Text'], batch_size=1000)
    ]

In [None]:
df[['Text', 'Cleaned_Text','lemmatized_text']].head()

Unnamed: 0,Text,Cleaned_Text,lemmatized_text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,buy several vitality can dog food product find...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,product arrive label jumbo salt peanutsthe pea...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...,confection around century light pillowy citrus...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,look secret ingredient robitussin believe find...
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,great taffy great price wide assortment yummy ...


In [None]:
df[['Text', 'Cleaned_Text', 'lemmatized_text']].to_csv('processed_text.csv')


In [None]:
from google.colab import files
!zip processed_text.zip processed_text.csv
files.download('processed_text.zip')


updating: processed_text.csv (deflated 72%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Bag of Words


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
import pickle
import os
import gc

# Step 1: make sure the preprocessing cells have produced the input column.
# Fail loudly here: merely printing a message would let the cell carry on
# and crash further down with a less helpful NameError/KeyError.
print("Step 1: Verifying data...")
if 'df' not in globals() or 'lemmatized_text' not in df.columns:
    raise RuntimeError(
        "Error: DataFrame with lemmatized text not found. "
        "Please run the preprocessing steps first."
    )
print(f"Found DataFrame with {len(df)} rows.")

# Step 2: build the document-term count matrix
print("\nStep 2: Creating Bag of Words representation...")
vectorizer_params = {
    "max_features": 10000,  # keep at most the top 10,000 terms
    "min_df": 5,            # drop terms found in fewer than 5 documents
    "max_df": 0.8,          # drop terms found in more than 80% of documents
}
count_vectorizer = CountVectorizer(**vectorizer_params)

# Learn the vocabulary and count terms for every row in one pass
print("Processing all documents...")
lemmatized_docs = df['lemmatized_text']
bow_matrix = count_vectorizer.fit_transform(lemmatized_docs)

# Report vocabulary size, matrix shape and how sparse the matrix is
feature_names = count_vectorizer.get_feature_names_out()
n_docs, n_terms = bow_matrix.shape
filled_fraction = bow_matrix.nnz / (n_docs * n_terms)
print(f"Vocabulary size: {len(feature_names)} unique words")
print(f"Bag of Words matrix shape: {bow_matrix.shape}")
print(f"Sparsity: {100 * (1 - filled_fraction):.2f}%")

# Sanity check: one matrix row per DataFrame row
n_rows = len(df)
print(f"Documents in dataset: {n_rows}")
print(f"Documents in BoW matrix: {n_docs}")
assert n_rows == n_docs, "Not all documents were processed!"

# Step 3: persist the sparse matrix and the fitted vectorizer
print("\nStep 3: Saving results...")
sparse.save_npz('bow_matrix_100k.npz', bow_matrix)
with open('count_vectorizer_100k.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

# Total occurrences of each vocabulary term, ranked from most to least common
word_counts = np.asarray(bow_matrix.sum(axis=0)).ravel()
word_freq = (
    pd.DataFrame({'word': feature_names, 'count': word_counts})
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
word_freq.to_csv('word_frequencies_100k.csv', index=False)

# Show the head of the ranking
print("\nTop 20 most common words:")
print(word_freq.head(20))

# Step 4: densify only a handful of rows so they can be shown as a table
print("\nStep 4: Creating sample visualization...")
sample_size = min(5, bow_matrix.shape[0])
sample_df = pd.DataFrame(
    data=bow_matrix[:sample_size].toarray(),
    columns=feature_names,
)

# Show just the first ten vocabulary columns of that sample
print("\nSample of Bag of Words (first 5 rows, first 10 columns):")
print(sample_df.iloc[:, :10])

# Flag every review in the DataFrame as having a Bag of Words row
df['has_bow'] = True

for message in (
    "\nBag of Words processing complete!",
    "Files saved:",
    "- bow_matrix_100k.npz: Sparse matrix with full bag of words",
    "- count_vectorizer_100k.pkl: The vectorizer model with vocabulary",
    "- word_frequencies_100k.csv: Word frequency counts",
):
    print(message)

# Drop the dense sample and run a garbage-collection pass
del sample_df
gc.collect()

Step 1: Verifying data...
Found DataFrame with 100000 rows.

Step 2: Creating Bag of Words representation...
Processing all documents...
Vocabulary size: 10000 unique words
Bag of Words matrix shape: (100000, 10000)
Sparsity: 99.68%
Documents in dataset: 100000
Documents in BoW matrix: 100000

Step 3: Saving results...

Top 20 most common words:
       word  count
0       not  57634
1      like  50986
2      good  44615
3     taste  44436
4    flavor  35953
5       one  32819
6       get  32368
7      love  31569
8        do  31163
9   product  30977
10     make  30766
11      try  29206
12      use  29065
13    great  28675
14     well  28363
15      tea  26479
16      buy  25886
17     food  25496
18   coffee  25337
19    would  24610

Step 4: Creating sample visualization...

Sample of Bag of Words (first 5 rows, first 10 columns):
   aa  aafco  ab  aback  abandon  abc  abdominal  ability  abit  able
0   0      0   0      0        0    0          0        0     0     0
1   0      0 

204

In [None]:
df.to_csv('preprocessed_reviews_100k.csv', index=False)

# Amazon Fine Food Reviews - Bag of Words Processing
## Files Created:

### **bow_matrix_100k.npz**: Sparse matrix with bag of words representation (100,000 documents × 10,000 features)
### **count_vectorizer_100k.pkl**: Vectorizer model with vocabulary mapping
### **word_frequencies_100k.csv**: Word frequency counts for all terms

## Processing Summary:





* Successfully processed all 100,000 reviews
* Created vocabulary of 10,000 most relevant terms
* Matrix sparsity: 99.68% (memory-efficient)
* Top words: "not", "like", "good", "taste", "flavor"
* All documents from original dataset included

## Loading for Future Use:

    from scipy import sparse

    import pickle


    bow_matrix = sparse.load_npz('bow_matrix_100k.npz')


    with open('count_vectorizer_100k.pkl', 'rb') as f:

          count_vectorizer = pickle.load(f)
    

    feature_names = count_vectorizer.get_feature_names_out()

In [None]:
!pip install gensim

Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [None]:
# Reset packages: remove numpy and gensim to work around the version
# conflict pip reported when gensim was installed.
!pip uninstall -y numpy
!pip uninstall -y gensim

# Reinstall numpy first, then gensim.
# NOTE(review): neither install is pinned. In the recorded run the bare
# `pip install numpy` fetched 2.2.6, while gensim 4.3.3 declares numpy<2.0,
# so pinning a compatible pair may be more reliable — confirm.
# NOTE(review): packages swapped under a running kernel are presumably only
# picked up after a kernel restart — confirm.
!pip install numpy
!pip install gensim

# Now try importing again
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Continue with the code after successful import

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.6
Coll

In [None]:

# Step 1: Load your data first
import pandas as pd
import numpy as np
from scipy import sparse
import pickle

# Reload the preprocessed reviews written earlier by
# df.to_csv('preprocessed_reviews_100k.csv', index=False), so this cell can
# run without repeating the cleaning and lemmatization steps.
df = pd.read_csv('preprocessed_reviews_100k.csv')

# If that file was not saved, the DataFrame has to be rebuilt from the original source.
# NOTE(review): the raw file presumably lacks 'lemmatized_text', so the
# preprocessing cells would have to be re-run as well — confirm.
# For example:
# df = pd.read_csv(path + "/Reviews.csv")
# df = df.head(100000)  # Take first 100,000 rows

# Check that the lemmatized_text column exists and preview its first rows
print(df.columns)
print(df['lemmatized_text'].head())

# Step 2: Now run your LDA code
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Tokenize the lemmatized text.
# A review whose lemmatized text is empty is written to the CSV as an empty
# field, which pd.read_csv loads back as NaN (a float). Calling .split() on
# it would raise AttributeError, so missing values become empty documents.
lemmatized_docs = df['lemmatized_text'].fillna('').astype(str)
tokenized_texts = [text.split() for text in lemmatized_docs]

# Create dictionary (token -> integer id mapping)
dictionary = Dictionary(tokenized_texts)

# Filter extremes to remove very rare and very common words:
# drop tokens in fewer than 5 documents or in more than 70% of documents
dictionary.filter_extremes(no_below=5, no_above=0.7)

# Create a bag-of-words corpus: one list of (token_id, count) per document
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

# Fit a 50-topic LDA model on the corpus (fixed seed, 10 passes)
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    passes=10,
    random_state=42,
)

# Persist the dictionary, the corpus and the trained model
dictionary.save('lda_dictionary.dict')
with open('lda_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
lda_model.save('lda_model_50topics')

# List every topic with its highest-weighted words
print("Top words in topics:")
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(f"Topic #{topic_id}: {topic_terms}")

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'Cleaned_Text', 'lemmatized_text', 'has_bow'],
      dtype='object')
0    buy several vitality can dog food product find...
1    product arrive label jumbo salt peanutsthe pea...
2    confection around century light pillowy citrus...
3    look secret ingredient robitussin believe find...
4    great taffy great price wide assortment yummy ...
Name: lemmatized_text, dtype: object
Top words in topics:
Topic #0: 0.314*"box" + 0.033*"gum" + 0.023*"picture" + 0.022*"service" + 0.022*"packaging" + 0.021*"pack" + 0.021*"customer" + 0.020*"send" + 0.018*"show" + 0.017*"directly"
Topic #1: 0.126*"butter" + 0.117*"peanut" + 0.067*"pie" + 0.052*"bread" + 0.040*"bake" + 0.037*"flour" + 0.035*"grill" + 0.029*"roll" + 0.021*"slice" + 0.020*"sandwich"
Topic #2: 0.124*"mix" + 0.099*"make" + 0.056*"ice" + 0.048*"use" + 0.046*"recipe" + 0.045*"cream" + 0.0