In [None]:
#pip install tqdm

In [5]:
import os
import pandas as pd
from datetime import datetime
import numpy as np
import nltk
import torch
from io import BytesIO

from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer

# Fetch NLTK's Punkt sentence-tokenizer data, which nltk.tokenize.sent_tokenize
# (used further down to split reviews into sentences) needs at runtime.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johnny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Directory that holds Reviews.csv and receives the generated pickle/xlsx files.
# Set the AMAZON_REVIEWS_DIR environment variable to run the notebook against a
# different location; when it is unset or empty the original path is used.
download_dir = os.environ.get('AMAZON_REVIEWS_DIR') or 'D:\\downloads\\amazon_customer_reviews'
print(download_dir)

D:\downloads\amazon_customer_reviews


In [7]:
# List the names of every entry (files and sub-directories alike) found
# directly inside download_dir.
download_files = os.listdir(download_dir)
print(download_files)

['4ee3300b-8d78-46ac-8abf-72edb1f4f5db', 'amazon_reviews_pickle_paragraphs.xlsx', 'amazon_reviews_pickle_paragraphs_2240129.pkl', 'amazon_reviews_pickle_sentences', 'amazon_reviews_pickle_sentences_2240129.pkl', 'chroma.sqlite3', 'Reviews.csv']


In [8]:

# Describe every directory entry by its full path, its extension ('' when the
# name has none) and its bare name.
data = [
    {
        'filepaths': os.path.join(download_dir, entry),
        'extensions': os.path.splitext(entry)[1],
        'filenames': entry,
    }
    for entry in download_files
]

# One row per directory entry
files_df = pd.DataFrame(data)

# Preview the table
print(files_df.head())

                                           filepaths extensions  \
0  D:\downloads\amazon_customer_reviews\4ee3300b-...              
1  D:\downloads\amazon_customer_reviews\amazon_re...      .xlsx   
2  D:\downloads\amazon_customer_reviews\amazon_re...       .pkl   
3  D:\downloads\amazon_customer_reviews\amazon_re...              
4  D:\downloads\amazon_customer_reviews\amazon_re...       .pkl   

                                      filenames  
0          4ee3300b-8d78-46ac-8abf-72edb1f4f5db  
1         amazon_reviews_pickle_paragraphs.xlsx  
2  amazon_reviews_pickle_paragraphs_2240129.pkl  
3               amazon_reviews_pickle_sentences  
4   amazon_reviews_pickle_sentences_2240129.pkl  


In [9]:
# Keep only the rows whose extension is exactly '.csv' (case-sensitive match)
csv_files_df = files_df[files_df['extensions'] == '.csv']

# Collect the full paths of those CSV files
csv_filepaths = csv_files_df['filepaths'].tolist()

# Display the list
print(csv_filepaths)

['D:\\downloads\\amazon_customer_reviews\\Reviews.csv']


In [10]:
# Use the first CSV found in download_dir as the reviews dataset.  Fail with a
# descriptive error rather than a bare IndexError when no CSV is present.
if not csv_filepaths:
    raise FileNotFoundError(f"No .csv file found in {download_dir}")
reviews_file_path = csv_filepaths[0]
print(reviews_file_path)

D:\downloads\amazon_customer_reviews\Reviews.csv


In [11]:
# Read the raw reviews and expose the review body ('Text') as 'Paragraph'
df_paragraph = pd.read_csv(reviews_file_path).rename(columns={'Text': 'Paragraph'})
df_paragraph.shape

(568454, 10)

In [12]:
# Count the missing values in each of the two text columns
nan_count_paragraph, nan_count_summary = (
    df_paragraph[column].isnull().sum() for column in ('Paragraph', 'Summary')
)

for label, missing in (('Paragraph', nan_count_paragraph), ('Summary', nan_count_summary)):
    print(f"Number of NaNs in '{label}': {missing}")


Number of NaNs in 'Paragraph': 0
Number of NaNs in 'Summary': 27


In [13]:
# Per-column NaN counts for the two text columns, gathered into one Series
nan_counts = df_paragraph[['Paragraph', 'Summary']].isna().sum()

print(nan_counts)


Paragraph     0
Summary      27
dtype: int64


In [14]:

# Drop every review that lacks either its body ('Paragraph') or its 'Summary';
# the cleaned frame replaces df_paragraph.
df_paragraph = df_paragraph.dropna(axis=0, how='any', subset=['Paragraph', 'Summary'])

print(f"After cleaning, df_paragraph rows: {df_paragraph.shape[0]}")

# Preview the cleaned frame
print(df_paragraph.head())

After cleaning, df_paragraph rows: 568427
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                          Paragraph  
0  Good Quality Dog Food  I have bough

In [16]:
df_paragraph.shape

(568427, 10)

In [19]:
def split_paragraphs_into_sentences_old(df):
    """Explode each review paragraph into one output row per sentence.

    Earlier variant of ``split_paragraphs_into_sentences`` without a progress bar.

    Args:
        df: DataFrame with a string ``'Paragraph'`` column.  Every other column
            is copied unchanged onto each sentence row.

    Returns:
        A new DataFrame holding the input columns minus ``'Paragraph'`` plus
        ``'P_index'`` (``'P_<input index label>'``), ``'S_sentence_number'``
        (``'S_<1-based position within the paragraph>'``) and ``'Sentence'``.
        Paragraphs for which the tokenizer yields no sentences add no rows.
    """
    new_rows = []

    for index, row in df.iterrows():
        # Use nltk to split the paragraph into sentences
        sentences = nltk.tokenize.sent_tokenize(row['Paragraph'])

        # The non-sentence columns are the same for every sentence of this
        # paragraph, so convert the row once instead of once per sentence.
        shared = row.to_dict()
        del shared['Paragraph']  # 'Paragraph' is replaced by 'Sentence'

        for sentence_number, sentence in enumerate(sentences, start=1):
            new_rows.append({
                **shared,
                'P_index': f'P_{index}',
                'S_sentence_number': f'S_{sentence_number}',
                'Sentence': sentence,
            })

    return pd.DataFrame(new_rows)

# Example usage:
# df_paragraph = pd.DataFrame({'Paragraph': ["Your paragraphs here."], ...other columns...})
# df_sentence = split_paragraphs_into_sentences(df_paragraph)

In [20]:
def split_paragraphs_into_sentences(df):
    """Explode each review paragraph into one output row per sentence.

    A tqdm progress bar labelled "Splitting paragraphs" tracks the input rows.

    Args:
        df: DataFrame with a ``'Paragraph'`` column.  Every other column is
            copied unchanged onto each sentence row.  Rows whose paragraph is
            not a string (e.g. NaN) are skipped.

    Returns:
        A new DataFrame holding the input columns minus ``'Paragraph'`` plus
        ``'P_index'`` (``'P_<input index label>'``), ``'S_sentence_number'``
        (``'S_<1-based position within the paragraph>'``) and ``'Sentence'``.
        When no sentence is produced at all, an empty DataFrame with those
        same columns is returned so downstream column access keeps working.
    """
    new_rows = []

    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Splitting paragraphs"):
        paragraph = row['Paragraph']
        # NaN / non-text cells cannot be tokenized; they contribute no sentences.
        if not isinstance(paragraph, str):
            continue

        sentences = nltk.tokenize.sent_tokenize(paragraph)
        if not sentences:
            continue

        # The non-sentence columns are the same for every sentence of this
        # paragraph, so convert the row once instead of once per sentence.
        shared = row.to_dict()
        del shared['Paragraph']  # 'Paragraph' is replaced by 'Sentence'

        for sentence_number, sentence in enumerate(sentences, start=1):
            new_rows.append({
                **shared,
                'P_index': f'P_{index}',
                'S_sentence_number': f'S_{sentence_number}',
                'Sentence': sentence,
            })

    if not new_rows:
        # pd.DataFrame([]) would have no columns at all; keep the schema instead.
        columns = [name for name in df.columns if name != 'Paragraph']
        columns += [
            name for name in ('P_index', 'S_sentence_number', 'Sentence')
            if name not in columns
        ]
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(new_rows)


In [21]:
# Explode the cleaned reviews into one row per sentence (shows a progress bar)
df_sentence = split_paragraphs_into_sentences(df_paragraph)

Splitting paragraphs:   0%|          | 0/568427 [00:00<?, ?it/s]

In [22]:
df_sentence.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'P_index',
       'S_sentence_number', 'Sentence'],
      dtype='object')

In [23]:
df_sentence.iloc[0]['Sentence']

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality.'

In [24]:
# Load the pretrained 'bert-base-uncased' tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

def generate_vectors(text):
    """Embed ``text`` with the module-level BERT ``model`` using mean pooling.

    Note: a later cell defines ``generate_vectors`` again; that later
    definition is the one in effect when the embedding cells run.

    Args:
        text: A string, or a list of strings to embed as one padded batch.
            Inputs longer than 512 tokens are truncated.

    Returns:
        numpy array of shape (batch, hidden): for each input, the mean of the
        last hidden state over its real (non-padding) tokens.
    """
    # Check if GPU is available and use it; otherwise, use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send model to device (GPU or CPU)
    model.to(device)

    # Ensure no gradient calculations
    with torch.no_grad():
        # Prepare inputs and send them to the device
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        hidden = model(**inputs).last_hidden_state

        # Average over real tokens only: a plain .mean(dim=1) would also count
        # the padding positions added when texts of different lengths are batched.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        outputs = ((hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)).to('cpu')

    return outputs.numpy()


In [25]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the pretrained 'bert-base-uncased' tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def generate_vectors(text):
    """Return a mean-pooled BERT embedding of ``text`` as float32.

    Uses the module-level ``tokenizer`` and ``model``; runs on the GPU when
    ``torch.cuda.is_available()`` and on the CPU otherwise.

    Args:
        text: A string, or a list of strings to embed as one padded batch.
            Inputs longer than 512 tokens are truncated.

    Returns:
        float32 numpy array.  ``squeeze()`` removes size-1 dimensions, so a
        single string yields a 1-D vector while a batch of several strings
        yields a 2-D (batch, hidden) array.
    """
    # Check if GPU is available and use it; otherwise, use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send model to device (GPU or CPU)
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Ensure no gradient calculations
    with torch.no_grad():
        # Prepare inputs and send them to the device
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        hidden = model(**inputs).last_hidden_state

        # Average over real tokens only: a plain .mean(dim=1) would also count
        # the padding positions added when texts of different lengths are batched.
        # For a single string the mask is all ones, i.e. an ordinary mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        outputs = pooled.squeeze().to('cpu').numpy()

    # Convert the output to float32 for compatibility and ensure it's flat
    return outputs.astype(np.float32)

# Example usage with a DataFrame
# df_sentence['Summary_vector'] = df_sentence['Summary'].progress_apply(lambda x: generate_vectors(x) if isinstance(x, str) else np.nan)


In [27]:
# Register tqdm with pandas so that Series.progress_apply is available and
# shows a progress bar labelled "processing".
tqdm.pandas(desc="processing")


In [28]:
# Embed each row's 'Summary' with generate_vectors; non-string values become NaN.
# NOTE(review): df_sentence repeats a review's Summary on every sentence row, so
# identical summaries are embedded once per sentence - consider embedding unique
# values once and mapping them back.
df_sentence['Summary_vector'] = df_sentence['Summary'].progress_apply(lambda x: generate_vectors(x) if isinstance(x, str) else np.nan)

processing:   0%|          | 0/2832752 [00:00<?, ?it/s]

In [29]:
# Embed each individual sentence with generate_vectors; non-string values become NaN.
df_sentence['Sentence_vector'] = df_sentence['Sentence'].progress_apply(lambda x: generate_vectors(x) if isinstance(x, str) else np.nan)


processing:   0%|          | 0/2832752 [00:00<?, ?it/s]

In [43]:
# Persist the sentence-level frame (embedding columns included) as a pickle
# inside download_dir, then show where it was written.
df_pickle_filename = os.path.join(download_dir,"amazon_reviews_pickle_sentences_20240219.pkl")
df_sentence.to_pickle(df_pickle_filename)
print(df_pickle_filename)

D:\downloads\amazon_customer_reviews\amazon_reviews_pickle_sentences_20240219.pkl


In [31]:
df_sentence.sample(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,P_index,S_sentence_number,Sentence,Summary_vector,Sentence_vector
2185592,436940,B003ODBTBO,AYUCCFUVEEMAU,donimbo,2,2,5,1342569600,donimbo,P_436939,S_5,"As stated in S. Crocker's review, Keurig is a ...","[-0.58030456, 0.09047128, -0.30690676, -0.2801...","[-0.15631469, -0.29024586, -0.19089317, -0.177..."
2483613,497385,B001IZIC8I,A2Z2NNP4BCY8F8,"Kristina Pearson ""Photogrrl""",9,15,1,1261872000,Not What I'd Hoped For,P_497384,S_7,I have no complaints about the quality of the ...,"[0.373434, -0.2335425, -0.007979254, -0.015261...","[0.23785596, 0.11807627, 0.13398808, -0.124233..."
1000519,198645,B002AQL00G,AIXCATW18SQPD,"P. Goldberg ""perihope""",0,0,5,1350864000,Gluten Free and Good!,P_198644,S_3,I make sure the butter is extremely soft befor...,"[0.07756505, 0.047983423, 0.26800308, -0.21137...","[-0.22513467, -0.06171866, -0.3427693, 0.17870..."
107435,21798,B000KV61FC,A2G1LRD120SJPC,K. Hill,1,1,5,1305590400,Surprised at how much my dog liked this!,P_21797,S_6,"Yes, I kept both of them, just in case the oth...","[0.12298089, 0.12425774, 0.13720831, -0.082817...","[0.040124238, -0.105401315, 0.028834188, -0.24..."
1483589,294909,B005V9UG18,A3FHULPCEN9DID,T,0,1,5,1330128000,Wonderful product from a wonderful company,P_294908,S_4,I have bought these many times.,"[0.16828898, 0.06759964, 0.26373923, 0.1195057...","[0.52047026, 0.12971567, 0.16817996, -0.342357..."


In [32]:
import sys

# Report the size of df_sentence as measured by sys.getsizeof
size_in_bytes = sys.getsizeof(df_sentence)
size_in_mb = size_in_bytes / (1024**2)  # bytes -> MB (1 MB = 1024 * 1024 bytes here)
print("DataFrame size: {:.2f} MB".format(size_in_mb))


DataFrame size: 18868.40 MB


In [33]:
import gc
gc.collect()


0

In [34]:
import sys

# Snapshot (name, size) for every global before assigning anything new, so the
# namespace is not modified while it is being iterated.
all_objects = [
    (symbol, sys.getsizeof(value))
    for symbol, value in globals().items()
]

# Largest first; the sort is stable, so equal sizes keep their original order
sorted_objects = sorted(all_objects, key=lambda pair: -pair[1])

# Print the object names and their sizes
for name, size in sorted_objects:
    print(f"{name}: {size} bytes")


df_sentence: 19784953689 bytes
df_paragraph: 471976577 bytes
__: 34982 bytes
_31: 34982 bytes
files_df: 2159 bytes
tqdm: 2008 bytes
BertTokenizer: 2008 bytes
BertModel: 2008 bytes
_i25: 1252 bytes
_i19: 1123 bytes
_i17: 1119 bytes
_22: 1118 bytes
AutoModel: 1064 bytes
AutoTokenizer: 1064 bytes
_i20: 764 bytes
_i8: 484 bytes
_i5: 423 bytes
_i34: 422 bytes
datetime: 408 bytes
BytesIO: 408 bytes
_ih: 376 bytes
In: 376 bytes
_oh: 360 bytes
Out: 360 bytes
csv_files_df: 356 bytes
_i14: 339 bytes
_i28: 310 bytes
_i12: 289 bytes
_i9: 280 bytes
df_pickle_filename: 255 bytes
_ii: 252 bytes
_i32: 252 bytes
_i11: 215 bytes
_i24: 207 bytes
filepath: 195 bytes
reviews_file_path: 195 bytes
_i29: 185 bytes
_i30: 172 bytes
nan_counts: 162 bytes
download_dir: 159 bytes
___: 158 bytes
_23: 158 bytes
_i26: 158 bytes
_i6: 156 bytes
open: 136 bytes
split_paragraphs_into_sentences: 136 bytes
split_paragraphs_into_sentences_old: 136 bytes
generate_vectors: 136 bytes
_i13: 132 bytes
download_files: 120 bytes
d

In [35]:
for element in sorted_objects[0:4]:
    print(element)


('df_sentence', 19784953689)
('df_paragraph', 471976577)
('__', 34982)
('_31', 34982)


In [36]:
print(__)  
print(type(__))  


             Id   ProductId          UserId                   ProfileName  \
2185592  436940  B003ODBTBO   AYUCCFUVEEMAU                       donimbo   
2483613  497385  B001IZIC8I  A2Z2NNP4BCY8F8  Kristina Pearson "Photogrrl"   
1000519  198645  B002AQL00G   AIXCATW18SQPD        P. Goldberg "perihope"   
107435    21798  B000KV61FC  A2G1LRD120SJPC                       K. Hill   
1483589  294909  B005V9UG18  A3FHULPCEN9DID                             T   

         HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
2185592                     2                       2      5  1342569600   
2483613                     9                      15      1  1261872000   
1000519                     0                       0      5  1350864000   
107435                      1                       1      5  1305590400   
1483589                     0                       1      5  1330128000   

                                            Summary   P_index  \
2185592        

In [37]:
#del df_sentences


In [38]:
gc.collect()

0

In [39]:
df_paragraph.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Paragraph'],
      dtype='object')

In [40]:
# Embed each full review paragraph with generate_vectors (which truncates at
# 512 tokens); non-string values become NaN.
df_paragraph['Paragraph_vector'] = df_paragraph['Paragraph'].progress_apply(lambda x: generate_vectors(x) if isinstance(x, str) else np.nan)

processing:   0%|          | 0/568427 [00:00<?, ?it/s]

In [45]:
# Persist the paragraph-level frame (embedding column included) as a pickle
# inside download_dir, then show where it was written.
df_pickle_filename = os.path.join(download_dir,"amazon_reviews_pickle_paragraphs_20240219.pkl")
df_paragraph.to_pickle(df_pickle_filename)
print(df_pickle_filename)

D:\downloads\amazon_customer_reviews\amazon_reviews_pickle_paragraphs_20240219.pkl


In [46]:
# Export the paragraph-level reviews to Excel, leaving out the embedding column
df_xlsx_filename = os.path.join(download_dir, "amazon_reviews_pickle_paragraphs_20240219.xlsx")
df_without_vector = df_paragraph.drop(columns=['Paragraph_vector'])
df_without_vector.to_excel(df_xlsx_filename, index=False)

print(df_xlsx_filename)

D:\downloads\amazon_customer_reviews\amazon_reviews_pickle_paragraphs_20240219.xlsx


In [47]:
os.getcwd()

'D:\\github\\Johnny_Data606'