# **Preliminary Data Analysis**

This script performs exploratory data analysis (EDA) on our sample of Amazon fashion item reviews (available [here]()):
* Rating distribution (overall and by sentiment)
* Review length
* Word clouds
* Topic modeling using BERTopic

## Preparations

In [2]:
# Import packages
import re
from zipfile import ZipFile
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from datasets import load_dataset
from langdetect import detect, DetectorFactory
from transformers import set_seed
from collections import Counter
import nltk
from wordcloud import WordCloud
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources used below: the tokenizer models needed by
# word_tokenize and the English stop word list.
nltk.download('punkt')
# Recent NLTK releases (3.8.2+) load the tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' models, so fetch both to work across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set seeds for reproducible and consistent results
set_seed(42)

  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package punkt to /home/tom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [None]:
# Load sampled training data
# NOTE: the archive handle must not be named `zip`. All notebook cells share
# one namespace, so that name would replace the builtin zip() that the word
# cloud cells call later.
with ZipFile('Data/checkpoint3.zip', 'r') as archive:
    # The `with` block closes the member file; no explicit close() is needed.
    with archive.open('checkpoint3.csv') as file:
        reviews_sample = pd.read_csv(file)

In [None]:
# Load whole dataset
# NOTE: the archive handle must not be named `zip`. All notebook cells share
# one namespace, so that name would replace the builtin zip() that the word
# cloud cells call later.
with ZipFile('Data/checkpoint2.zip', 'r') as archive:
    # The `with` block closes the member file; no explicit close() is needed.
    with archive.open('checkpoint2.csv') as file:
        reviews_all = pd.read_csv(file)

# Whole Dataset

### Rating distribution

In [None]:
# Count the reviews per rating as a frame with explicit 'rating' and 'count'
# columns. Naming them here keeps the plot independent of how the installed
# pandas version labels the output of value_counts().
ratings = (
    reviews_all['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)

plt.figure(figsize=(10, 6))
# Plot bar graph using seaborn
sns.barplot(x='rating', y='count', data=ratings)
# Set plot title and labels
plt.title('Rating Distribution', weight='bold')
plt.xlabel('Rating', weight='bold')
plt.ylabel('Number of Reviews', weight='bold')

# Save the plot
plt.savefig('Plots/rating_distribution_all.pdf', format='pdf', dpi=1500)

# Show the plot
plt.show()

### Sentiment distribution

In [None]:
# Replace 1 with 'POS' and 0 with 'NEG'.
# The result is assigned back to the column: replace(inplace=True) on a
# selected column is a chained in-place update, which warns on recent pandas
# and does not modify the frame once Copy-on-Write is enabled.
reviews_all['label'] = reviews_all['label'].replace({1: 'POS', 0: 'NEG'})

# Absolute number of reviews per sentiment class
category_number = reviews_all['label'].value_counts()
print(category_number)

# Share of each sentiment class in percent
category_percentages = reviews_all['label'].value_counts(normalize=True) * 100
print(category_percentages)

In [None]:
# Count the reviews for every (rating, sentiment) combination.
# NOTE(review): this groups on a 'sentiment' column, while the cell above
# recodes 'label' -- confirm that 'sentiment' exists in checkpoint2.csv.
grouped_reviews = reviews_all.groupby(['rating', 'sentiment'])
sentiment_distribution = grouped_reviews.size().reset_index(name='Count')
sentiment_distribution

In [None]:
# Sentiment distributions: grouped bar chart of review counts per rating,
# with one bar per sentiment class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=sentiment_distribution,
    x='rating',
    y='Count',
    hue='sentiment',
    palette=['red', 'green'],
    ax=ax,
)

# Title, axis labels and legend
ax.set_title('Sentiment Distribution by Rating', weight='bold')
ax.set_xlabel('Rating', weight='bold')
ax.set_ylabel('Number of Reviews', weight='bold')
ax.legend(title='Sentiment')

# Write the figure to disk, then display it
fig.savefig('Plots/sentiment_dist_by_rating.pdf', format='pdf', dpi=1500)
plt.show()

# Training data

## Review length

### Distribution of Review Lengths

In [None]:
# Tokenize the reviews and determine their lengths
revs = reviews_sample['text'].copy()

# One token count per review, in the same row order as reviews_sample
doc_lengths = np.array([len(nltk.word_tokenize(rev)) for rev in revs])

# Plot the distribution of review lengths (Figure A1 in the Appendix)
plt.figure(figsize=(10,8))
ax = sns.histplot(doc_lengths, binwidth=7, kde=True)
# First line on the axes, i.e. the density curve requested with kde=True
ax.lines[0].set_color('black')
plt.ylabel('Number of Reviews', weight='bold', fontsize=17)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Apply the layout before saving so the PDF matches the figure that is shown
plt.tight_layout()
plt.savefig('hist_rev_len.pdf', format='pdf', dpi=1500)
plt.show()

### Review Lengths (Total and by Sentiment)

In [None]:
# Calculations for text in Data chapter
print('Average length of all reviews:' + str(np.average(doc_lengths)))
print('Standard deviation of the length of all reviews:' + str(np.std(doc_lengths)))

# Calculations for footnote in Data chapter
# doc_lengths holds one token count per row of reviews_sample (same order), so
# the per-sentiment lengths are selected with a label mask instead of
# tokenizing the positive and negative reviews a second time.
sample_labels = reviews_sample['label'].to_numpy()
if len(sample_labels) != len(doc_lengths):
    # Guards against stale notebook state (doc_lengths from an older sample)
    raise ValueError('doc_lengths does not match reviews_sample; re-run the review length cell first')

## Length of positive reviews
doc_lengths_pos = doc_lengths[sample_labels == 1]

## Length of negative reviews
doc_lengths_neg = doc_lengths[sample_labels == 0]

## Check average length and SD of length
print('\n')
print('Average length of positive reviews:' + str(np.average(doc_lengths_pos)))
print('Standard deviation of the length of positive reviews:' + str(np.std(doc_lengths_pos)))

print('\n')
print('Average length of negative reviews:' + str(np.average(doc_lengths_neg)))
print('Standard deviation of the length of negative reviews:' + str(np.std(doc_lengths_neg)))

## Rating Distribution

### Overall Rating Distribution

In [None]:
# Extract number of reviews by ratings as a frame with explicit 'rating' and
# 'count' columns, so the plot does not depend on how the installed pandas
# version labels the output of value_counts().
ratings = (
    reviews_sample['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)

# Plot number of reviews by ratings
plt.figure(figsize=(10, 6))
sns_barplot = sns.barplot(x='rating', y='count', data=ratings)
for bar in sns_barplot.patches:
    bar.set_edgecolor('black')  # Set the border color
    bar.set_linewidth(1)        # Set the border thickness
plt.xlabel('Rating', weight='bold', fontsize=17)
plt.ylabel('Number of Reviews', weight='bold', fontsize=17)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(top=34000)
# Apply the layout before saving so the PDF matches the figure that is shown
plt.tight_layout()
plt.savefig('rating_distribution.pdf', format='pdf', dpi=1500)
plt.show()

### Rating Distribution by Sentiment

In [None]:
# Extract number of reviews by ratings and sentiment
value_counts = reviews_sample.groupby(['rating', 'label']).size().reset_index(name='counts')

# Plot number of reviews by ratings and sentiment
plt.figure(figsize=(10, 6))
sns_barplot = sns.barplot(data=value_counts, x='rating', y='counts', hue='label', palette={0: 'red', 1: 'green'})
for bar in sns_barplot.patches:
    bar.set_edgecolor('black')  # Set the border color
    bar.set_linewidth(1)        # Set the border thickness
plt.xlabel('Rating', weight='bold', fontsize=17)
plt.ylabel('Number of Reviews', weight='bold', fontsize=17)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(top=34000)
# Keep the legend handles seaborn created but replace the 0/1 hue labels with
# readable names (label 0 is drawn in red, label 1 in green, see palette).
handles, _ = plt.gca().get_legend_handles_labels()
custom_labels = ['Negative', 'Positive']
plt.legend(title='Sentiment', handles=handles, labels=custom_labels, title_fontsize=16, fontsize=14)
# Apply the layout before saving so the PDF matches the figure that is shown
plt.tight_layout()
plt.savefig('rating_distribution_sentiment.pdf', format='pdf', dpi=1500)
plt.show()

## Word Clouds

### TF-IDF

In [None]:
revs = reviews_sample['text'].copy()

# Create the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

# Fit and transform the documents (the result is a sparse document-term matrix)
tfidf_matrix = vectorizer.fit_transform(revs)

# Get the feature names (the terms)
feature_names = vectorizer.get_feature_names_out()

# Keep the matrix sparse: a dense (documents x vocabulary) array of the whole
# sample can exhaust memory. Only the first rows are densified for viewing.
tfidf_preview = tfidf_matrix[:5].toarray()

# Print a summary and a preview of the TF-IDF matrix
print("TF-IDF Matrix:")
print(f"shape={tfidf_matrix.shape}, non-zero entries={tfidf_matrix.nnz}")
print(tfidf_preview)

# Print the feature names
print("Feature Names:")
print(feature_names)

### Word Cloud for all Reviews

In [None]:
# Aggregate the TF-IDF weight of every term over all documents
column_totals = tfidf_matrix.sum(axis=0)
tfidf_scores = np.asarray(column_totals).ravel()  # 1-D array, one score per term

# Map each term to its aggregated TF-IDF score
tfidf_dict = {term: score for term, score in zip(feature_names, tfidf_scores)}

# Build the word cloud; term size follows the aggregated score
cloud_builder = WordCloud(width=4000, height=2000, background_color='white', colormap='viridis')
wordcloud = cloud_builder.generate_from_frequencies(tfidf_dict)

# Draw the word cloud without axes, save it and show it
fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
fig.tight_layout()
fig.savefig('wordcloud.pdf', format='pdf', dpi=1000)
plt.show()

### Word Clouds by Sentiment

In [None]:
# Function to generate a word cloud for one sentiment class or one rating
def generate_wordcloud_for_sentiment(df, column, value):
    """Plot and save a TF-IDF weighted word cloud for a subset of reviews.

    The rows of ``df`` where ``df[column] == value`` are selected, a TF-IDF
    model is fitted on the ``'text'`` column of those rows only, and every
    term is sized by its TF-IDF score summed over the selected reviews.

    Args:
        df: DataFrame with a ``'text'`` column and the filter ``column``.
        column: Name of the column to filter on. ``'label'`` produces the
            sentiment titles and file names; any other column is treated
            as a rating.
        value: Value of ``column`` to keep. For ``'label'``, ``1`` is
            titled "Positive" and every other value "Negative".

    Raises:
        ValueError: If no row of ``df`` matches ``column == value``.

    The figure is written to the working directory as ``wordcloud_pos.pdf``,
    ``wordcloud_neg.pdf`` or ``wordcloud_rating<value>.pdf`` and then shown.
    """
    # Review texts of the selected subset (one document per review)
    documents = df.loc[df[column] == value, 'text']
    if documents.empty:
        # Fail with a clear message instead of the vectorizer's error about
        # an empty vocabulary.
        raise ValueError(f'No reviews found with {column} == {value!r}')

    # Decide the title and output file once instead of branching twice
    if column == 'label':
        title, file_suffix = ('Positive', 'pos') if value == 1 else ('Negative', 'neg')
    else:
        title, file_suffix = f'Rating: {value}', f'rating{value}'

    # Fit TF-IDF on the subset and sum the scores of each term
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    # Create a dictionary mapping words to their summed TF-IDF scores
    tfidf_dict = dict(zip(feature_names, tfidf_scores))

    # Generate the word cloud
    wordcloud = WordCloud(width=4000, height=2000, background_color='black').generate_from_frequencies(tfidf_dict)

    # Display and save the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, weight='bold', fontsize=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(f'wordcloud_{file_suffix}.pdf', format='pdf', dpi=1000)
    plt.show()

# One word cloud per sentiment class: positive (label 1) first, then negative (label 0)
for sentiment_label in (1, 0):
    generate_wordcloud_for_sentiment(reviews_sample, 'label', sentiment_label)

### Word Clouds by Rating

In [None]:
# One word cloud per star rating, from 1 up to and including 5
for star_rating in range(1, 6):
    generate_wordcloud_for_sentiment(reviews_sample, 'rating', star_rating)

## Topic Modelling with BERTopic

As topic modelling requires GPUs, the procedure is run on Kaggle with the following code:

In [None]:
import os
import warnings

import spacy
import nltk
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
# Changed to CPU supported modules due to unavailability 
#from cuml.cluster import HDBSCAN
#from cuml.manifold import UMAP
from hdbscan import HDBSCAN
import umap
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoTokenizer
from transformers import pipeline, set_seed

# Suppress all DeprecationWarning messages for the rest of the session
warnings.filterwarnings("ignore", category=DeprecationWarning)      
# Seed through transformers.set_seed so that repeated runs are comparable
set_seed(42)

class BertopicModel:
    """Configure and run a BERTopic pipeline on one text column of a CSV file.

    ``run`` reads the CSV, fits the topic model, optionally reassigns outlier
    documents and writes the topic table and the per-document assignments to
    two CSV files.
    """

    # Checkpoint used both for the tokenizer and as BERTopic's embedding model
    EMBEDDING_MODEL_NAME = 'intfloat/e5-large-v2'

    def __init__(self, nr_topics):
        """Store the topic count and build the vectorizer and tokenizer.

        Args:
            nr_topics: Forwarded to ``BERTopic(nr_topics=...)``.
        """
        self.nr_topics = nr_topics
        # Prepare stopwords list
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        # Unigram bag-of-words with the NLTK English stop words removed
        self.vectorizer_model = CountVectorizer(
            ngram_range=(1, 1), stop_words=list(self.stop_words))
        # Kept for backward compatibility; the tokenizer is NOT used as the
        # embedding model (see topic_model).
        self.tokenizer = AutoTokenizer.from_pretrained(self.EMBEDDING_MODEL_NAME)

    def clean(self, df, column):
        """Drop rows whose ``column`` is null.

        Args:
            df: Input DataFrame.
            column: Name of the text column.

        Returns:
            Tuple ``(filtered_df, documents)`` where ``documents`` is the
            remaining content of ``column`` as a list.
        """
        df = df.loc[df[column].notnull(), :]
        documents = df[column].to_list()
        return df, documents

    def get_representation_model(self):
        """Build the list of representation models passed to BERTopic.

        Returns:
            ``[MaximalMarginalRelevance, KeyBERTInspired, TextGeneration]``,
            in that order; the text generator is a ``google/flan-t5-large``
            ``text2text-generation`` pipeline.
        """
        # BERTopic substitutes the upper-case tokens [KEYWORDS] and
        # [DOCUMENTS]; any other spelling is sent to the generator verbatim.
        prompt = """I have a topic described by the following keywords: [KEYWORDS] and  [DOCUMENTS]

                    Based on the previous keywords, what is this topic about?"""

        # Create your representation model
        generator = pipeline('text2text-generation',
                                model='google/flan-t5-large')
        representation_model_text_generation = TextGeneration(
            generator, prompt=prompt)
        representation_model_keybert = KeyBERTInspired()
        representation_model_mmm = MaximalMarginalRelevance(diversity=0.25)
        representation_model = [representation_model_mmm, representation_model_keybert,
                                representation_model_text_generation]
        return representation_model

    def topic_model(self):
        """Assemble an unfitted ``BERTopic`` instance with this configuration.

        Returns:
            The configured ``BERTopic`` object.
        """
        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
        # CPU implementations of UMAP and HDBSCAN (the cuml imports at the
        # top of the cell are disabled)
        umap_model = umap.UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
        hdbscan_model = HDBSCAN(
            min_samples=10, gen_min_span_tree=True, prediction_data=True)

        # The embedding model is given by name. A tokenizer object is not an
        # embedding backend, so passing self.tokenizer here would not make
        # BERTopic embed the documents with the e5 model.
        topic_model = BERTopic("english",
                                embedding_model=self.EMBEDDING_MODEL_NAME,
                                verbose=True,
                                nr_topics=self.nr_topics,
                                top_n_words=25,
                                representation_model=self.get_representation_model(),
                                vectorizer_model=self.vectorizer_model,
                                ctfidf_model=ctfidf_model,
                                umap_model=umap_model,
                                hdbscan_model=hdbscan_model
                                )
        return topic_model


    def run(self, input_path, output_path1, output_path2, column, reduce_outliers=True,
        strategy="embeddings"):
        """Fit the topic model on a CSV column and export the results.

        Args:
            input_path: CSV file to read.
            output_path1: Destination CSV for ``get_topic_info()``.
            output_path2: Destination CSV for ``get_document_info()``.
            column: Name of the text column; rows where it is null are dropped.
            reduce_outliers: If true, call ``reduce_outliers`` and
                ``update_topics`` after fitting.
            strategy: Strategy name forwarded to ``reduce_outliers``.
        """
        df = pd.read_csv(input_path)
        df, documents = self.clean(df=df, column=column)
        model = self.topic_model()
        # The second return value of fit_transform is not needed here
        topics, _ = model.fit_transform(documents=documents)
        if reduce_outliers:
            # Reduce outliers using the requested strategy
            print("Running outlier reduction")
            reduced_topics = model.reduce_outliers(
                documents, topics, strategy=strategy)
            # Refresh the topic representations for the new assignments
            model.update_topics(
                documents, topics=reduced_topics, vectorizer_model=self.vectorizer_model)
        topic_info = model.get_topic_info()
        topic_info.to_csv(output_path1, index=False)
        document_info = model.get_document_info(documents)
        document_info.to_csv(output_path2, index=False)


# Settings for the topic-modelling run
nr_topics = 15  # forwarded to BertopicModel / BERTopic as nr_topics
input_data = '/kaggle/input/checkpoint3/checkpoint3.csv'
output_path1 = "text_topics.csv"      # topic table
output_path2 = "text_clustered.csv"   # per-document assignments
column = "text"

# Build the model wrapper and run the whole pipeline on the input file
topic_model = BertopicModel(nr_topics=nr_topics)
topic_model.run(
    input_path=input_data,
    output_path1=output_path1,
    output_path2=output_path2,
    column=column,
)

### TM Results

In [None]:
# NOTE: the archive handle must not be named `zip`, which would replace the
# builtin zip() in the shared notebook namespace.
with ZipFile('Data/topic_models.zip', 'r') as archive:
    # Topic modelling with review content
    # The `with` block closes the member file; no explicit close() is needed.
    with archive.open('text_topics.csv') as file:
        text_topics = pd.read_csv(file)

display(text_topics)