In [48]:
import pandas as pd 
import nltk
# Download the NLTK resources used throughout the notebook. Besides the VADER
# lexicon, later cells call word_tokenize (tokenizer models) and
# stopwords.words('english') (stopwords corpus); in a fresh environment those
# calls raise LookupError unless the data has been fetched. Both 'punkt' and
# 'punkt_tab' are requested because newer NLTK releases load the tokenizer
# from 'punkt_tab' while older ones use 'punkt'.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)
# Kept as the last expression so the cell still reports the lexicon download status
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\tulio/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [49]:
# Read the sampled Amazon reviews into a DataFrame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was saved into the CSV -- consider index_col=0 if it is not needed.
reviews_csv = '../../datasets/amazon_reviews_sample.csv'
df = pd.read_csv(reviews_csv)
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,score,review
0,0,1,Stuning even for the non-gamer: This sound tr...
1,1,1,The best soundtrack ever to anything.: I'm re...
2,2,1,Amazing!: This soundtrack is my favorite musi...
3,3,1,Excellent Soundtrack: I truly like this sound...
4,4,1,"Remember, Pull Your Jaw Off The Floor After H..."


In [50]:
# Check the dimensions of the loaded frame as (rows, columns)
df.shape

(10000, 3)

In [51]:
from nltk.sentiment import SentimentIntensityAnalyzer

# One analyzer instance, shared by every call to get_sentiment_score
analyzer = SentimentIntensityAnalyzer()


def get_sentiment_score(review):
    """Return the 'compound' polarity score the analyzer assigns to one review text."""
    polarity = analyzer.polarity_scores(review)
    return polarity['compound']

# Score every review and store the compound values in a new 'sentiment_score' column
review_scores = df['review'].apply(get_sentiment_score)
df['sentiment_score'] = review_scores

In [52]:
# Show each compound score next to the review text it was computed from
preview_columns = ['sentiment_score', 'review']
df[preview_columns].head()

Unnamed: 0,sentiment_score,review
0,0.9454,Stuning even for the non-gamer: This sound tr...
1,0.8957,The best soundtrack ever to anything.: I'm re...
2,0.9858,Amazing!: This soundtrack is my favorite musi...
3,0.9814,Excellent Soundtrack: I truly like this sound...
4,0.9781,"Remember, Pull Your Jaw Off The Floor After H..."


In [53]:
# Define a function to label the sentiment
def get_sentiment_label(score, neutral_threshold=0.0):
    """Map a sentiment score to 'Negative', 'Neutral' or 'Positive'.

    Args:
        score: Numeric sentiment score, where values below zero indicate
            negative sentiment and values above zero positive sentiment.
        neutral_threshold: Half-width of the band around zero that is treated
            as neutral. The default of 0.0 labels only an exact zero as
            'Neutral'; pass a small positive value (e.g. 0.05, a cut-off
            commonly suggested for VADER compound scores) so that
            near-zero scores are not forced into a polar class.

    Returns:
        'Negative' if score < -neutral_threshold, 'Neutral' if
        -neutral_threshold <= score <= neutral_threshold, otherwise 'Positive'.

    Raises:
        ValueError: If neutral_threshold is negative.
    """
    if neutral_threshold < 0:
        raise ValueError("neutral_threshold must be non-negative")
    if score < -neutral_threshold:
        return 'Negative'
    if score <= neutral_threshold:
        return 'Neutral'
    return 'Positive'

# Label every review from its 'sentiment_score', then list the reviews labelled 'Neutral'
sentiment_labels = df['sentiment_score'].apply(get_sentiment_label)
df['sentiment_label'] = sentiment_labels
is_neutral = df['sentiment_label'] == 'Neutral'
df.loc[is_neutral, ['sentiment_label', 'sentiment_score', 'review']]


Unnamed: 0,sentiment_label,sentiment_score,review
99,Neutral,0.0,"Caution!: These tracks are not the ""original""..."
176,Neutral,0.0,small didn't work: It is very small compared ...
306,Neutral,0.0,Tha One: this c.d. is off the hook. it blew t...
578,Neutral,0.0,Squeem: I ordered according to the size chart...
595,Neutral,0.0,"Pass: The Sizes run really small, It flips up..."
...,...,...,...
9793,Neutral,0.0,Incisive and indicting: A modern classic that...
9833,Neutral,0.0,Materpiece that needs an overhall: faded colo...
9934,Neutral,0.0,Desucked: If it wasnt for the suspense and th...
9967,Neutral,0.0,REVIEW - Borror & DeLong: This is a keystone ...


In [54]:
import plotly.graph_objs as go
# Count how many reviews carry each sentiment label
sentiment_counts = (
    df.groupby('sentiment_label')
    .size()
    .reset_index(name='count')
)

# One bar per label; bar height is the number of reviews with that label
label_bars = go.Bar(
    x=sentiment_counts['sentiment_label'],
    y=sentiment_counts['count'],
)
fig = go.Figure(data=label_bars)

# Title and axis captions
fig.update_layout(
    title='Sentiment Distribution',
    xaxis_title='Sentiment Label',
    yaxis_title='Number of Reviews',
)

# Render the figure
fig.show()

In [55]:
from nltk import word_tokenize, ngrams, FreqDist
from nltk.corpus import stopwords
import string

# Tokenize the text
tokenized_reviews = df['review'].apply(word_tokenize)

# Remove punctuation and convert to lowercase.
# string.punctuation is a str, so testing `word not in punctuations` is a
# substring check: it only removes tokens that happen to be a contiguous slice
# of that string and lets multi-character punctuation tokens such as '...',
# '--' or the tokenizer's `` and '' quote tokens through. Instead, drop every
# token that consists solely of punctuation characters.
punctuations = string.punctuation
punctuation_chars = set(punctuations)
reviews_no_punct = [
    [
        word.lower()
        for word in review
        if not all(char in punctuation_chars for char in word)
    ]
    for review in tokenized_reviews
]

# Remove stop words (a set keeps the membership test cheap)
stop_words = set(stopwords.words('english'))
reviews_no_stop = [[word for word in review if word not in stop_words] for review in reviews_no_punct]

# Generate unigrams
unigrams = [word for review in reviews_no_stop for word in review]

# Generate bigrams (built per review, so no bigram spans two reviews)
bigrams = [bigram for review in reviews_no_stop for bigram in ngrams(review, 2)]

# Calculate frequency distribution of unigrams and bigrams
unigram_freq = FreqDist(unigrams)
bigram_freq = FreqDist(bigrams)

# Get the top 20 most common unigrams
top_unigrams = unigram_freq.most_common(20)

# Get the top 20 most common bigrams
top_bigrams = bigram_freq.most_common(20)


In [56]:
import plotly.graph_objects as go

# Split the (word, count) pairs into the two axes of the bar chart
unigram_labels = [word for word, _ in top_unigrams]
unigram_counts = [count for _, count in top_unigrams]

fig = go.Figure(data=[go.Bar(x=unigram_labels, y=unigram_counts)])
fig.update_layout(
    title='Top 20 most common unigrams',
    xaxis_title='Unigram',
    yaxis_title='Count',
)

# Render the figure
fig.show()


In [57]:
# Join each two-word tuple into a single space-separated axis label
bigram_labels = [' '.join(pair) for pair, _ in top_bigrams]
bigram_counts = [count for _, count in top_bigrams]

fig = go.Figure(data=[go.Bar(x=bigram_labels, y=bigram_counts)])
fig.update_layout(
    title='Top 20 most common bigrams',
    xaxis_title='Bigram',
    yaxis_title='Count',
)

# Render the figure
fig.show()

In [58]:
# Collect every run of three consecutive tokens, one review at a time
trigrams = []
for review in reviews_no_stop:
    trigrams.extend(ngrams(review, 3))

# Count occurrences and keep the 20 most frequent trigrams
trigram_freq = FreqDist(trigrams)
top_trigrams = trigram_freq.most_common(20)

# Bar chart: trigram text on the x-axis, frequency on the y-axis
trigram_labels = [' '.join(gram) for gram, _ in top_trigrams]
trigram_counts = [count for _, count in top_trigrams]
fig = go.Figure(data=[go.Bar(x=trigram_labels, y=trigram_counts)])
fig.update_layout(
    title='Top 20 most common trigrams',
    xaxis_title='Trigram',
    yaxis_title='Count',
)

# Render the figure
fig.show()

In [59]:
from itertools import chain

# Create separate lists for each sentiment class, using the dataset's 'score'
# column (1 is treated as positive, 0 as negative).
# The cleaned reviews are paired with their scores by position via zip. The
# previous df['score'][i] form looked each row up by index *label* using a
# positional counter, which is slow and only correct while df keeps its
# default 0..n-1 index.
positive_reviews = [review for review, score in zip(reviews_no_stop, df['score']) if score == 1]
negative_reviews = [review for review, score in zip(reviews_no_stop, df['score']) if score == 0]

# Generate trigrams for each sentiment class
positive_trigrams = [gram for review in positive_reviews for gram in ngrams(review, 3)]
negative_trigrams = [gram for review in negative_reviews for gram in ngrams(review, 3)]

# Calculate frequency distribution of trigrams for each sentiment class
positive_trigram_freq = FreqDist(positive_trigrams)
negative_trigram_freq = FreqDist(negative_trigrams)

# Get the top 20 most common trigrams for each sentiment class
top_positive_trigrams = positive_trigram_freq.most_common(20)
top_negative_trigrams = negative_trigram_freq.most_common(20)

# Highest trigram count across both classes, used as a shared y-axis limit.
# default=0 keeps the cell from raising if neither class produced any trigram.
max_count = max(
    chain(positive_trigram_freq.values(), negative_trigram_freq.values()),
    default=0,
)
max_count


91

In [60]:
from plotly.subplots import make_subplots
# Two side-by-side panels, one per sentiment class
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Top 20 trigrams for positive reviews', 'Top 20 trigrams for negative reviews'),
)

# One bar trace per class: positive goes to column 1, negative to column 2
panels = (
    ('Positive', top_positive_trigrams),
    ('Negative', top_negative_trigrams),
)
for column, (class_name, top_grams) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=[' '.join(gram) for gram, _ in top_grams],
            y=[count for _, count in top_grams],
            name=class_name,
        ),
        row=1,
        col=column,
    )

# Give both panels the same y-axis range so bar heights are directly comparable
fig.update_yaxes(range=[0, max_count])

# Overall title and figure size, then render
fig.update_layout(title='Top 20 trigrams for each sentiment class', height=800, width=1200)
fig.show()