###### In this notebook we will be doing some sentiment analysis in Python using three different techniques:

##  1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach
##  2.Roberta Pretrained Model from 🤗
##  3.Huggingface Pipeline


# Step 1: Read in Data and NLTK Basics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

# Use the ggplot look for every figure drawn in this notebook.
plt.style.use('ggplot')

# Register an extra NLTK search directory and download the POS tagger into it.
nltk_data_dir = "C:/nltk_data"
nltk.data.path.append(nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)

# The remaining resources are downloaded without an explicit download_dir.
for resource_name in ('maxent_ne_chunker', 'words'):
    nltk.download(resource_name)
nltk.download('vader_lexicon')

In [None]:
# Load the full review table and report its size.
df = pd.read_csv('amazon.csv')
print(df.shape)

# Work with only the first 500 rows from here on, and confirm the new size.
sample_size = 500
df = df.head(n=sample_size)
print(df.shape)

In [None]:
# Preview the first rows of the loaded sample.
df.head()

## Quick EDA

In [None]:
# Count how many reviews carry each star rating, ordered by rating value.
star_counts = df['overall'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()

## Basic NLTK

In [None]:
# Use the review stored under index label 50 as the running example.
example = df.loc[50, 'reviewText']
print(example)

In [None]:
from nltk.tokenize import TreebankWordTokenizer

# Split the example review into word tokens with the Treebank tokenizer.
word_tokenizer = TreebankWordTokenizer()
word_tokens = word_tokenizer.tokenize(example)

# Show only the first ten tokens.
first_tokens = word_tokens[:10]
print(first_tokens)


In [None]:
# Tag each token of the example with its part of speech.
# `nltk` is already imported in the setup cell, so it is not re-imported here.
tagged = nltk.pos_tag(word_tokens)
print(tagged[:10])  # View first 10 tagged tokens


In [None]:
# Run NLTK's named-entity chunker over the POS-tagged tokens and pretty-print the result.
entities = nltk.ne_chunk(tagged)
entities.pprint()

# Step 2. VADER Sentiment Scoring
### We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.

#### This uses a "bag of words" approach:
####    1.Stop words are removed
####    2.each word is scored and combined to a total score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# Single analyzer instance reused by every VADER scoring cell that follows.
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a clearly positive sentence.
sia.polarity_scores('I am so happy!')

In [None]:
# Sanity check: score a clearly negative sentence.
sia.polarity_scores('This is the worst thing ever.')

In [None]:
# Score the example review selected earlier in the notebook.
sia.polarity_scores(example)

In [None]:
# Score every review with VADER; results are keyed by the review's row index.
res = {}
for i, raw_text in tqdm(df['reviewText'].items(), total=len(df)):
    # str() so that non-string cells are still passed to the analyzer as text.
    text = str(raw_text)
    res[i] = sia.polarity_scores(text)

In [None]:
# Turn the per-review score dicts into one row per review, then attach the
# original review columns by matching on the row index.
vaders = (
    pd.DataFrame(res)
    .transpose()
    .merge(df, left_index=True, right_index=True)
)

In [None]:
# Each row now holds the VADER sentiment scores together with the review metadata.
vaders.head()

# Plot VADER results

In [None]:
# Bar plot of the VADER compound score grouped by star rating.
ax = sns.barplot(data=vaders, x='overall', y='compound')
ax.set(
    title='Compound Score by Amazon Star Rating',
    xlabel='Amazon Star Rating',
    ylabel='Compound Sentiment Score',
)
plt.show()


In [None]:
# One panel per VADER component, each grouped by star rating.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

panels = (
    ('pos', 'Positive Score by Rating'),
    ('neu', 'Neutral Score by Rating'),
    ('neg', 'Negative Score by Rating'),
)
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='overall', y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()


# Step 3. Roberta Pretrained Model
### Use a model trained on a large corpus of data.
### A transformer model accounts not only for the words themselves but also for their context relative to other words.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Identifier of the pretrained checkpoint; the same id is used to load both the
# tokenizer and the classification model. (Plain string: nothing to interpolate.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Baseline for comparison: show the example review and its VADER scores.
example = df.loc[50, 'reviewText']
print(example)
sia.polarity_scores(example)

In [None]:
# Score the same example with RoBERTa: tokenize, run the model, then
# softmax-normalise the raw output scores.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())

# Output positions 0, 1, 2 are reported as negative, neutral, positive.
label_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = dict(zip(label_names, scores))
print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
    """Score one piece of text with the RoBERTa sentiment model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``softmax`` names.

    Args:
        example: The text to score.

    Returns:
        A dict with keys ``roberta_neg``, ``roberta_neu`` and ``roberta_pos``
        holding the softmax-normalised scores found at output positions
        0, 1 and 2 respectively.

    Note:
        The text is tokenized without truncation. The scoring loop in this
        notebook catches ``RuntimeError`` raised by this call and skips the
        affected review.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    encoded_text = tokenizer(example, return_tensors='pt')
    # Inference only: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [None]:
# Score every review with both models; results are keyed by row index.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = str(row['reviewText'])
    try:
        # Prefix the VADER keys so they stay distinct from the RoBERTa keys.
        vader_scores = {
            f"vader_{name}": value
            for name, value in sia.polarity_scores(text).items()
        }
        roberta_scores = polarity_scores_roberta(text)
    except RuntimeError:
        # Reviews that make scoring raise are reported and left out of `res`.
        print(f'⚠️ Skipped long or problematic review at index {i}')
        continue

    res[i] = {**vader_scores, **roberta_scores}


In [None]:
# One row per scored review, indexed by the same row labels as df.
results_df = pd.DataFrame(res).transpose()

# Place the score columns next to the review columns, aligned on the index.
final_df = pd.concat((df, results_df), axis='columns')


# Compare Scores between models

In [None]:
# List the score columns collected from the two models.
results_df.columns

# Step 3 (continued): Combine and compare

In [None]:
# Pairwise plots of the six model scores, coloured by star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=final_df, vars=score_columns, hue='overall', palette='tab10')
plt.show()

# Step 4: Review Examples:
##      Positive 1-Star and Negative 5-Star Reviews
###      Let's look at some examples where the model scoring and review score differ the most.

In [None]:
# Most positive 1-star review (according to RoBERTa).
# Select the 'reviewText' Series (not a one-column frame) so the cell shows the
# review text itself, consistent with the 5-star lookups in this notebook.
final_df.query('overall == 1') \
    .sort_values('roberta_pos', ascending=False)['reviewText'].values[0]


In [None]:
# Most positive 1-star review (according to VADER).
# Select the 'reviewText' Series (not a one-column frame) so the cell shows the
# review text itself, consistent with the 5-star lookups in this notebook.
final_df.query('overall == 1') \
    .sort_values('vader_pos', ascending=False)['reviewText'].values[0]


In [None]:
# Negative-sentiment 5-star reviews

In [None]:
# The 5-star review with the highest RoBERTa negative score.
(
    final_df.query('overall == 5')
    .sort_values('roberta_neg', ascending=False)['reviewText']
    .iloc[0]
)


In [None]:
# The 5-star review with the highest VADER negative score.
(
    final_df.query('overall == 5')
    .sort_values('vader_neg', ascending=False)['reviewText']
    .iloc[0]
)


#  Extra: The Transformers Pipeline
##        Quick & easy way to run sentiment predictions

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly. Without a `model` argument the pipeline falls
# back to a library-chosen default, so results would depend on the installed
# transformers version.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Classify a clearly positive sentence; the author's observed output is noted below.
sent_pipeline("I love sentiment analysis!")
# ➜ [{'label': 'POSITIVE', 'score': 0.999...}]


In [None]:
# Classify a short informal input; the author's observed output is noted below.
sent_pipeline("booo")
# ➜ [{'label': 'NEGATIVE', 'score': 0.998...}]


# The End