# Sentiment Analysis in Python

In this notebook we will be doing some sentiment analysis in Python using three different techniques:

  1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach

  2. Roberta Pretrained Model from 🤗

  3. Huggingface Pipeline

# Step 0: Read in Data and NLTK Basics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk
#nltk is natural language toolkit, used to work with human language data.

In [None]:
# Load the reviews CSV into a DataFrame.
REVIEWS_CSV = '/content/Reviews.csv'
df = pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first rows of the data as loaded.
df.head()

In [None]:
# Show the value of the 'Text' column for the first row.
df['Text'].values[0]

In [None]:
# (rows, columns) of the DataFrame before it is cut down in the next cell.
print(df.shape)

In [None]:
# Work on the first N_REVIEWS rows only (presumably to keep the scoring loops quick).
N_REVIEWS = 500
df = df.head(N_REVIEWS)

In [None]:
# Confirm the shape after keeping only the first 500 rows.
df.shape

In [None]:
# Preview the subset that the rest of the notebook works with.
df.head()

# Quick EDA

In [None]:
# Count the reviews per star rating (ordered by rating) and draw them as bars.
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()

# Basic NLTK

In [None]:
# Use the review text at index label 50 as the running example.
example = df.loc[50, 'Text']
print(example)

In [None]:
!pip install svgling

In [None]:
import nltk
import svgling

# NLTK data packages that this notebook downloads before using NLTK.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
    'words',
    'vader_lexicon',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Split the example review into word tokens, keeping punctuation.
tokens=nltk.word_tokenize(example)
# Preview the first ten tokens.
tokens[:10]

In [None]:
# Attach a part-of-speech tag to every token.
tagged=nltk.pos_tag(tokens)
# Preview the first ten tagged tokens.
tagged[:10]

In [None]:
# Group the tagged tokens into chunks based on their part-of-speech tags,
# then pretty-print the resulting structure.
entities=nltk.chunk.ne_chunk(tagged)
entities.pprint()

# Step 1: VADER Sentiment Scoring

We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.

This uses a "bag of words" approach:

*   Stop words are removed
*   Each word is scored and the scores are combined into a total score.





In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance reused by every VADER scoring call below.
sia=SentimentIntensityAnalyzer()

In [None]:
# Score a clearly positive sentence; the result tells us how positive or negative it is.
sia.polarity_scores('I am so happy!')

In [None]:
# Score a clearly negative sentence for comparison.
sia.polarity_scores('I am a loser.')

In [None]:
# Score the example review picked earlier.
sia.polarity_scores(example)

In [None]:
# Score every review with VADER, collecting the result dicts keyed by review Id.
res = {}
progress = tqdm(df.iterrows(), total=len(df))
for _, row in progress:
  res[row['Id']] = sia.polarity_scores(row['Text'])

In [None]:
# One row per review Id with its VADER scores, joined back onto the review data.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index':'Id'})
    .merge(df, how='left')
)

In [None]:
# Each row now carries the VADER sentiment scores alongside the original review columns.
vaders.head()

# Plot VADER Results

In [None]:
# Bar plot of the VADER compound score for each star rating.
ax = sns.barplot(data=vaders, x='Score', y='compound')
# Title spelling corrected (was "Compund ... Amaazon").
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [None]:
# One panel per VADER component, each plotted against the star rating.
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
for axis, (component, label) in zip(axs, panels):
  sns.barplot(data=vaders, x='Score', y=component, ax=axis)
for axis, (component, label) in zip(axs, panels):
  axis.set_title(label)
plt.tight_layout()
plt.show()

# Step 3: Roberta Pretrained Model



*   Use a model trained on a large corpus of data.
*   The transformer model accounts not only for the words but also for their context relative to other words.







In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Identifier of the pretrained sentiment model; the tokenizer and the
# sequence-classification model are both loaded from it.
# (Plain string: the original f-prefix had no placeholders to format.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Reprint the example review and its VADER scores, for comparison with the RoBERTa scores computed next.
print(example)
sia.polarity_scores(example)

In [None]:
# Score the example review with the RoBERTa model:
# tokenize -> forward pass -> softmax over the first row of the first output.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
raw_scores = output[0][0]
scores = softmax(raw_scores.detach().numpy())
score_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = dict(zip(score_names, scores))
print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
  """Return RoBERTa sentiment scores for one piece of text.

  Uses the module-level ``tokenizer`` and ``model``. The text is tokenized
  into PyTorch tensors, passed through the model, and the first row of the
  first model output is turned into probabilities with softmax.

  Args:
    example: The text to score.

  Returns:
    A dict with keys ``'roberta_neg'``, ``'roberta_neu'`` and
    ``'roberta_pos'`` holding the softmax values at positions 0, 1 and 2.
  """
  score_names = ('roberta_neg', 'roberta_neu', 'roberta_pos')
  model_inputs = tokenizer(example, return_tensors='pt')
  raw_scores = model(**model_inputs)[0][0]
  probabilities = softmax(raw_scores.detach().numpy())
  return dict(zip(score_names, probabilities))

In [None]:
# Score every review with both VADER and RoBERTa, collecting the merged
# score dicts keyed by review Id.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
  # Read the row fields before the try-block so the failure message below
  # always names the review that is currently being processed.
  text = row['Text']
  myid = row['Id']
  try:
    vader_result = sia.polarity_scores(text)
    roberta_result = polarity_scores_roberta(text)
    # VADER keys are kept unprefixed because later cells read them by
    # those names (e.g. 'neg', 'neu', 'pos').
    both = {**vader_result, **roberta_result}
    res[myid] = both
  except RuntimeError:
    # Scoring raised a RuntimeError for this review (presumably text the
    # model cannot handle, e.g. too long -- TODO confirm); skip it and go on.
    print(f'Broke for id{myid}')

In [None]:
# Combined score dict left over from the last review that was scored successfully in the loop above.
both

In [None]:
# One row per review Id with both models' scores, joined back onto the review data.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index':'Id'})
    .merge(df, how='left')
)

In [None]:
# Preview the combined VADER + RoBERTa results.
results_df.head()

# Compare Scores

In [None]:
# List the available columns; the score columns are used in the pair plot below.
results_df.columns

In [None]:
# Pairwise comparison of the VADER and RoBERTa score columns, coloured by star rating.
score_columns = [
    'neg', 'neu', 'pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(
    data=results_df,
    vars=score_columns,
    hue='Score',
    palette='tab10',
)
plt.show()

# Step 4: Review Examples

*   Positive 1-Star and Negative 5-Star Reviews

Let's look at some examples where the model scoring and review score differ the most.

In [None]:
# Text of the 1-star review with the highest RoBERTa positive score.
(
    results_df
    .query('Score==1')
    .sort_values('roberta_pos', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the 1-star review with the highest VADER positive score.
(
    results_df
    .query('Score==1')
    .sort_values('pos', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the 5-star review with the highest RoBERTa negative score.
(
    results_df
    .query('Score==5')
    .sort_values('roberta_neg', ascending=False)['Text']
    .values[0]
)

In [None]:
# Text of the 5-star review with the highest VADER negative score.
(
    results_df
    .query('Score==5')
    .sort_values('neg', ascending=False)['Text']
    .values[0]
)

# Extra: The transformers Pipeline

*  Quick & easy way to run sentiment predictions



In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for the task, so results do not change if that default changes.
# NOTE(review): this is believed to be the current default checkpoint for
# "sentiment-analysis" -- confirm against the installed transformers version.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Run the pipeline on a positive sentence.
sent_pipeline('I love sentiment analysis!')

In [None]:
# Run the pipeline on a negative sentence.
sent_pipeline('I hate coding!')