In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk 
import seaborn as sns

plt.style.use("ggplot")  # stylesheet: sets the colors and overall look used when plotting graphs

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Here two techniques are used: (i) VADER and (ii) the RoBERTa model (from Hugging Face). Finally, a Hugging Face pipeline is used.

In [None]:
# Number of reviews kept for the rest of the notebook; a small subset keeps the
# per-row scoring loops below fast.
N_REVIEWS = 500

# Load the review data. "Text" is the review comment analysed below and
# "Score" is the number of stars the reviewer gave.
df = pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv")

# Only the last expression of a cell is displayed, so print the shapes explicitly.
print(f"Full dataset (rows, columns): {df.shape}")

# Scale the data down by selecting only the first N_REVIEWS entries.
df = df.head(N_REVIEWS)
print(f"Subset used below (rows, columns): {df.shape}")

# df["Text"].values[0] would show the text of the first review.
df.head()

In [None]:
# Bar chart of how many reviews received each star rating, ordered by rating.
score_counts = df["Score"].value_counts().sort_index()
ax = score_counts.plot(kind="bar", title="Count of Reviews", figsize=(10, 5))
ax.set_xlabel("RATINGS STARS")

# Step 1. VADER Sentiment Scoring
We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text. (It does not account for the relationships between the words in the text.)

This uses a "bag of words" approach:
1. Stop words are removed.
2. Each word is scored, and the scores are combined into a total score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm   # used to show the progress report or bars 

# One VADER analyzer instance, reused by every scoring cell below.
sia = SentimentIntensityAnalyzer()

In [None]:
# Quick check on a single sentence (try "I am sad" for the opposite case).
sia.polarity_scores("I am happy")
# The result holds neg/neu/pos scores plus `compound`. Per the VADER documentation,
# `compound` is a normalized aggregate score in [-1, 1] that summarizes the overall
# sentiment of the sentence; it is not a plain average of the other three values.

In [None]:
# Run VADER over the whole data set, storing each review's scores under its Id.
res = {}
id_text_pairs = zip(df["Id"], df["Text"])
for myid, text in tqdm(id_text_pairs, total=len(df)):
    # `text` is the review comment to be analysed.
    res[myid] = sia.polarity_scores(text)

In [None]:
# Build a DataFrame straight from the {Id: scores} dictionary. pandas places each
# dictionary key in its own column, so transpose to get one row per review.
vaders = pd.DataFrame(res).transpose()
vaders.head()

In [None]:
# Move the review Id out of the index into an "Id" column, then attach the
# original review columns to the VADER scores.
vaders = vaders.reset_index().rename(columns={"index": "Id"})

# Name the join key explicitly: without `on`, merge joins on every column name
# the two frames share, so an accidental name clash would silently change the key.
vaders = vaders.merge(df, on="Id", how="left")

# Preview only the first rows instead of dumping the whole frame.
vaders.head()

In [None]:
# Plot the VADER results against the star rating.

# Overall (compound) score per star rating.
ax = sns.barplot(data=vaders, x="Score", y="compound")
ax.set_title("Compound score by star rating")

# One panel per individual VADER score. Each panel's column must match its title;
# the "Negative" panel plots the "neg" column.
panels = [("pos", "positive"), ("neu", "Neutral"), ("neg", "Negative")]
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
for panel_ax, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x="Score", y=column, ax=panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()  # keep the three panels' labels from overlapping
plt.show()

# RoBERTa pretrained model

In [None]:
from transformers import AutoTokenizer # taking from hugging faces
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Reuse already-trained weights instead of training a model here: load the
# pretrained checkpoint named by MODEL together with its matching tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one piece of text with the pretrained RoBERTa sentiment model.

    Args:
        example: The text to classify.
        max_length: Maximum number of tokens passed to the model; longer text
            is truncated to this length. Defaults to 512.
            NOTE(review): 512 is assumed to be this checkpoint's input limit;
            confirm against the model configuration.

    Returns:
        A dict with the keys 'roberta_neg', 'roberta_neu' and 'roberta_pos',
        holding the softmax of the model's three output scores in that order.
    """
    # Truncate explicitly: without it, text longer than the model's input limit
    # makes the forward pass raise instead of returning scores.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # First model output, first (and only) item of the batch, as a plain array.
    logits = output[0][0].detach().numpy()
    # Softmax turns the raw scores into values that sum to 1.
    probabilities = softmax(logits)
    return {
        'roberta_neg': probabilities[0],
        'roberta_neu': probabilities[1],
        'roberta_pos': probabilities[2],
    }

In [None]:
# Score every review with both models and keep the combined result per review Id.
res = {}
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    try:
        # Prefix the VADER keys so they cannot collide with the RoBERTa keys.
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        res[myid] = {**vader_scores, **polarity_scores_roberta(text)}
    except RuntimeError:
        # Report the review that could not be scored and carry on with the rest.
        print(f'Broke for id {myid}')

In [None]:
# One row per scored review: turn the {Id: scores} dictionary into a frame, move
# the Id from the index into an "Id" column, then attach the original review columns.
# The join key is named explicitly: without `on`, merge joins on every column name
# the two frames share, so an accidental name clash would silently change the key.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={"index": "Id"})
    .merge(df, on="Id", how="left")
)
results_df.columns

In [None]:
# Pairplot comparing every VADER score with every RoBERTa score, colored by star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue="Score", palette="tab10")
plt.show()

In [None]:
# The 1-star review that RoBERTa scores as most positive:
# the comment reads as positive, yet the user still gave 1 star.
one_star = results_df.query("Score==1")
one_star.sort_values("roberta_pos", ascending=False)["Text"].values[0]

In [None]:
# The 5-star review that RoBERTa scores as most negative:
# negative sentiment in the comment, yet the user gave 5 stars.
five_star = results_df.query("Score==5")
five_star.sort_values("roberta_neg", ascending=False)["Text"].values[0]

# Using Hugging Face pipelines

In [None]:
from transformers import pipeline
# Only the task name is passed here; no specific model is named.
# NOTE(review): consider passing an explicit model name so results are reproducible.
sentiment_pipeline = pipeline("sentiment-analysis")

In [None]:
# Try the pipeline on a short input; other inputs to try: "I love u", "I hate u".
sentiment_pipeline("oops")

