<a href="https://colab.research.google.com/github/Uvasrisa/aidTec_Sentiment_analysis-/blob/main/Sentiment_Analysis_with_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
import numpy as np
import pandas as pd

Importing Datasets

In [None]:
# Load the Amazon Alexa reviews from the tab-separated file into a DataFrame
dump = pd.read_csv('a1_AmazonAlexa_ReviewsDataset.tsv',sep='\t')

# Display the frame to inspect its columns and a sample of rows
dump

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


Data Preprocessing

In [None]:
# Keep only the review text and the star rating. Take an explicit copy so that
# `dataset` is an independent frame: later column assignments then modify it
# directly instead of a slice of `dump` (which raises SettingWithCopyWarning).
dataset = dump[['verified_reviews', 'rating']].copy()
dataset.columns = ['Review', 'Sentiment']

dataset.head()

Unnamed: 0,Review,Sentiment
0,Love my Echo!,5
1,Loved it!,5
2,"Sometimes while playing a game, you can answer...",4
3,I have had a lot of fun with this thing. My 4 ...,5
4,Music,5


In [None]:
# Creating a new column sentiment based on overall ratings
def compute_sentiments(labels):
  """Convert numeric ratings into binary sentiment labels.

  Args:
    labels: iterable of numeric ratings.

  Returns:
    A list with one entry per rating: 1 when the rating is greater than 3,
    0 when it is less than or equal to 3.

  Raises:
    ValueError: if a rating compares neither greater than nor less than or
      equal to 3 (for example NaN), so a missing rating can never silently
      inherit the sentiment computed for the previous row.
  """
  sentiments = []
  for label in labels:
    if label > 3.0:
      sentiments.append(1)
    elif label <= 3.0:
      sentiments.append(0)
    else:
      # Both comparisons are False only for unordered values such as NaN.
      raise ValueError(f"cannot derive a sentiment from rating {label!r}")
  return sentiments

In [None]:
# Replace the star ratings with 0/1 sentiment labels. assign() returns a new
# frame, so nothing is written into a slice of the original data and pandas
# does not emit SettingWithCopyWarning.
dataset = dataset.assign(Sentiment=compute_sentiments(dataset.Sentiment))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Sentiment'] = compute_sentiments(dataset.Sentiment)


In [None]:
# Confirm the Sentiment column now holds 0/1 labels instead of star ratings
dataset.head()

Unnamed: 0,Review,Sentiment
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [None]:
# Check the distribution of sentiments: how many reviews are labelled
# positive (1) versus negative (0)
dataset['Sentiment'].value_counts()

1    2741
0     409
Name: Sentiment, dtype: int64

In [None]:
# Count missing values in each column
dataset.isnull().sum()

# Both counts are zero, so there are no null values in the data

Review       0
Sentiment    0
dtype: int64

Data Transformation

In [None]:
# Features are the raw review texts; targets are the 0/1 sentiment labels
x, y = dataset['Review'], dataset['Sentiment']

In [None]:
# import tokenizer_input
from b2_tokenizer_input import CustomTokenizerExample

In [None]:
# Print the source of the tokenizer module so its cleaning logic is visible here
! cat b2_tokenizer_input.py

import spacy
nlp = spacy.load('en_core_web_sm')

import string
punct = string.punctuation
# punct

from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS) # list of stopwords

class CustomTokenizerExample():
    def __init__(self):
        pass

    def text_data_cleaning(self,sentence):
        doc = nlp(sentence)                         # spaCy tokenize text & call doc components, in order

        tokens = [] # list of tokens
        for token in doc:
            if token.lemma_ != "-PRON-":
                temp = token.lemma_.lower().strip()
            else:
              temp = token.lower_
            tokens.append(temp)

        cleaned_tokens = []
        for token in tokens:
            if token not in stopwords and token not in punct:
                cleaned_tokens.append(token)
        return cleaned_tokens

In [None]:
# Quick sanity check of the tokenizer on one sentence: it should return
# lower-cased lemmas with stop words and punctuation removed
token = CustomTokenizerExample()
token.text_data_cleaning("Those were the best days of my life!")

['good', 'day', 'life']

Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Use the custom cleaning method as the tokenizer, so the TF-IDF features are
# built from the tokens returned by text_data_cleaning
tfidf = TfidfVectorizer(tokenizer=token.text_data_cleaning)

Train the model

Train / Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. The split is stratified on the
# sentiment labels and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=dataset['Sentiment'],
    random_state=0,
)

In [None]:
# 2520 samples in the training set and 630 in the test set
x_train.shape, x_test.shape

((2520,), (630,))

Fit the model on x_train and y_train

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [None]:
# Linear support-vector classifier. The seed is fixed (matching the seed used
# for the train/test split) so that refitting gives repeatable results.
classifier = LinearSVC(random_state=0)

In [None]:
# Chain the two steps so raw text can be passed in directly:
# TF-IDF vectorization runs first, then the classifier
pipeline = Pipeline([('tfidf',tfidf), ('clf',classifier)])

In [None]:
# Fit the vectorizer and the classifier on the training reviews and labels
pipeline.fit(x_train, y_train)



Check Model Performance

In [None]:
# Predict sentiment labels for the held-out test reviews
y_pred = pipeline.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
# confusion_matrix: rows are the true classes (0, 1), columns the predicted classes
confusion_matrix(y_test, y_pred)

# Overall accuracy is about 91%, but more than half of the negative reviews
# are predicted as positive, so accuracy alone overstates the model's quality

array([[ 37,  45],
       [ 11, 537]])

In [None]:
# classification_report: per-class precision, recall and F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.45      0.57        82
           1       0.92      0.98      0.95       548

    accuracy                           0.91       630
   macro avg       0.85      0.72      0.76       630
weighted avg       0.90      0.91      0.90       630



In [None]:
# round(accuracy_score(y_test, y_pred)*100,2)

Model Serialization

In [None]:
import joblib
# Persist the fitted pipeline (vectorizer + classifier) to disk.
# NOTE(review): the vectorizer's tokenizer is a method of CustomTokenizerExample,
# so b2_tokenizer_input presumably has to be importable wherever this file is
# loaded back — confirm before deploying the pickle on its own.
joblib.dump(pipeline,'c1_SentimentAnalysis_Model_Pipeline.pkl')

['c1_SentimentAnalysis_Model_Pipeline.pkl']

Predict Sentiments using model

Simple way

In [None]:
# Classify one hand-written review; predict() expects a list of texts
prediction = pipeline.predict(["Alexa is good"])

# The single predicted label decides which message is shown
message = (
  "Result: This review is positive"
  if prediction[0] == 1
  else "Result: This review is negative"
)
print(message)

Result: This review is positive


Fancy way

In [None]:
# Interactive session: keep classifying typed reviews until the user types 'skip'.
# Every classified review and its predicted label are collected in these lists.
new_review = []
pred_sentiment = []

while True:

  # ask for a new amazon alexa review
  review = input("Please type an Alexa review - ")
  command = review.strip()

  # 'skip' ends the session; surrounding spaces and letter case are ignored
  if command.lower() == 'skip':
    print("See you soon!")
    break

  # a blank line has no text to classify, so ask again instead of recording
  # a meaningless prediction
  if not command:
    continue

  # predict() expects a list of texts and returns one label per text
  prediction = pipeline.predict([review])

  if prediction[0] == 1:
    result = 'Positive'
    print("Result: This review is positive\n")
  else:
    result = 'Negative'
    print("Result: This review is negative\n")

  # record the review together with its predicted sentiment
  new_review.append(review)
  pred_sentiment.append(result)

In [None]:
# Collect the session's reviews and their predicted labels into one table
summary_columns = {
    'New Review': new_review,
    'Sentiment': pred_sentiment,
}
Results_Summary = pd.DataFrame(summary_columns)

# Save the table as a tab-separated file without the index column, then display it
Results_Summary.to_csv("./c2_Predicted_Sentiments.tsv", sep='\t', encoding='UTF-8', index=False)
Results_Summary