<a href="https://colab.research.google.com/github/christophermalone/DSCI325/blob/main/NLP_DataPrep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP - Data Prep

In [25]:
# Import pandas for tabular data handling
import pandas as pd

# Read the "Poor" Amazon reviews CSV into a DataFrame.
# To work with the "Good" file instead, swap the path for
# '/content/sample_data/AmazonReviews_Good_Final.csv'.
Reviews = pd.read_csv(
    '/content/sample_data/AmazonReviews_Poor_Final.csv',
    sep=',',
)
Reviews.head()

Unnamed: 0,id,profileName,date,review,rating,helpful
0,R3MZRW67QAA2ZG,Get-a-long Gang,"Reviewed in the United States on May 9, 2018",it’s sprays green however it’s green liquid no...,1,609
1,R21KMZ4ZOZHSSA,Vincent G. Baker,"Reviewed in the United States on April 30, 2016",Complete waste of time and money,1,1
2,R1GYHC9PRPS419,Kenny,"Reviewed in the United States on July 24, 2016",Garbage. Do not buy! Doesn't work and complete...,1,816
3,RM6O1US7MGXUC,Amazon Customer,"Reviewed in the United States on May 30, 2017",One Star,1,469
4,R2WYXNIEZHZN9,Harold,"Reviewed in the United States on May 28, 2018","Waste of money, The commercial is false advert...",1,323


Getting rid of all non-alphabetic characters

In [26]:
import re

# Define a function to clean the text
def clean(text):
    """Replace every run of non-alphabetic characters in `text` with one space.

    Parameters
    ----------
    text : object
        The raw review. Non-string values are converted with ``str()``;
        missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The text with each run of characters outside ``A-Za-z`` collapsed to
        a single space, or ``''`` when the review is missing.
    """
    # A missing review would otherwise be stringified to the literal word
    # 'None' / 'nan' and then analysed as if the customer had written it.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Removes all special characters and digits, leaving only letters
    return re.sub('[^A-Za-z]+', ' ', str(text))

# Run clean() over every entry of the review column and keep the result
# alongside the original text in a new CleanedReviews column
cleaned_reviews = Reviews['review'].map(clean)
Reviews['CleanedReviews'] = cleaned_reviews
Reviews.head()

Unnamed: 0,id,profileName,date,review,rating,helpful,CleanedReviews
0,R3MZRW67QAA2ZG,Get-a-long Gang,"Reviewed in the United States on May 9, 2018",it’s sprays green however it’s green liquid no...,1,609,it s sprays green however it s green liquid no...
1,R21KMZ4ZOZHSSA,Vincent G. Baker,"Reviewed in the United States on April 30, 2016",Complete waste of time and money,1,1,Complete waste of time and money
2,R1GYHC9PRPS419,Kenny,"Reviewed in the United States on July 24, 2016",Garbage. Do not buy! Doesn't work and complete...,1,816,Garbage Do not buy Doesn t work and completely...
3,RM6O1US7MGXUC,Amazon Customer,"Reviewed in the United States on May 30, 2017",One Star,1,469,One Star
4,R2WYXNIEZHZN9,Harold,"Reviewed in the United States on May 28, 2018","Waste of money, The commercial is false advert...",1,323,Waste of money The commercial is false adverti...


Tokenization, POS tagging, and stopword removal

In [27]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the tokenizer, the stopword filter,
# the WordNet lemmatizer and the POS tagger (same order as they are needed)
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# POS tagger dictionary: first letter of the tag produced by pos_tag ->
# the matching WordNet part-of-speech constant
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}

# Holds the English stopword set after its first use so the corpus is read
# once per session instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def token_stop_pos(text):
    """Tokenize `text`, POS-tag it, and drop English stopwords.

    Parameters
    ----------
    text : str
        The cleaned review text.

    Returns
    -------
    list of tuple
        One ``(word, pos)`` pair per remaining token, in original order.
        ``pos`` is the value looked up in ``pos_dict`` from the first letter
        of the token's tag, or ``None`` when that letter is not in ``pos_dict``.
    """
    stop_words = _english_stopwords()
    # Tagging happens on the full token list *before* stopwords are removed,
    # so the tagger still sees the surrounding words.
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
        if word.lower() not in stop_words
    ]

# Tokenize, tag and stopword-filter each cleaned review; the list of
# (word, pos) pairs for every row is stored in the POS_Tagged column
pos_tagged = Reviews['CleanedReviews'].map(token_stop_pos)
Reviews['POS_Tagged'] = pos_tagged
Reviews.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,id,profileName,date,review,rating,helpful,CleanedReviews,POS_Tagged
0,R3MZRW67QAA2ZG,Get-a-long Gang,"Reviewed in the United States on May 9, 2018",it’s sprays green however it’s green liquid no...,1,609,it s sprays green however it s green liquid no...,"[(sprays, n), (green, a), (however, r), (green..."
1,R21KMZ4ZOZHSSA,Vincent G. Baker,"Reviewed in the United States on April 30, 2016",Complete waste of time and money,1,1,Complete waste of time and money,"[(Complete, a), (waste, n), (time, n), (money,..."
2,R1GYHC9PRPS419,Kenny,"Reviewed in the United States on July 24, 2016",Garbage. Do not buy! Doesn't work and complete...,1,816,Garbage Do not buy Doesn t work and completely...,"[(Garbage, n), (buy, v), (work, n), (completel..."
3,RM6O1US7MGXUC,Amazon Customer,"Reviewed in the United States on May 30, 2017",One Star,1,469,One Star,"[(One, None), (Star, n)]"
4,R2WYXNIEZHZN9,Harold,"Reviewed in the United States on May 28, 2018","Waste of money, The commercial is false advert...",1,323,Waste of money The commercial is false adverti...,"[(Waste, n), (money, n), (commercial, n), (fal..."


In [28]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; lemmatize() below calls it for every
# word that has a part-of-speech value.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    """Join the lemmas of the (word, pos) pairs in `pos_data` into one string.

    A word whose pos is falsy (e.g. ``None``) is kept unchanged; every other
    word is passed through ``wordnet_lemmatizer.lemmatize`` with that pos.

    The returned string starts with a single space and each lemma is preceded
    by one more space, so a non-empty result begins with two spaces and an
    empty input yields ``" "``.
    """
    lemmas = [
        token if not wn_pos else wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
        for token, wn_pos in pos_data
    ]
    return " " + "".join(" " + lemma for lemma in lemmas)
    
# Collapse each row's (word, pos) pairs into a single lemmatized string
lemma_text = Reviews['POS_Tagged'].map(lemmatize)
Reviews['Lemma'] = lemma_text
Reviews.head()

Unnamed: 0,id,profileName,date,review,rating,helpful,CleanedReviews,POS_Tagged,Lemma
0,R3MZRW67QAA2ZG,Get-a-long Gang,"Reviewed in the United States on May 9, 2018",it’s sprays green however it’s green liquid no...,1,609,it s sprays green however it s green liquid no...,"[(sprays, n), (green, a), (however, r), (green...",spray green however green liquid foam Mousse...
1,R21KMZ4ZOZHSSA,Vincent G. Baker,"Reviewed in the United States on April 30, 2016",Complete waste of time and money,1,1,Complete waste of time and money,"[(Complete, a), (waste, n), (time, n), (money,...",Complete waste time money
2,R1GYHC9PRPS419,Kenny,"Reviewed in the United States on July 24, 2016",Garbage. Do not buy! Doesn't work and complete...,1,816,Garbage Do not buy Doesn t work and completely...,"[(Garbage, n), (buy, v), (work, n), (completel...",Garbage buy work completely break apart st use
3,RM6O1US7MGXUC,Amazon Customer,"Reviewed in the United States on May 30, 2017",One Star,1,469,One Star,"[(One, None), (Star, n)]",One Star
4,R2WYXNIEZHZN9,Harold,"Reviewed in the United States on May 28, 2018","Waste of money, The commercial is false advert...",1,323,Waste of money The commercial is false adverti...,"[(Waste, n), (money, n), (commercial, n), (fal...",Waste money commercial false advertising opi...


## Sentiment via TextBlob

In [29]:
from textblob import TextBlob

# Subjectivity score of a review, as computed by TextBlob
def getSubjectivity(review):
    """Return ``TextBlob(review).sentiment.subjectivity`` for the given text."""
    blob = TextBlob(review)
    return blob.sentiment.subjectivity

# Polarity score of a review, as computed by TextBlob
def getPolarity(review):
    """Return ``TextBlob(review).sentiment.polarity`` for the given text."""
    blob = TextBlob(review)
    return blob.sentiment.polarity

# Turn a numeric polarity score into a sentiment label
def analysis(score):
    """Label a score: 'Negative' below zero, 'Neutral' at exactly zero, else 'Positive'."""
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

In [38]:
# Independent frame holding only the columns needed for sentiment scoring
fin_data = Reviews[['id', 'review', 'helpful', 'Lemma']].copy()

In [39]:
# Score each lemmatized review with getPolarity, then label the score with
# analysis(). Subjectivity (getSubjectivity) is intentionally not computed here.
polarity = fin_data['Lemma'].map(getPolarity)
fin_data['Polarity'] = polarity
fin_data['Analysis'] = polarity.map(analysis)
fin_data.head()

Unnamed: 0,id,review,helpful,Lemma,Polarity,Analysis
0,R3MZRW67QAA2ZG,it’s sprays green however it’s green liquid no...,609,spray green however green liquid foam Mousse...,-0.1625,Negative
1,R21KMZ4ZOZHSSA,Complete waste of time and money,1,Complete waste time money,-0.05,Negative
2,R1GYHC9PRPS419,Garbage. Do not buy! Doesn't work and complete...,816,Garbage buy work completely break apart st use,0.1,Positive
3,RM6O1US7MGXUC,One Star,469,One Star,0.0,Neutral
4,R2WYXNIEZHZN9,"Waste of money, The commercial is false advert...",323,Waste money commercial false advertising opi...,-0.2,Negative


In [32]:
# Number of reviews carrying each sentiment label
tb_counts = fin_data['Analysis'].value_counts()
tb_counts

Neutral     2070
Negative     869
Positive     525
Name: Analysis, dtype: int64

In [35]:
pip install dfply

Collecting dfply
  Downloading dfply-0.3.3-py3-none-any.whl (612 kB)
[?25l[K     |▌                               | 10 kB 23.9 MB/s eta 0:00:01[K     |█                               | 20 kB 25.4 MB/s eta 0:00:01[K     |█▋                              | 30 kB 30.1 MB/s eta 0:00:01[K     |██▏                             | 40 kB 24.6 MB/s eta 0:00:01[K     |██▊                             | 51 kB 9.1 MB/s eta 0:00:01[K     |███▏                            | 61 kB 10.3 MB/s eta 0:00:01[K     |███▊                            | 71 kB 9.0 MB/s eta 0:00:01[K     |████▎                           | 81 kB 9.8 MB/s eta 0:00:01[K     |████▉                           | 92 kB 10.8 MB/s eta 0:00:01[K     |█████▍                          | 102 kB 8.7 MB/s eta 0:00:01[K     |█████▉                          | 112 kB 8.7 MB/s eta 0:00:01[K     |██████▍                         | 122 kB 8.7 MB/s eta 0:00:01[K     |███████                         | 133 kB 8.7 MB/s eta 0:00:01[K 

In [36]:
from dfply import *

In [60]:
# Quick check: string concatenation of the review id and its lemmatized text
fin_data['id'] + fin_data['Lemma']

0       R3MZRW67QAA2ZG  spray green however green liqu...
1               R21KMZ4ZOZHSSA  Complete waste time money
2       R1GYHC9PRPS419  Garbage buy work completely br...
3                                 RM6O1US7MGXUC  One Star
4       R2WYXNIEZHZN9  Waste money commercial false ad...
                              ...                        
3459    RJ5TSBUDZQD7C  Hydro mousse liquido prato dana...
3460                           R3OJSBIZS92NJZ  Incompleto
3461                             R2DB7BYN5LXRSV  One Star
3462                         R2VY57BGVFO1C0  Tr satisfait
3463                           R2ZAH7BU4ZUKFR  Five Stars
Length: 3464, dtype: object

In [78]:
#Piping in dfply and using filter_by() to grab requested rows.
# Builds fin_data_negative from fin_data in four piped steps:
#   1. arrange by Polarity (presumably ascending, dfply's default -- confirm)
#   2. keep rows whose Polarity is below -0.50
#   3. of those, keep rows whose helpful value is above 50
#   4. add a WebLink column: an HTML <ul><li> snippet whose URL ends with the row's id
fin_data_negative = (
                      fin_data
                      >> arrange(X.Polarity)
                      >> filter_by(X.Polarity < -0.50)
                      >> filter_by(X.helpful > 50)
                      >> mutate(WebLink = '<ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/' + X.id + '</li></ul>')
                    )

# Concatenate every WebLink snippet into one HTML string (displayed as the cell output)
''.join(map(str, fin_data_negative.WebLink.tolist()))


'<ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R3H1AQ361Z6ABI</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R3BR720UB2EGXU</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R2ZVZP5SJ1L70P</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R18YFZNUOUP0Z5</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/RD2RB62PUZ2OH</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R29MGPQNZ896S7</li></ul><ul><li>Please address the following review: https://www.amazon.com/gp/customer-reviews/R20GW7W6D93UYL</li></ul>'

In [82]:
#Source:  https://towardsdatascience.com/automate-sending-emails-with-gmail-in-python-449cc0c3c317

!pip install yagmail
import yagmail

user = 'mystatteacher@gmail.com'
app_password = 'rqrhdngcyiquuank' # a token for gmail
to = 'sbergen@winona.edu'

subject = 'Test Email - Python'
#content = ['Please address with following review: https://www.amazon.com/gp/customer-reviews/R1GJUJSPBI0OVM']
content = 'Here are the your Amazon reviews that require your attention:<br><br>' + ''.join(map(str, fin_data_negative.WebLink.tolist()))


with yagmail.SMTP(user, app_password) as yag:
    yag.send(to, subject, content)
    print('Sent email successfully')


Sent email successfully


https://www.amazon.com/gp/customer-reviews/R21KMZ4ZOZHSSA

https://www.amazon.com/gp/customer-reviews/R1GJUJSPBI0OVM



