In [1]:
import requests
from bs4 import BeautifulSoup


In [2]:
r = requests.get('https://www.yelp.com/biz/bui-nails-lounge-lincoln?rr=1')

In [3]:
r.status_code

200

In [4]:
r.text



In [5]:
soup = BeautifulSoup(r.text, 'html.parser')

In [6]:
divs = soup.findAll(class_= "margin-b2__09f24__CEMjT border-color--default__09f24__NPAKY")

In [7]:
reviews = []
for div in divs:
    p_tag = div.find('p', class_='comment__09f24__D0cxf css-qgunke')
    if p_tag:
        span_tag = p_tag.find('span', class_='raw__09f24__T4Ezm')
        if span_tag:
            reviews.append(span_tag.text)

In [8]:
reviews

["The shape itself was fine but do not get any designs it was awful and cheap looking but they charged me 85 bucks for a dip powder full set. The flowers on the nails had huge gaps between the inside and the actual petals. When I went to take them off my nails were super ruined underneath. I'm now happier wearing press on nails while they heal. The photos I attached were what I wanted vs what I got. I wanted light pearls scattered around but instead it was put so strange and the pearls faded off after handwash so it was a super gross looking by then. I was under the impression it was 50 and I figured it was fine for 50 but for 85 sorry not worth it.",
 "I wanted to try this new salon so bad since it's new to the community. But their receptionist is HORRIBLE, she has a very condescending tone & and sarcastic attitude when being told constructive criticism regarding the business. She will literally ask you if you can go on hold to immediately just put you on hold without waiting for your

## ANALYZING DATA

In [9]:
import pandas as pd
import numpy as np

In [10]:
# Putting our 'reviews' into an array so pandas can interpret better
df = pd.DataFrame(np.array(reviews), columns=['review'])

In [11]:
df.head()

Unnamed: 0,review
0,The shape itself was fine but do not get any d...
1,I wanted to try this new salon so bad since it...
2,Thinking of a pedicure? DON'T go to BUI!! I ha...
3,Update: See the argumentative response from th...
4,I give this place a zero having an appointment...


In [13]:
# This code is to see how many reviews there are
len(df['review'])

10

In [17]:
# Checking the lengths of each review along with word count
df['word_count'] = df['review'].apply(lambda x: len(x.split()))

In [18]:
df.head()

Unnamed: 0,review,word_count
0,The shape itself was fine but do not get any d...,131
1,I wanted to try this new salon so bad since it...,140
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105
3,Update: See the argumentative response from th...,104
4,I give this place a zero having an appointment...,107


In [20]:
# Counting the characters
df['char_count'] = df['review'].apply(lambda x: len (x))

In [21]:
df.head()

Unnamed: 0,review,word_count,char_count
0,The shape itself was fine but do not get any d...,131,655
1,I wanted to try this new salon so bad since it...,140,799
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587
3,Update: See the argumentative response from th...,104,594
4,I give this place a zero having an appointment...,107,553


In [22]:
# Finding the average word amount per review
def average_words(x):
    words = x.split() #this will give us the number of words
    return sum(len(word) for word in words) / len(words)

In [24]:
# Printing out the average words per review
df['average_word_length'] = df['review'].apply(lambda x:average_words(x))

In [25]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length
0,The shape itself was fine but do not get any d...,131,655,4.007634
1,I wanted to try this new salon so bad since it...,140,799,4.714286
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6
3,Update: See the argumentative response from th...,104,594,4.701923
4,I give this place a zero having an appointment...,107,553,4.17757


## COUNTING STOP WORDS

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trini\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [29]:
from nltk.corpus import stopwords

In [30]:
stop_words = stopwords.words('english')

In [32]:
len(stop_words)

179

In [34]:
# How many stop words are in our reviews?
df['stopword_count'] = df['review'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))

In [35]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count
0,The shape itself was fine but do not get any d...,131,655,4.007634,69
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46
3,Update: See the argumentative response from th...,104,594,4.701923,48
4,I give this place a zero having an appointment...,107,553,4.17757,62


In [36]:
df['stopword_rate'] = df['stopword_count'] / df['word_count']

In [37]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439


In [38]:
# sorting the stopword_rate by ascending order
df.sort_values(by='stopword_rate')

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095
9,Biggest waste of time! Took 4 hours for a mani...,69,353,4.130435,31,0.449275
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538
6,I was debating on posting this but it's still ...,458,2376,4.189956,219,0.478166
5,There are lots of nail places out there. Don't...,35,186,4.342857,17,0.485714
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5
7,Went with two other friends we asked for the s...,123,635,4.170732,64,0.520325
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718
8,I made an appointment three days in advance fo...,176,918,4.176136,95,0.539773
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439


In [39]:
df.describe()

Unnamed: 0,word_count,char_count,average_word_length,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,144.8,765.6,4.321153,72.1,0.497904
std,116.540694,601.82467,0.256887,56.048491,0.044147
min,35.0,186.0,4.007634,17.0,0.438095
25%,104.25,561.5,4.172083,46.5,0.465695
50%,115.0,614.5,4.183763,63.0,0.492857
75%,137.75,763.0,4.535714,69.75,0.525119
max,458.0,2376.0,4.714286,219.0,0.579439


## DATA CLEANING PROCESS

In [41]:
# lowercase each and every word in the review (makes it easier to remove stop words)
df['lowercase'] = df['review'].apply(lambda x: " ".join(word.lower() for word in x.split()))

In [42]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,the shape itself was fine but do not get any d...
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,i wanted to try this new salon so bad since it...
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,thinking of a pedicure? don't go to bui!! i ha...
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,update: see the argumentative response from th...
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,i give this place a zero having an appointment...


In [47]:
# Remove all punctuations 
import re
df['punctuation'] = df['lowercase'] = df['lowercase'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [48]:
df['punctuation']

0    the shape itself was fine but do not get any d...
1    i wanted to try this new salon so bad since it...
2    thinking of a pedicure dont go to bui i had a ...
3    update see the argumentative response from the...
4    i give this place a zero having an appointment...
5    there are lots of nail places out there dont p...
6    i was debating on posting this but its still t...
7    went with two other friends we asked for the s...
8    i made an appointment three days in advance fo...
9    biggest waste of time took 4 hours for a manip...
Name: punctuation, dtype: object

In [49]:
# removing stop words
from nltk.corpus import stopwords

In [50]:
stop_words = stopwords.words('english')

In [55]:
# looping through each word in our review, if the word is not in the stop word list we will return it, if not it will not return
df['stopwords'] = df['punctuation'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

In [56]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,the shape itself was fine but do not get any d...,the shape itself was fine but do not get any d...,shape fine get designs awful cheap looking cha...
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,i wanted to try this new salon so bad since it...,i wanted to try this new salon so bad since it...,wanted try new salon bad since new community r...
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,thinking of a pedicure dont go to bui i had a ...,thinking of a pedicure dont go to bui i had a ...,thinking pedicure dont go bui 12pm appointment...
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,update see the argumentative response from the...,update see the argumentative response from the...,update see argumentative response manager sayi...
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,i give this place a zero having an appointment...,i give this place a zero having an appointment...,give place zero appointment waiting call upset...


In [60]:
# creating an array of every single word of our reviews
pd.Series(" ". join(df['stopwords']).split()).value_counts()[:30] # by doing value_counts, it shows us how many times each word comes up in our reviews

nail            11
nails           10
said             9
salon            8
didnt            7
time             7
asked            6
appointment      6
dont             6
back             6
new              5
one              5
wait             5
feel             5
pedicure         5
c                5
charged          5
know             5
bad              5
two              4
go               4
receptionist     4
business         4
week             4
less             4
came             4
theres           4
ive              4
gel              4
think            4
Name: count, dtype: int64

In [69]:
other_stop_words = ['c', 'theres', 'ive', 'one', 'go', 'know', 'two', 'zero']

In [70]:
len(other_stop_words)

8

In [71]:
# Only returning words that are not in other_stop_words
df['cleanreview'] = df['stopwords'].apply(lambda x: " ".join(word for word in x.split() if word not in other_stop_words))

In [72]:
pd.Series(" ".join(df['cleanreview']).split()).value_counts()[:30]

nail            11
nails           10
said             9
salon            8
time             7
didnt            7
back             6
appointment      6
dont             6
asked            6
new              5
charged          5
bad              5
wait             5
pedicure         5
feel             5
think            4
came             4
gel              4
week             4
receptionist     4
business         4
less             4
still            4
understand       4
someone          4
also             4
looking          4
place            3
good             3
Name: count, dtype: int64

In [73]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleanreview
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,the shape itself was fine but do not get any d...,the shape itself was fine but do not get any d...,shape fine get designs awful cheap looking cha...,shape fine get designs awful cheap looking cha...
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,i wanted to try this new salon so bad since it...,i wanted to try this new salon so bad since it...,wanted try new salon bad since new community r...,wanted try new salon bad since new community r...
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,thinking of a pedicure dont go to bui i had a ...,thinking of a pedicure dont go to bui i had a ...,thinking pedicure dont go bui 12pm appointment...,thinking pedicure dont bui 12pm appointment di...
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,update see the argumentative response from the...,update see the argumentative response from the...,update see argumentative response manager sayi...,update see argumentative response manager sayi...
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,i give this place a zero having an appointment...,i give this place a zero having an appointment...,give place zero appointment waiting call upset...,give place appointment waiting call upsetting ...


## DOING LEMMATIZATION TO CONDENSE WORDS DOWN TO THEIR BASE FORMAT

In [76]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\trini\AppData\Roaming\nltk_data...


True

In [77]:
from textblob import Word

In [79]:
df['lemmatized'] = df['cleanreview'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

In [80]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleanreview,lemmatized
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,the shape itself was fine but do not get any d...,the shape itself was fine but do not get any d...,shape fine get designs awful cheap looking cha...,shape fine get designs awful cheap looking cha...,shape fine get design awful cheap looking char...
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,i wanted to try this new salon so bad since it...,i wanted to try this new salon so bad since it...,wanted try new salon bad since new community r...,wanted try new salon bad since new community r...,wanted try new salon bad since new community r...
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,thinking of a pedicure dont go to bui i had a ...,thinking of a pedicure dont go to bui i had a ...,thinking pedicure dont go bui 12pm appointment...,thinking pedicure dont bui 12pm appointment di...,thinking pedicure dont bui 12pm appointment di...
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,update see the argumentative response from the...,update see the argumentative response from the...,update see argumentative response manager sayi...,update see argumentative response manager sayi...,update see argumentative response manager sayi...
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,i give this place a zero having an appointment...,i give this place a zero having an appointment...,give place zero appointment waiting call upset...,give place appointment waiting call upsetting ...,give place appointment waiting call upsetting ...


## SENTIMENT ANALYSIS

In [81]:
from textblob import TextBlob

In [83]:
df['polarity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[0])

In [84]:
df['subjectivity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[0])

In [85]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleanreview,lemmatized,polarity,subjectivity
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,the shape itself was fine but do not get any d...,the shape itself was fine but do not get any d...,shape fine get designs awful cheap looking cha...,shape fine get designs awful cheap looking cha...,shape fine get design awful cheap looking char...,0.128571,0.128571
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,i wanted to try this new salon so bad since it...,i wanted to try this new salon so bad since it...,wanted try new salon bad since new community r...,wanted try new salon bad since new community r...,wanted try new salon bad since new community r...,0.006612,0.006612
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,thinking of a pedicure dont go to bui i had a ...,thinking of a pedicure dont go to bui i had a ...,thinking pedicure dont go bui 12pm appointment...,thinking pedicure dont bui 12pm appointment di...,thinking pedicure dont bui 12pm appointment di...,0.137273,0.137273
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,update see the argumentative response from the...,update see the argumentative response from the...,update see argumentative response manager sayi...,update see argumentative response manager sayi...,update see argumentative response manager sayi...,0.456818,0.456818
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,i give this place a zero having an appointment...,i give this place a zero having an appointment...,give place zero appointment waiting call upset...,give place appointment waiting call upsetting ...,give place appointment waiting call upsetting ...,0.2,0.2


In [91]:
# Dropping columns we don't need
df.drop(['lowercase', 'punctuation', 'stopwords', 'cleanreview', 'lemmatized'], axis=1, inplace=True)

In [93]:
df.sort_values(by='polarity')

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,polarity,subjectivity
5,There are lots of nail places out there. Don't...,35,186,4.342857,17,0.485714,-0.132727,-0.132727
8,I made an appointment three days in advance fo...,176,918,4.176136,95,0.539773,-0.054895,-0.054895
6,I was debating on posting this but it's still ...,458,2376,4.189956,219,0.478166,-0.002017,-0.002017
1,I wanted to try this new salon so bad since it...,140,799,4.714286,70,0.5,0.006612,0.006612
0,The shape itself was fine but do not get any d...,131,655,4.007634,69,0.526718,0.128571,0.128571
2,Thinking of a pedicure? DON'T go to BUI!! I ha...,105,587,4.6,46,0.438095,0.137273,0.137273
7,Went with two other friends we asked for the s...,123,635,4.170732,64,0.520325,0.16,0.16
9,Biggest waste of time! Took 4 hours for a mani...,69,353,4.130435,31,0.449275,0.2,0.2
4,I give this place a zero having an appointment...,107,553,4.17757,62,0.579439,0.2,0.2
3,Update: See the argumentative response from th...,104,594,4.701923,48,0.461538,0.456818,0.456818
