# NLP Sentiment Analysis

---

## Required Dependencies

In [3]:
# Optional one-time setup: uncomment a line to install a package this notebook
# imports. The lines are left commented so a normal run does not reinstall anything.
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn
# !pip install nltk

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


## Environment Setup

In [6]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
# Fetch the NLTK stop-word corpus only when it is not already installed locally.
# nltk.data.find raises LookupError for a missing resource, so an offline re-run
# succeeds once the corpus has been downloaded.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.download returns False (it does not raise) when the download fails.
    if not nltk.download('stopwords'):
        print('WARNING: NLTK stopwords are not available locally and the download '
              'failed; connect to the internet and re-run this cell.')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

## Import Data

### Import JSON Files provided by [![yelp.com](https://raw.githubusercontent.com/tbgrun/tbgrun.github.io/main/assets/analyses/nlp_sentiment_analysis/yelp_badge.png)](https://www.yelp.com) through the [Yelp Open Dataset](https://www.yelp.com/dataset)

In [7]:
# Load the business and review tables of the Yelp Open Dataset.
# lines=True: each file is read as one JSON object per line.
business_raw = pd.read_json(
    'yelp_dataset/yelp_academic_dataset_business.json',
    lines=True,
)
review_raw = pd.read_json(
    'yelp_dataset/yelp_academic_dataset_review.json',
    lines=True,
)

### Data Extraction
* transform date into usable format
* drop unnecessary columns

In [68]:
# --- Business table -------------------------------------------------------
# Preview the raw table, then drop the columns the analysis does not use.
display(business_raw.head())
business_dropped = business_raw.drop(
    columns=['latitude', 'longitude', 'review_count', 'categories']
)

# NOTE: the rating column is called 'stars' in both tables at this point. The
# former renames of 'stars_x' / 'stars_y' never matched a column (rename ignores
# unknown keys by default) and were removed. The suffixed names only appear
# after the merge on 'business_id': 'stars_x' is the business's overall rating,
# 'stars_y' the rating given in the individual review.

# --- Review table ---------------------------------------------------------
# Derive month/year from the review date without modifying review_raw, so this
# cell can be re-run safely and the raw frame stays as loaded.
display(review_raw.head())
review_dates = pd.to_datetime(review_raw['date'])
review_dropped = review_raw.drop(columns=['funny', 'cool', 'date'])
review_dropped['month'] = review_dates.dt.month
review_dropped['year'] = review_dates.dt.year

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

### Merge Data
* merge data frames into a single data frame

In [9]:
# Attach each review to its business record. The inner join keeps only
# business_id values that occur in both tables.
df_merged = business_dropped.merge(
    review_dropped,
    how='inner',
    on='business_id',
)

### Select Data
* Business: First Watch
* Location: Florida (State)
* Review dates: June 2019 through September 2019

#### Spell check 'First Watch'

In [10]:
# Compare exact matches of the name with case-insensitive whole-phrase matches.
# Equal counts mean no differently spelled or extended variants exist.
business_names = df_merged['name']
exact_matches = business_names.eq('First Watch')
loose_matches = business_names.str.contains(r'\bfirst watch\b', case=False, na=False)

check_for_firstwatch = bool(exact_matches.any())
count_firstwatch = exact_matches.sum()
count_firstwatch_variants = loose_matches.sum()

print(f'data frame contains First Watch: {check_for_firstwatch}')
print(f'no variants of "First Watch" found: {count_firstwatch == count_firstwatch_variants}')

data frame contains First Watch: True
no variants of "First Watch" found: True


#### Data extraction

In [11]:
# Keep only reviews of First Watch locations in Florida written between June
# and September 2019 (between() is inclusive on both ends).
is_first_watch_fl = (df_merged['state'] == 'FL') & (df_merged['name'] == 'First Watch')
in_review_window = (df_merged['year'] == 2019) & df_merged['month'].between(6, 9)

# .copy() makes df_raw an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df_raw = df_merged[is_first_watch_fl & in_review_window].copy()

## EDA and Data Wrangling

In [12]:
# Summarise the selected subset: row count, columns, non-null counts and dtypes.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 105643 to 6984780
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   business_id  244 non-null    object 
 1   name         244 non-null    object 
 2   address      244 non-null    object 
 3   city         244 non-null    object 
 4   state        244 non-null    object 
 5   postal_code  244 non-null    object 
 6   stars_x      244 non-null    float64
 7   is_open      244 non-null    int64  
 8   attributes   244 non-null    object 
 9   hours        244 non-null    object 
 10  review_id    244 non-null    object 
 11  user_id      244 non-null    object 
 12  stars_y      244 non-null    int64  
 13  useful       244 non-null    int64  
 14  text         244 non-null    object 
 15  month        244 non-null    int32  
 16  year         244 non-null    int32  
dtypes: float64(1), int32(2), int64(3), object(11)
memory usage: 32.4+ KB


In [21]:
# Duplicate-review check: the number of distinct review texts must equal the
# number of rows in the selection.
n_rows = len(df_raw)
n_unique_texts = df_raw['text'].nunique()
print(f'only unique text entries in the data frame: {n_unique_texts == n_rows}')

only unique text entries in the data frame: True


In [33]:
# Number of selected reviews per city, most frequent first.
df_raw['city'].value_counts()

city
Tampa              48
St. Petersburg     42
Wesley Chapel      27
Riverview          24
Clearwater         22
Lutz               20
Oldsmar            15
New Port Richey    12
Brandon            12
Palm Harbor        11
Largo              11
Name: count, dtype: int64

In [65]:

def _median_abs_deviation(stars):
    """Return the median absolute deviation of a rating series around its median."""
    return (stars - stars.median()).abs().median()


# Per-location summary of the review ratings: range, median and spread (MAD).
ratings = (
    df_raw
    .groupby('business_id')['stars_y']
    .agg(
        min_rating='min',
        max_rating='max',
        median_rating='median',
        mad=_median_abs_deviation,
    )
    .reset_index()
)
ratings

Unnamed: 0,business_id,min_rating,max_rating,median_rating,mad
0,1u2IlvwOMndfuRk9EVcieQ,2,5,5.0,0.0
1,Awkrb1wzFxz5w1Ixc9W3FA,2,5,4.0,1.0
2,FMUvCniLe8m8BHSYUsnH2w,2,5,5.0,0.0
3,I7SkoqN88fpKagzKA059Fw,1,5,5.0,0.0
4,IDU-TR01EtRLCMvI_CcOyQ,1,5,5.0,0.0
5,QdUbpCOvFbYb_RzlPxOonQ,1,5,4.0,1.0
6,RhBsbZ8N_bSH-jD9X9FFZg,2,5,5.0,0.0
7,Scd-rcsQCn60t1sHHFv-og,2,5,5.0,0.0
8,VX7GkJDnhPG8NmrdO1V-6Q,2,5,4.5,0.5
9,VgPDZyDLIWe4NVetnvjqig,1,5,4.5,0.5


In [None]:
df