In [None]:
# Install the third-party packages this notebook uses into the kernel's environment.
# NOTE(review): `%shell` looks like the Google Colab magic; on plain Jupyter it may
# not exist (`!python -m ...` would be the usual equivalent) -- confirm the target runtime.
%pip install textblob
# Fetch the corpora TextBlob needs (via its own download_corpora module).
%shell python -m textblob.download_corpora
%pip install wordcloud
%pip install textatistic
%pip install spacy
# Fetch the "en_core_web_sm" pipeline that spacy.load() opens further down.
%shell python -m spacy download en_core_web_sm

# Natural Language Processing (NLP)
* NLP is the processing of a text collection
    * Text collection: a corpus (corpora for plural)
    * Examples
        * Your responses to free response questions on surveys
        * Social media posts
        * Messaging conversations
        * Books, articles, blog posts, ...
        * Etc.
* NLP is notoriously difficult because natural language lacks mathematical precision
* Thankfully, there are a lot of really great Python libraries for NLP!!!
    * Let's see some of these in action with business reviews from the [Yelp API](https://www.yelp.com/developers/documentation/v3/authentication)
    
# NLP Demo w/Yelp API

In [None]:
import json
import os

import requests

# documentation at https://www.yelp.com/developers/documentation/v3/authentication
# Read the key from the YELP_API_KEY environment variable so a real secret never
# has to be pasted into (and saved with) the notebook. When the variable is not
# set, the original placeholder is used, so editing this line by hand still works.
api_key = os.environ.get("YELP_API_KEY", "YOUR API KEY HERE")
# Sent with every request below; Yelp expects the key as a Bearer token.
headers = {"Authorization": "Bearer " + api_key}
    
def get_place_id(search_term):
    """Return the Yelp business id of the first search hit for `search_term`.

    Calls the Yelp business search endpoint
    (https://www.yelp.com/developers/documentation/v3/business_search) near a
    fixed pair of GPS coordinates, prints the id that was found, and returns it.

    Args:
        search_term: Text to search for. A "+" is treated as a space, so
            "mango+tree" and "mango tree" produce the same request.

    Returns:
        The "id" value of the first entry in the response's "businesses" list.

    Raises:
        requests.HTTPError: If Yelp answers with an error status (for example
            when the API key is missing or invalid).
        requests.Timeout: If Yelp does not answer within 10 seconds.
        IndexError: If the search returned no businesses.
    """
    url = "https://api.yelp.com/v3/businesses/search"
    params = {
        # In a query string "+" already means a space. Decoding it here lets
        # requests escape the whole term properly (it re-encodes spaces as "+"),
        # so terms containing "&", "#", etc. no longer corrupt the URL.
        "term": search_term.replace("+", " "),
        # GU GPS coords
        "latitude": "47.6670357",
        "longitude": "-117.403623",
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, params=params, timeout=10)
    # Fail loudly on 4xx/5xx instead of hitting a confusing KeyError below.
    response.raise_for_status()

    businesses = response.json()["businesses"]
    if not businesses:
        raise IndexError("no Yelp businesses found for search term: " + repr(search_term))

    place_id = businesses[0]["id"]
    print(search_term, "id:", place_id)

    return place_id
    
def get_reviews(search_term):
    """Return the review texts Yelp provides for the best match of `search_term`.

    Resolves the search term to a business id with get_place_id, then calls the
    Yelp business reviews endpoint
    (https://www.yelp.com/developers/documentation/v3/business_reviews).
    Per the original author's note, Yelp returns three reviews in its
    "default order".

    Args:
        search_term: Text to search for, passed straight to get_place_id.

    Returns:
        A list with the "text" field of every review in the response.

    Raises:
        requests.HTTPError: If Yelp answers with an error status.
        requests.Timeout: If Yelp does not answer within 10 seconds.
    """
    place_id = get_place_id(search_term)
    url = "https://api.yelp.com/v3/businesses/" + place_id + "/reviews"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of hitting a confusing KeyError below.
    response.raise_for_status()
    return [review_object["text"] for review_object in response.json()["reviews"]]

# Fetch the reviews once per business, keyed by the search term that was used.
reviews_dict = {
    search_term: get_reviews(search_term)
    for search_term in ("mango+tree", "aloha+island+grill")
}

In [None]:
# Show every fetched review under a banner line for its business.
for name, reviews in reviews_dict.items():
    print(f"*** {name} reviews ***")
    for review in reviews:
        # Tab plus a space: the exact prefix print("\t", review) produces.
        print(f"\t {review}")
    print()

## Task 1
Compute the average polarity of the reviews using the `textblob` module. Polarity measures sentiment on a scale from -1.0 (negative) to 1.0 (positive), with 0.0 being neutral. Does this seem in line with the average rating for each business?

In [None]:
from textblob import TextBlob

def compute_average_polarity(reviews):
    """Return the mean TextBlob polarity over every sentence of every review.

    Each review is split into sentences by TextBlob and the polarity of each
    sentence is averaged with equal weight, so longer reviews contribute more
    sentences to the mean.

    Args:
        reviews: Iterable of review strings.

    Returns:
        The mean sentence polarity as a float, or 0.0 (neutral) when there are
        no reviews or no sentences to score.
    """
    polarities = [
        sentence.sentiment.polarity
        for review in reviews
        for sentence in TextBlob(review).sentences
    ]
    # A business may have no reviews at all; report neutral rather than
    # crashing with ZeroDivisionError.
    if not polarities:
        return 0.0
    return sum(polarities) / len(polarities)

# Report the average sentence polarity of each business's reviews.
for place, reviews in reviews_dict.items():
    print(f"{place} average polarity: {compute_average_polarity(reviews)}")

## Task 2
Visualize the frequencies of words in both sets of business reviews with word clouds using the `wordcloud` module. Do the most frequent words in each set provide more context for your polarity result?

In [None]:
from wordcloud import WordCloud 
# wordcloud built on matplotlib
import matplotlib.pyplot as plt
def show_wordcloud(name, text):
    """Draw a word cloud of `text` in its own figure titled `name`.

    Args:
        name: Title displayed above the word cloud.
        text: Raw text handed to WordCloud.generate to build the cloud.
    """
    wc = WordCloud(colormap="prism", background_color="white")
    wc.generate(text)

    # Use an explicit Axes rather than the implicit pyplot "current figure",
    # so each call draws into a figure of its own.
    _, ax = plt.subplots()
    # Bilinear interpolation smooths the cloud image when it is scaled to fit.
    ax.imshow(wc, interpolation="bilinear")
    ax.set_title(name)
    # The axes would only show pixel coordinates of the rendered image.
    ax.set_axis_off()
    plt.show()
# One word cloud per business, built from all of its reviews joined together.
for place, reviews in reviews_dict.items():
    reviews_str = ". ".join(reviews)
    show_wordcloud(name=place, text=reviews_str)

## Task 3
Compute readability scores for each set of reviews using the `textatistic` module. Compare the Dale-Chall scores, which can be mapped to grade levels ranging from grade 4 and below up to college graduate (grade 16) and above. This score is considered to be the most reliable for a broad range of text types.

What can these scores be used for? (with the small sample size we have we can't make any definitive conclusions...)

In [None]:
from textatistic import Textatistic 

def print_readability_scores(name, text):
    """Print every statistic Textatistic reports for `text`, one per line.

    Output is a "<name> readability scores" header, then a "<stat> -> <value>"
    line for each entry of Textatistic(text).dict(), then a blank line.

    Args:
        name: Label used in the header line.
        text: Text to analyse.
    """
    print(f"{name} readability scores")
    scores = Textatistic(text).dict()
    for stat in scores:
        print(f"{stat} -> {scores[stat]}")
    print()

# Score each business on all of its reviews joined into a single text.
for place, reviews in reviews_dict.items():
    reviews_str = ". ".join(reviews)
    print_readability_scores(name=place, text=reviews_str)

## Task 4
Perform named entity recognition using `spaCy`. What dates, times, quantities, places, people, things, and organizations do people mention in their reviews of these businesses?

In [None]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

# For each business, run the pipeline over all reviews joined into one string
# and list every named entity it found together with its label.
for place, reviews in reviews_dict.items():
    print(place)
    reviews_str = ". ".join(reviews)
    document = nlp(reviews_str)
    for entity in document.ents:
        print(f"{entity.text} -> {entity.label_}")
    print()

## Task 5
Calculate document similarity using `spaCy` to determine how alike the review sets are in terms of word frequencies, writing styles, etc. Do you hypothesize there are other businesses (but still in this category) that would have reviews that are more or less similar to these? Test your theories out with the API and `spaCy` :)

In [None]:
# Build one spaCy document per business from its reviews joined into one string.
document1, document2 = (
    nlp(". ".join(reviews_dict[search_term]))
    for search_term in ("mango+tree", "aloha+island+grill")
)
print(document1.similarity(document2))
# see notes on github for how to get rid of the warning
# NOTE(review): presumably this is spaCy's warning that the small model ships
# without word vectors -- confirm against the notes.