In [1]:
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import words, stopwords
from nltk.corpus import brown
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from bs4 import BeautifulSoup
from collections import Counter
import requests
import os
import time
import pandas as pd

# Homework 5

### Compile a list of static links (permalinks) to individual user movie reviews from one particular website. This will be your working dataset for this assignment, as well as for assignments 7 and 8, which together will make up your semester project.   

### It does not matter if you use a crawler or if you manually collect the links, but you will need at least 100 movie review links. Note that, as of this writing, the robots.txt file of IMDB.com allows the crawling of user reviews.

### Each link should be to a web page that has only one user review of only one movie, e.g., the user review permalinks on the IMDB site.

### Choose reviews of movies that are all in the same genre, e.g., sci-fi, mystery, romance, superhero, etc.  

### Make sure your collection includes reviews of several movies in your chosen genre and that it includes a mix of negative and positive reviews.  

In [4]:
class PageScraper:
    def __init__(self, url='https://www.rogerebert.com/reviews/page/2'):
        self.urlbase = url
        self.download_template = "https://www.rogerebert.com/reviews/{}"
        self.folder = "reviews"
        if self.folder not in os.listdir():
            os.mkdir(self.folder)
        self.pages = {}
        
    def get_page_names(self):
        if "base_page" not in os.listdir(self.folder):
            main_page = str(requests.get(self.urlbase).text)
        else:
            main_page = open(self.folder + "/base_page")
        page_soup = BeautifulSoup(main_page, 'html.parser')
        
#         for i in page_soup.find_all("figcaption"):
        for i in page_soup.find_all("figure", attrs={'class': 'movie review'}):
            try:
                href = i.find_all('a')[1]['href'].split('/')[2]
                rating = len(i.find_all("i", attrs={'class': 'icon-star-full'})) + .5 *\
                                    len(i.find_all("i", attrs={'class': 'icon-star-half'}))
                self.pages["{}__{}".format(href, rating)] = \
                                self.download_template.format(href)
            except IndexError:
                continue
                
    def save_page(self, contents, bookname):
        open(self.folder + "/" + bookname, "w").write(contents)
    
    def get_all_pages(self, sleep_time=1.5):
        for pagename, pageurl in self.pages.items():
            if pagename not in os.listdir(self.folder):
                contents = str(requests.get(pageurl).text)
                self.save_page(contents, pagename)
                time.sleep(sleep_time)
            
    def check_genre(self, genre="Comedy"):
        for i in os.listdir(self.folder):
            page = open("{}/{}".format(self.folder, i)).read()
            page_soup = BeautifulSoup(page, "html.parser")
            genre_tag = page_soup.find("p", attrs={'class':'genres'})
            try:
                if genre not in genre_tag.getText():
                    os.remove("{}/{}".format(self.folder, i))
            except AttributeError as e:
                os.remove("{}/{}".format(self.folder, i))

In [3]:
Scraper = PageScraper()
Scraper.get_page_names()
Scraper.get_all_pages()
# Scraper.check_genre()

In [334]:
pg = 2

In [337]:
while True:
    Scraper = PageScraper(url='https://www.rogerebert.com/reviews/page/{}'.format(pg))
    Scraper.get_page_names()
    Scraper.get_all_pages()
    Scraper.check_genre()
    if len(os.listdir(Scraper.folder)) > 130:
        break
    else:
        pg += 1

### Extract noun phrase (NP) chunks from your reviews using the following procedure:

### In Python, use BeautifulSoup to grab the main review text from each link.  



### Next run each review text through a tokenizer, and then try to NP-chunk it with a shallow parser. 



### You probably will have too many unknown words, owing to proper names of characters, actors, and so on that are not in your working dictionary. Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again.



### Output all the chunks in a single list for each review, and submit that output for this assignment. Also submit a brief written summary of what you did (describe your selection of genre, your source of reviews, how many you collected, and by what means).

In [5]:
from nltk.tree import Tree
from nltk.chunk import conlltags2tree, tree2conlltags, ne_chunk

In [6]:
## Some code borrowed from https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
## to build a Chunker 

from nltk.corpus import conll2000

data = conll2000.chunked_sents()
train_data = data

from nltk.chunk.util import tree2conlltags, conlltags2tree

wtc = tree2conlltags(train_data[1])

def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]


def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff 

from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

# define the chunker class
class NGramTagChunker(ChunkParserI):
    
    def __init__(self, train_sentences, 
        tagger_classes=[UnigramTagger, BigramTagger]):
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        if not tagged_sentence: 
            return None
        pos_tags = [tag for word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
        wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag)
                         in zip(tagged_sentence, chunk_tags)]
        return conlltags2tree(wpc_tags)
  
# train chunker model  
ntc = NGramTagChunker(train_data)

# evaluate chunker model performance

In [8]:
## Deploy the Chunker
chunker = NGramTagChunker(train_data)

for review in os.listdir("reviews")[0]:
    noun_parts = []
#     print(i)
    page = open("reviews/{}".format(review)).read()
    page_soup = BeautifulSoup(page, "html.parser")
    item = page_soup.find("div", attrs={'itemprop':'reviewBody'})
    review_text = ' '.join(subitem.get_text().strip() for subitem in item.find_all('p')).strip()
    review_text = re.sub(" Advertisement ", '', review_text)
    for sent in sent_tokenize(review_text):
        sent = re.sub("[^a-zA-z0-9 \-\']", '', sent)
        tokens = word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        entities = chunker.parse(tagged)
        noun_part = ''
        for token in entities:
            try:
                if token.label() == "NP":
                    try:
                        noun_part += " " + tuple(token)[0]
                    except TypeError:
                        noun_part += " " + " ".join([j[0] for j in tuple(token)])
                else:
                    if len(noun_part) > 0:
                        noun_parts.append(noun_part)
                        noun_part = ''
            except AttributeError:
                if len(noun_part) > 0:
                    noun_parts.append(noun_part)
                    noun_part = ''
            print(noun_part)
    open("verb_parts/{}".format(review), 'w').write("\n".join(noun_parts))

In [9]:
## List all books which noun parts were extracted from
all_books = os.listdir("noun_parts")

In [533]:
rankings = pd.Series({book.split('__')[0]: book.split('__')[1] for book in all_books}).astype(float)

In [520]:
## Looking at the most poorly rated comedy movies
rankings.sort_values()[:10]

holmes-and-watson-2018                        0.5
flower-2018                                   0.5
show-dogs-2018                                0.5
the-last-sharknado-its-about-time-2018        1.0
sierra-burgess-is-a-loser-2018                1.0
a-happening-of-monumental-proportions-2018    1.0
arizona-2018                                  1.5
the-con-is-on-2018                            1.5
welcome-to-marwen-2018                        1.5
overboard-2018                                1.5
dtype: object

In [526]:
## Using the title "Holmes-and-watson-2018" which got a 0.5 star rating
open('noun_parts/holmes-and-watson-2018__0.5').read().split("\n")[50:65]

[' the character',
 ' Some',
 ' these films',
 ' goodBilly Wilders',
 ' ambitious The Private Life',
 ' Sherlock Holmes',
 ' the brilliant',
 ' little-seen cult classic Zero Effectand some',
 ' them such',
 ' Gene Wilders The Adventures',
 ' Sherlock Holmes Smarter Brother',
 ' the Michael Caine romp',
 ' a Clue',
 ' all',
 ' those cases good']

In [528]:
## Looking at the most highly rated Comedy Movies
rankings.sort_values(ascending=False)[:10]

let-the-sunshine-in-2018                              4.0
private-life-2018                                     4.0
the-ballad-of-buster-scruggs-2018                     4.0
blindspotting-2018                                    4.0
blackkklansman-2018                                   4.0
the-other-side-of-the-wind-2018                       4.0
a-bread-factory-part-two-walk-with-me-a-while-2018    4.0
a-bread-factory-part-one-2018                         4.0
the-old-man-and-the-gun-2018                          3.5
spider-man-into-the-spider-verse-2018                 3.5
dtype: object

In [529]:
open('noun_parts/let-the-sunshine-in-2018__4.0').read().split("\n")[50:65]

[' who-slept-with-who gossip',
 ' gallery owners',
 ' Vincent',
 ' spectacular relish',
 ' Xavier Beauvois the actor',
 ' filmmaker best',
 ' 2010s',
 ' Gods',
 ' Men',
 ' a real piece',
 ' work arrogant',
 ' end a',
 ' pedantic',
 ' whiskey snob theyre worse',
 ' wine snobs']

In [None]:
#Lets see if there's a clear distinction between the nouns used in reviews of highly rated movies vs poorly rated movies
good_movies = (rankings[rankings > 3.0].index + "__" + rankings[rankings > 3.0].astype(str)).values

In [548]:
pos_lines= []
for mov in good_movies:
    nps = open('noun_parts/{}'.format(mov)).read().strip().split("\n")
    for line in nps:
        pos_lines.append([mov, line])

pos_lines = pd.DataFrame(all_lines, columns=['movie_name', "review_line"])

In [564]:
neg_lines= []
for mov in bad_movies:
    nps = open('noun_parts/{}'.format(mov)).read().strip().split("\n")
    for line in nps:
        neg_lines.append([mov, line])

neg_lines = pd.DataFrame(all_lines, columns=['movie_name', "review_line"])

In [571]:
## Print out positive review line on the left, negative review lines on the right
for g,b in zip(pos_lines['review_line'].sample(25).values, neg_lines.sample(25)['review_line'].values):
    print(g, "  --  ", b)

 the easier task he   --    who we
 the tourists   --    aesthetics
 a collision course that   --    this attractive woman
 the Queens chamber   --    It
 Im   --    mutual aggravation
 them   --    hes gone
 a middle finger   --    There
 Both Adam   --    Elliots warm
 themand it   --    the array
 an intimate home   --    scene this
 Spacek thats impossible   --    a special breed
 the music he   --    a man
 beat   --    writer
 the climax unfolds   --    an
 a tender hand   --    Everything
 celebrity   --    more moved
 The figures   --    insects who
 the challenge   --    enormous
 the morningand   --    stylistic flourishes
 The town   --    it
 citation   --    it
 Gjonola sings My Bathroomthe song she   --    Miles
 It   --    the reality
 a social media stalker Parinaz Izadyar who   --    real people
 all the way   --    we


The positive review lines are in general much more descriptive then the negative review lines, and they also seem to be more likley to mention character names in the review.

* Since the 135 reviews included in this project are quite large, and the scraper has been optimized to avoid triggering a ddos attack warning, this code base takes quite long to run.  The html documents and review contents can be found on my github at https://github.com/tdeason416/DS337/tree/master/Week7