# **INFO5731 Assignment Two**

In this assignment, you will try to gather text data from open data source via web scraping or API. After that you need to clean the text data and syntactic analysis of the data.

In [1]:
from collections import Counter
from spacy import displacy
import scrapy
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word
import spacy
import nltk
# nltk.download('wordnet') # already downloaded
# load spaCy model
nlp = spacy.load("en_core_web_sm")
pd.set_option('max_colwidth', 60) # adjust column with of dataframe

# **Question 1**

(40 points). Write a python program to collect text data from **either of the following sources** and save the data into a **csv file**:

(1) Collect all the customer reviews of the product [2019 Dell labtop](https://www.amazon.com/Dell-Inspiron-5000-5570-Laptop/dp/B07N49F51N/ref=sr_1_11?crid=1IJ7UWF2F4GHH&keywords=dell%2Bxps%2B15&qid=1580173569&sprefix=dell%2Caps%2C181&sr=8-11&th=1) on amazon.

(2) Collect the top 100 User Reviews of the film [Joker](https://www.imdb.com/title/tt7286456/reviews?ref_=tt_urv) from IMDB.

(3) Collect the abstracts of the top 100 research papers by using the query [natural language processing](https://citeseerx.ist.psu.edu/search?q=natural+language+processing&submit.x=0&submit.y=0&sort=rlv&t=doc) from CiteSeerX.

(4) Collect the top 100 tweets by using hashtag ["#CovidVaccine"](https://twitter.com/hashtag/CovidVaccine) from Twitter. 


In [None]:
# Write your code here
# scraping script modified according to amazon.com

# Creating a new class to implement Spide

class AmazonReviewsSpider(scrapy.Spider):
    # Spider name
    name = 'amazon_reviews'
    # Domain names to scrape
    allowed_domains = ['amazon.com']
    # Base URL for the MacBook air reviews
    start_urls = []
    start_urls.append(
        'https://www.amazon.com/Dell-Inspiron-5000-5570-Laptop/product-reviews/B07N49F51N/ref=cm_cr_othr_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
    )
    myBaseUrl = 'https://www.amazon.com/Dell-Inspiron-5000-5570-Laptop/product-reviews/B07N49F51N/ref=cm_cr_getr_d_paging_btm_next_'
    add_on = '?ie=UTF8&reviewerType=all_reviews&pageNumber='
    # Creating list of urls to be scraped by appending page number a the end of base url
    for i in range(2, 20):
        start_urls.append(myBaseUrl + str(i) + add_on + str(i))
    # Defining a Scrapy parser

    def parse(self, response):
        data = response.css('#cm_cr-review_list')
        # Collecting product star ratings
        star_rating = data.css('.review-rating')
        # Collecting user reviews
        comments = data.css('.review-text')
        count = 0
        # Combining the results
        for review in star_rating:
            yield {
                'stars':
                ''.join(review.xpath('.//text()').extract()),
                'comment':
                ''.join(str(comments[count].xpath(".//text()").extract()))
            }
            count = count + 1
# It saves the reviews to reviews.csv file

In [2]:
df = pd.read_csv('reviews.csv', header=0, names=['rating', 'reviews']) # read the data (reviews) into a data frame
df.head()

Unnamed: 0,rating,reviews
0,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n I purchased th..."
1,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n The item came ..."
2,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n It has already..."
3,4.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n Came in evenin..."
4,2.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n This laptop wa..."


In [3]:
# check if there is any duplicated review
print(f'Number of duplicate reviews: {df[df.duplicated() == True].shape[0]}')

Number of duplicate reviews: 0


In [4]:
# clean the comments
df['reviews_clean'] = df['reviews'].apply(lambda x: x.replace(r"'", ''))
df['reviews_clean'] = df['reviews_clean'].apply(lambda x: x.replace(r'"', ''))
df['reviews_clean'] = df['reviews_clean'].apply(
    lambda x: x.replace(r'[\n\n\n\n\n\n\n\n\n\n  \n  \n    , \n  ', ''))
df['reviews_clean'] = df['reviews_clean'].apply(
    lambda x: x.replace(r'\n, \n  \n]', ''))
df.head()

Unnamed: 0,rating,reviews,reviews_clean
0,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n I purchased th...",I purchased this laptop with upgraded RAM (32GB) and SSD...
1,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n The item came ...",The item came promptly. From the beginning the touch scr...
2,1.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n It has already...",It has already had issues that require repair beyond the...
3,4.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n Came in evenin...",Came in evening around 11pm so in thankful USPS was stil...
4,2.0 out of 5 stars,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n This laptop wa...",This laptop was ok for the price. Lack of USB-C is a pro...


In [8]:
# quantify the ratings
df['rating'].replace(sorted(df['rating'].unique()), value=np.arange(1, 6), inplace=True)
df.head()

Unnamed: 0,rating,reviews,reviews_clean
0,1,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n I purchased th...",I purchased this laptop with upgraded RAM (32GB) and SSD...
1,1,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n The item came ...",The item came promptly. From the beginning the touch scr...
2,1,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n It has already...",It has already had issues that require repair beyond the...
3,4,"['\n\n\n\n\n\n\n\n\n\n \n \n ', ""\n Came in evenin...",Came in evening around 11pm so in thankful USPS was stil...
4,2,"['\n\n\n\n\n\n\n\n\n\n \n \n ', '\n This laptop wa...",This laptop was ok for the price. Lack of USB-C is a pro...


In [9]:
df_clean = df.drop('reviews', axis=1).copy()
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,I purchased this laptop with upgraded RAM (32GB) and SSD...
1,1,The item came promptly. From the beginning the touch scr...
2,1,It has already had issues that require repair beyond the...
3,4,Came in evening around 11pm so in thankful USPS was stil...
4,2,This laptop was ok for the price. Lack of USB-C is a pro...


# **Question 2**

(30 points). Write a python program to **clean the text data** you collected above and save the data in a new column in the csv file. The data cleaning steps include:

(1) Remove noise, such as special characters and punctuations.

(2) Remove numbers.

(3) Remove stopwords by using the [stopwords list](https://gist.github.com/sebleier/554280).

(4) Lowercase all texts

(5) Stemming. 

(6) Lemmatization.

In [10]:
# Write your code here
# lowercase the comments
df_clean['reviews_clean'] = df_clean['reviews_clean'].apply(
    lambda x: ' '.join([x.lower() for x in x.split()]))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,i purchased this laptop with upgraded ram (32gb) and ssd...
1,1,the item came promptly. from the beginning the touch scr...
2,1,it has already had issues that require repair beyond the...
3,4,came in evening around 11pm so in thankful usps was stil...
4,2,this laptop was ok for the price. lack of usb-c is a pro...


In [12]:
# remove punctuations
df_clean['reviews_clean'] = df_clean['reviews_clean'].str.replace('[^\w\s]', '')
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,i purchased this laptop with upgraded ram 32gb and ssd 1...
1,1,the item came promptly from the beginning the touch scre...
2,1,it has already had issues that require repair beyond the...
3,4,came in evening around 11pm so in thankful usps was stil...
4,2,this laptop was ok for the price lack of usbc is a probl...


In [13]:
# remove numbers
numbers = '0123456789'
df_clean['reviews_clean'] = df_clean['reviews_clean'].apply(
    lambda x: ''.join(x for x in list(x) if x not in numbers))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,i purchased this laptop with upgraded ram gb and ssd tb ...
1,1,the item came promptly from the beginning the touch scre...
2,1,it has already had issues that require repair beyond the...
3,4,came in evening around pm so in thankful usps was still ...
4,2,this laptop was ok for the price lack of usbc is a probl...


In [14]:
# remove stopwords
stop = stopwords.words('english')
df_clean['reviews_clean'] = df_clean['reviews_clean'].apply(
    lambda x: ' '.join(x for x in x.split() if x not in stop))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,purchased laptop upgraded ram gb ssd tb arrived near may...
1,1,item came promptly beginning touch screen didnt work cou...
2,1,already issues require repair beyond inhouse guy
3,4,came evening around pm thankful usps still working scrat...
4,2,laptop ok price lack usbc problem especially power conne...


In [15]:
# stemming
st = PorterStemmer()
df_clean['reviews_clean'] = df_clean['reviews_clean'].apply(
    lambda x: " ".join([st.stem(word) for word in x.split()]))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,purchas laptop upgrad ram gb ssd tb arriv near may fan d...
1,1,item came promptli begin touch screen didnt work could l...
2,1,alreadi issu requir repair beyond inhous guy
3,4,came even around pm thank usp still work scratch outsid ...
4,2,laptop ok price lack usbc problem especi power connector...


In [16]:
# spelling correction
from textblob import TextBlob
df_clean['reviews_clean'] = df_clean['reviews_clean'].apply(lambda x: str(TextBlob(x).correct()))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,purchase lawton upgrade ram go sad to arrive near may fa...
1,1,item came promptly begin touch screen didn work could li...
2,1,already issue require repair beyond nous guy
3,4,came even around pm thank up still work scratch outside ...
4,2,lawton ok price lack us problem respect power connection...


In [17]:
# lemmatization
df['reviews_clean'] = df['reviews_clean'].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df_clean.head()

Unnamed: 0,rating,reviews_clean
0,1,purchase lawton upgrade ram go sad to arrive near may fa...
1,1,item came promptly begin touch screen didn work could li...
2,1,already issue require repair beyond nous guy
3,4,came even around pm thank up still work scratch outside ...
4,2,lawton ok price lack us problem respect power connection...


# **Question 3**

(30 points). Write a python program to conduct **syntax and structure analysis** of the clean text you just saved above. The syntax and structure analysis includes: 

(1) Parts of Speech (POS) Tagging: Tag Parts of Speech of each word in the text, and calculate the total number of N(oun), V(erb), Adj(ective), Adv(erb), respectively.

(2) Constituency Parsing and Dependency Parsing: print out the constituency parsing trees and dependency parsing trees of all the sentences. Using one sentence as an example to explain your understanding about the constituency parsing tree and dependency parsing tree.

(3) Named Entity Recognition: Extract all the entities such as person names, organizations, locations, product names, and date from the clean texts, calculate the count of each entity.

## Part 1

In [18]:
def tag_counts(review):
    tokens = nlp(review)
    pos = [token.pos_ for token in tokens]
    counts = Counter(pos)
    return counts

In [19]:
df_clean['tags'] = df_clean['reviews_clean'].apply(tag_counts)
df_clean.head()

Unnamed: 0,rating,reviews_clean,tags
0,1,purchase lawton upgrade ram go sad to arrive near may fa...,"{'PROPN': 13, 'VERB': 16, 'ADJ': 9, 'PART': 1, 'SCONJ': ..."
1,1,item came promptly begin touch screen didn work could li...,"{'NOUN': 13, 'VERB': 10, 'ADV': 2, 'PROPN': 2, 'PART': 1..."
2,1,already issue require repair beyond nous guy,"{'ADV': 1, 'NOUN': 3, 'VERB': 1, 'ADP': 1, 'ADJ': 1}"
3,4,came even around pm thank up still work scratch outside ...,"{'VERB': 11, 'ADV': 6, 'ADP': 5, 'NOUN': 18, 'ADJ': 8, '..."
4,2,lawton ok price lack us problem respect power connection...,"{'PROPN': 2, 'INTJ': 1, 'NOUN': 9, 'VERB': 3, 'PRON': 2,..."


## Part 2

### Constituency Parsing

In [20]:
import warnings
warnings.filterwarnings('ignore')

In [21]:
# import nltk
# benepar.download('benepar_en')

In [22]:
from benepar.spacy_plugin import BeneparComponent
nlp.add_pipe(BeneparComponent('benepar_en2'))

AttributeError: module 'tensorflow' has no attribute 'GraphDef'

### Dependency Parsing

In [23]:
def print_dependencies(reviews, times=None):
    if not times:
        times = reviews.shape[0]
    for i in range(times):
        print(f'review {i+1}:')
        print('----------', end='\n\n')
        for token in nlp(reviews[i]):
            print(token.text,'=>',token.dep_,'=>',token.head.text)
        print()

In [24]:
print_dependencies(df_clean['reviews_clean'],2)

review 1:
----------

purchase => compound => lawton
lawton => nsubj => upgrade
upgrade => ROOT => upgrade
ram => dobj => upgrade
go => ROOT => go
sad => acomp => go
to => aux => arrive
arrive => xcomp => sad
near => prep => arrive
may => compound => fan
fan => nsubj => die
die => ROOT => die
more => amod => june
june => dobj => die
one => nummod => year
year => compound => week
week => compound => warrant
purchase => compound => lawton
lawton => compound => warrant
warrant => dobj => die
or => cc => warrant
or => cc => die
ran => conj => die
diagnose => nsubj => said
said => ROOT => said
warrant => compound => start
start => nsubj => end
end => ccomp => said
december => npadvmod => end
well => advmod => december
want => ROOT => want
charge => compound => support
support => nsubj => ask
ask => ccomp => want
local => amod => supervisor
chat => compound => supervisor
supervisor => nsubj => said
said => ROOT => said
one => nummod => weekly
work => compound => weekly
weekly => nsubj => buy

In [25]:
lengths = df_clean['reviews_clean'].apply(lambda x: len(x))
moderate = list(np.argsort(lengths))[15]
displacy.render(nlp(df_clean['reviews_clean'][moderate]), jupyter=True)

### Part 3

In [26]:
def print_entities(reviews, times=None):
    if not times:
        times = reviews.shape[0]
    for i in range(times):
        print(f'review {i+1}:')
        print('----------', end='\n\n')
        for ent in nlp(reviews[i]).ents:
            print(ent.text, '-->', ent.label_)
        print()

In [27]:
print_entities(df_clean['reviews_clean'])

review 1:
----------

lawton --> PERSON
june one year week --> DATE
lawton --> PERSON
december --> DATE
one --> CARDINAL
weekly --> DATE
lawton year --> DATE
first year --> DATE
one --> CARDINAL
every year --> DATE
first --> ORDINAL
one --> CARDINAL

review 2:
----------

amazon --> ORG

review 3:
----------


review 4:
----------

amazon --> ORG

review 5:
----------

lawton --> ORG
us --> GPE

review 6:
----------


review 7:
----------

lawton month half --> DATE
second --> ORDINAL
one --> CARDINAL
touchscreen --> CARDINAL
amazon --> ORG

review 8:
----------


review 9:
----------

year one --> DATE

review 10:
----------

lawton random shut --> PERSON
february --> DATE
oct --> CARDINAL

review 11:
----------

sister cook --> PERSON
new lawton --> GPE
lawton --> ORG
lawton cobb --> PERSON
space million --> CARDINAL

review 12:
----------


review 13:
----------

far week --> DATE
one --> CARDINAL

review 14:
----------

lawton --> PERSON
east board --> ORG

review 15:
----------

o

**Write your explanations of the constituency parsing tree and dependency parsing tree here (Question 3-2):** 

In [None]:
'''
Write your explanations of the constituency parsing tree and dependency parsing tree here



'''