# Text Analysis on Filtered Dataframe (Mountain Pine Beetle Example)

In [None]:
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# download the NLTK 'stopwords' resource (presumably the data behind the nltk.corpus.stopwords import above)
nltk.download('stopwords')

# download two more NLTK resources by name: 'punkt' and 'averaged_perceptron_tagger'
# NOTE(review): presumably needed for word_tokenize and part-of-speech tagging later on -- confirm against the analysis cells
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
# load the comma-separated survey file, then swap every missing value (NaN)
# for the string "Unknown" so that later filtering never has to deal with NaN floats
parksurvey_df = pd.read_csv("sksurveys.csv", sep=",").fillna("Unknown")

# show the cleaned dataframe
parksurvey_df

In [None]:
# collect each distinct value of the "park_visited" column once; these values drive the per-park filtering of the dataframe
list_of_parks = [*parksurvey_df["park_visited"].unique()]
list_of_parks

In [None]:
for park in list_of_parks:
    # get rows where park_visited is the selected park
    currpark_df = parksurvey_df.loc[parksurvey_df["park_visited"] == park]

    # str() guards against non-string cells (e.g. an answer that was read in as a number),
    # which would otherwise raise a TypeError when joined with "\n"
    responses = [str(response) for response in currpark_df["enjoys_most"]]
    for response in responses:
        print(response)

    # merge selected column into a text for textual analysis: one response per line,
    # built with a single join instead of repeated string concatenation
    into_text = "".join(response + "\n" for response in responses)

    print(into_text)

# if you later want to find out who said what, take the phrases output by the concordance analysis and match them back to the larger dataset that includes demographics

# Webscraping Examples

## How to be a Good Webscraper (aka don't get blacklisted!)

First and foremost, ensure that there is no clause on the website(s) you're planning to scrape that explicitly states that it cannot be legally scraped and/or that using the information provided by the website outside of it is a violation (this information can often be found in the Terms of Service).

If that is clear, then essentially, to be an ethical webscraper you want your scraping process to imitate a human interacting with the website as much as possible, since that is how most websites are designed to be used! There are a few main ways this can be achieved:
- **Wait times**: If you scrape a website too aggressively-- for example, asking a website for information or a new link very quickly one after another-- you risk overloading their servers (particularly an issue for older or smaller/more niche websites) which can cause the site to crash, or more likely, you to be blacklisted from accessing the website because you will be flagged as malicious/a bot. To combat this, we can add small wait times between our requests for information; these pauses give the servers a break!  
- **Automate actions**: If possible, consider how you would interact with the webpage you want to scrape, and try imitating those steps through your code (discussed further in "Selenium").
- **Use "Inspect Element" to analyse HTML**: Rather than request the entire HTML content of a webpage in your code to find the information you want to extract (hence, making more requests to their servers), look at the webpage's HTML through your browser's "Inspect" or "Inspect Element" tool!


## BeautifulSoup

BeautifulSoup is a simple Python library that allows you to read the HTML content of a provided webpage into your code, allowing you to extract information present in it.
Here are some examples of how the HTML of your page can be searched through and extracted from: https://beautiful-soup-4.readthedocs.io/en/latest/index.html#searching-the-tree

In [None]:
# scraping articles from news website

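# import the two tools we need for talking to the website:
# requests (a third-party library, not part of Python's standard library) allows you to request information from a website's server
# sleep (from Python's built-in time module) allows us to pause our code to give the servers a break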
import requests
from time import sleep

from bs4 import BeautifulSoup
import pandas as pd

# let's just get the first 15pgs of results from our search for now --> the following for loop imitates clicking "next page"
article_links = []
for i in range(1, 16):
    # the URL we are requesting information from
    url = "https://betakit.com/query/fintech/page/" + str(i)

    # requesting the information --> the "user agent" header identifies our request as coming from a Mozilla-compatible browser
    # timeout stops the loop from hanging forever if the server never answers, and
    # raise_for_status() turns error responses (404, 500, ...) into exceptions so we never parse an error page
    try:
        req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        req.raise_for_status()
    except requests.RequestException as err:
        print("Could not fetch page " + str(i) + " (" + str(err) + "); skipping...")
        # still pause before the next request so a failing server is not hammered
        sleep(5)
        continue

    html = req.text

    # now we use BeautifulSoup to parse the extracted HTML and create a "soup" object that can be used to search and find the exact information we want
    soup = BeautifulSoup(html, features="html.parser")

    # to extract the link to each article, I found the HTML element using the "Inspect" tool and I am selecting all results which match that element using CSS
    links = soup.select("h2.entry-title > a")

    # now, I am adding each link I extracted to a list that will be ongoing, capturing the article links from all 15pgs
    # (anchors without an href attribute are ignored instead of raising a KeyError)
    article_links.extend(a['href'] for a in links if a.has_attr('href'))

    print("On page " + str(i) + "! Sleeping to next page...")

    # now we take a brief pause before requesting the next page
    sleep(5)

In [None]:
# view list of article links --> the author expects 15 articles on each page, so 15 pages * 15 articles should give 225 links
# (a shorter list means some pages returned fewer matches for the selector than expected)
article_links

In [None]:
# now we can extract some metadata and content of each article and organize this information to a dataframe

def _text_or_unknown(element):
    """Return the text of a BeautifulSoup element, or "Unknown" when the element was not found."""
    # soup.find() gives back None when nothing matches, and None has no .get_text()
    if element is None:
        return "Unknown"
    return element.get_text()


# create an empty dataframe with just the column headers
articles_df = pd.DataFrame(columns=['author', 'date', 'title', 'content', 'tags'])

for article in article_links:
    # request information from the article webpage, just like we did before with the search page
    # timeout prevents an unresponsive server from stalling the whole run, and error
    # responses are skipped rather than parsed as if they were articles
    try:
        req = requests.get(article, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        req.raise_for_status()
    except requests.RequestException as err:
        print("Could not fetch " + str(article) + " (" + str(err) + "); skipping...")
        # still pause before the next request to give the servers a break
        sleep(10)
        continue

    sleep(10)
    soup = BeautifulSoup(req.text, features="html.parser")

    # more information on the search + extract from HTML methods provided by BeautifulSoup in the link above
    # any piece that is missing from a page is recorded as "Unknown" instead of crashing the loop
    author = _text_or_unknown(soup.find("a", "author url fn"))
    date = _text_or_unknown(soup.find("span", "entry-date"))
    article_title = _text_or_unknown(soup.find("h1", "entry-title")).strip()

    # only the first paragraph directly inside <article> is kept as the content
    paragraphs = soup.select("article > p")
    article_content = paragraphs[0].get_text().strip() if paragraphs else "Unknown"

    # store the visible text of each tag link (joined into one string) rather than the raw HTML elements
    tags = ", ".join(tag.get_text().strip() for tag in soup.select("div#tags-box > a"))

    # put info extracted from article into a list
    new_article_row = [author, date, article_title, article_content, tags]
    # add list to our dataframe as new row (done row by row so partial results survive if the run is interrupted)
    articles_df.loc[len(articles_df)] = new_article_row



In [None]:
# view our completed dataframe! (columns: author, date, title, content, tags -- one row per scraped article)
articles_df

## Selenium

[Selenium](https://selenium-python.readthedocs.io/) is a more advanced tool designed for automating web browsers through the use of **WebDrivers**; this essentially means that Selenium creates its own browser window from the browser you tell it to use, and in this window it performs the tasks which you tell it to do through your code. This does mean that you shouldn't interact with the browser window Selenium creates, unless you want to risk messing up its scraping process!

Selenium is often used in conjunction with BeautifulSoup, where Selenium performs an action that is necessary to make the information you need appear, and once that information has appeared you can use BeautifulSoup as usual.

In the following example, the webpage requires the user to search a location in a text box in order for the information on that location to appear, which necessitates the use of Selenium to automate scraping information from this site.

In [None]:
# sample search terms: each entry is typed into the site's search box, one at a time
municipalities = [
    'Capixaba - AC',
    'Cruzeiro Do Sul - AC',
    'Cacimbinhas - AL',
]

In [None]:
import requests
from time import sleep

from bs4 import BeautifulSoup
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [None]:
# we will select the Chrome WebDriver for Selenium to use in its window
driver = webdriver.Chrome()

# create an empty dataframe with just the column headers
descargas_df = pd.DataFrame(columns=['cidade/uf', 'densidade_descargas', 'rank_densidade_n', 'rank_densidade_e'])

# try/finally guarantees that the Selenium window is closed even if something goes wrong part-way through
try:
    for m in municipalities:
        # we want to open this webpage with our WebDriver in order to interact with it
        driver.get("http://www.inpe.br/webelat/homepage/#")

        # Through the "Inspect" tool (in my own browser window, not the Selenium generated one!) I identify the HTML element which Selenium needs to interact with
        # an input text field
        inputElement = driver.find_element("id", "input_ranking")
        # the send_keys(m) command will enter the current selected municipality into the input text field identified in the previous line of code
        inputElement.send_keys(m)
        # this simulates you hitting the "enter" key and reveals the information we want
        inputElement.send_keys(Keys.ENTER)

        # give the page time to load the results before reading its HTML
        sleep(5)
        soup = BeautifulSoup(driver.page_source,"html.parser")

        # now we can extract the information from the HTML element that was uncovered after searching for it using Selenium
        ranking_div = soup.find(id="divRanking")
        if ranking_div is None:
            # find() returns None when the element is not on the page (e.g. the search produced nothing)
            print("No ranking information found for " + m + "; skipping")
            print('-------')
            continue

        # since it is extracted from HTML, this information is in an odd format so we'll put it into a list to make it more usable
        scraped_tbl = ranking_div.get_text().split('\n')

        # we just want the information following the ":" in each extracted row of information
        densidade_info = []
        for i in scraped_tbl:
            if len(i) > 1 and ':' in i:
                densidade_info.append(i.strip().split(':')[1])

        print(densidade_info)

        # now we can add the list of information as a new row in our dataframe
        # (a row with the wrong number of values cannot be stored, so it is reported and skipped instead of stopping the run)
        if len(densidade_info) == len(descargas_df.columns):
            descargas_df.loc[len(descargas_df)] = densidade_info
        else:
            print("Expected " + str(len(descargas_df.columns)) + " values for " + m + " but found " + str(len(densidade_info)) + "; skipping")
        print('-------')
finally:
    # this closes the Selenium window
    driver.quit()

In [None]:
# view your completed dataframe! (one row per municipality that was successfully scraped)
descargas_df