In [15]:
from urllib.request import Request, urlopen
import pandas as pd
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# - Matplotlib library
# - Test-Driven Development


# 2. Ensure to have two spaces exactly between import and actual codes in modules

# ## Github Convention

# 1. Create a branch named `implement-web-analysis` and only work within this branch
# 3. When done, push your changes and raise a pull request on Github using the pull request template already added to the project (`.github` folder)

# - Try to not attempt to conceptualize the whole documentation and codebase at once, instead move step by step through the task
# - Ensure to understand the problem before attempting to write any code
# - Ensure to write the expected unit tests first before actual implementation, as that confirms that you are conforming to the `TDD Methodology`.
# - Ensure to manually experiment within the modules to confirm the results of your implementations


# ```bash
# Machine>> cd <this-project-folder>

# Machine>> python -m unittest tests
# ```

# ## Task Analysis and Design

# We need to lay out a design of what the final program should look
# like, as well as how it should function. For testing purposes, we'll use Python's home page. Eventually, we'll want the final output to look like...

# ## Pie Plot

# ![Pie Plot Analysis](assets/pie_plot.png)

# ## Bar Plot

# ![Web Analysis](assets/bar_chart.png)

# ## Step by step Analysis

# - We're going to make the program continually ask the users if they'd like to scrape a web site <br/>
# - Accept the users' input for the site they'd like to analyze.
# - After that, we can filter out all information that isn't useful like
#   - All non-text elements, such as scripts, comments, etc.
#   - All common article words and useless characters like newlines characters, empty spaces and tabs
#   - 1. Create a bar plot.
#   - 2. Create a pie plot

# The program output should look like the following:

# >>> The top word is: python
# >>> *** show bar plot ***
# ```



In [3]:


class TomideBeautifulSoupUtils:
    def __init__(self, url, type, scroll):
        self.url = url
        self.type = type
        self.scroll = scroll
    
    @staticmethod
    def get_classes(soup):
        class_list = []
        tags = {tag for tag in soup.find_all()}
        for tag in tags:
            if tag.has_attr( "class" ):
                if len( tag['class'] ) != 0:
                    if tag['class'][0] not in class_list:
                        class_list.append( tag['class'][0])
        return class_list

    def tomide_bs4_make_soup(url, type, scroll):
        if type == "static":
            req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urlopen(req).read()
            return BeautifulSoup(webpage, 'html.parser')

        elif type == "incognito":
            driver = webdriver.Chrome()
            driver.get(url)
            if scroll == True:
                SCROLL_PAUSE_TIME = 10
                last_height = driver.execute_script("return document.body.scrollHeight")   # Get scroll height
                while True:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Scroll down to bottom
                    time.sleep(SCROLL_PAUSE_TIME)  # Wait to load page
                    new_height = driver.execute_script("return document.body.scrollHeight") # Calculate new scroll height and compare with last scroll height
                    if new_height == last_height:
                        break
                    last_height = new_height
            driver.quit()
            return BeautifulSoup(driver.page_source, 'html.parser')
        else: 
            options = webdriver.ChromeOptions()
            options.add_argument("user-data-dir=/Users/tomideisawesome/Library/Application Support/Google/Chrome")
            driver = webdriver.Chrome(options=options)
            driver.get(url)
            driver.quit()
            return BeautifulSoup(driver.page_source, 'html.parser')


In [8]:
def program ():
    while True:
        user_input = input("Would you like to scrape a website (y/n)? ")
        if user_input.lower() == "y":
            url = input("Enter a website to analyze: ")
            soup = TomideBeautifulSoupUtils.tomide_bs4_make_soup(url, "static", False)
            return soup.get_text().split()
            break
        elif user_input.lower() == "n":
            print("Thanks for analyzing! Come back again!")
            break
        else:
            print("Invalid input, please try again.")
            continue

common_words = [    "and",    "from", "it",    "seen",    "after",    "with",    "more",    "our",    "they",    "any",    "you",    ">>>",    "your",    "was",    "his",    "been",    "thanks",    "now",    "for",    "the",    "has",    "would",    "new", "with",    "this",    "that",]
extracted_content = program()
extracted_content = [item for item in extracted_content if item not in common_words]
print(extracted_content)

['Welcome', 'to', 'Python.org', 'Notice:', 'While', 'JavaScript', 'is', 'not', 'essential', 'website,', 'interaction', 'content', 'will', 'be', 'limited.', 'Please', 'turn', 'JavaScript', 'on', 'full', 'experience.', 'Skip', 'to', 'content', '▼', 'Close', 'Python', 'PSF', 'Docs', 'PyPI', 'Jobs', 'Community', '▲', 'The', 'Python', 'Network', 'Donate', '≡', 'Menu', 'Search', 'This', 'Site', 'GO', 'A', 'A', 'Smaller', 'Larger', 'Reset', 'Socialize', 'Facebook', 'Twitter', 'Chat', 'on', 'IRC', 'About', 'Applications', 'Quotes', 'Getting', 'Started', 'Help', 'Python', 'Brochure', 'Downloads', 'All', 'releases', 'Source', 'code', 'Windows', 'macOS', 'Other', 'Platforms', 'License', 'Alternative', 'Implementations', 'Documentation', 'Docs', 'Audio/Visual', 'Talks', "Beginner's", 'Guide', "Developer's", 'Guide', 'FAQ', 'Non-English', 'Docs', 'PEP', 'Index', 'Python', 'Books', 'Python', 'Essays', 'Community', 'Diversity', 'Mailing', 'Lists', 'IRC', 'Forums', 'PSF', 'Annual', 'Impact', 'Report',