### Zepeng Wang UID: 905644572
### Yuanzhuo Zhang UID: 106119002

In [1]:
# Import Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246" 
,"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 "
,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 "
,"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1" 
,"Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36"] 

headers = {
    'User-Agent': user_agent_list[3]}

In [2]:
def lovely_soup(u):
    page = requests.get(u)
    return(BeautifulSoup(page.content, 'html.parser'))

# Q1.) Web Crawling Tables

### Q1.A.) Create a list of links for all the wikipedia pages for NYSE traded companies A-Z and 0-9

In [3]:
URL = "https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(A)"


In [4]:
import string
base_url = "https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_("
characters = list(string.ascii_uppercase) + ["0-9"]
urls = [base_url + character + ")" for character in characters]
for url in urls[:5]:
    print(url)

https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(A)
https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(B)
https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(C)
https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(D)
https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(E)


In [5]:
urls[-1]

'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(0-9)'

### Q1.B.) Crawl through all the URLs and make 1 DF with all the NYSE publically traded companies

In [6]:
merged_list = []
for URL in urls:
    html = requests.get(URL).content
    df_list = pd.read_html(html)
    merged_list.extend(df_list)

In [7]:
merged_df = pd.concat(merged_list, ignore_index=True)
merged_df

Unnamed: 0,Stock name,Symbol,Country of origin
0,A. O. Smith Corporation,AOS,US
1,"A10 Networks, Inc.",ATEN,US
2,AAC Holdings Inc.,AAC,US
3,AAR Corporation,AIR,US
4,Aaron's Inc.,AAN,US
...,...,...,...
2726,Zurn Elkay Water Solutions Corporation,ZWS,United States
2727,10X Capital Venture Acquisition Corp. III,VCXB,United States
2728,10X Capital Venture Acquisition Corp. III,VCXB.U,United States
2729,3D Systems Corporation,DDD,United States


### Q1.C.) What is the percetages of companies that contain 1 letter, 2 letters, 3 letters, 4 letters, 5 letters,... in the ticker (drop punctuation)?

In [8]:
import re

In [9]:
# Create a dictionary to store the count of letters in each ticker
letter_count = {}

for symbol in merged_df['Symbol']:
    # Remove punctuation using regular expression and convert to lowercase
    symbol = re.sub(r'\W+', '', symbol.lower())
    # Calculate the number of letters in the Symbol
    letter_length = len(symbol)
    # Update the count of letters
    if letter_length in letter_count:
        letter_count[letter_length] += 1
    else:
        letter_count[letter_length] = 1

In [10]:
# Calculate the percentage of symbols for each letter count
total_symbols = sum(letter_count.values())
percentage_dict = {key: (value / total_symbols) * 100 for key, value in letter_count.items()}

In [11]:
# Print the results
print("Percentage of companies by ticker length:")
for length, percentage in sorted(percentage_dict.items()):
    print(f"{length} letter(s): {percentage:.2f}%")

Percentage of companies by ticker length:
1 letter(s): 0.84%
2 letter(s): 6.81%
3 letter(s): 63.27%
4 letter(s): 16.55%
5 letter(s): 1.83%
6 letter(s): 9.12%
7 letter(s): 1.54%
8 letter(s): 0.04%


# Q2.) Web Scraping Using Beautiful Soup

### Q2.A.) Using Beautiful soup .findAll method you will webscrape the front page of Reddit. Get a list of all of the "timestamps"

In [12]:
URL = "https://www.reddit.com"

In [13]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Using Selenium to open a web browser
driver = webdriver.Chrome()

# Open the Reddit webpage
driver.get("https://www.reddit.com")

# Wait for the page to load
driver.implicitly_wait(10)  # Wait for 10 seconds

# Simulate scrolling down the page to load more content
for i in range(10):  # Assume scrolling down 10 times
    driver.find_element("tag name", 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(1)  # Wait for 1 second to allow the page to load more content

# Get the page content
html_content = driver.page_source

# Close the browser
driver.quit()

# Process the page content...

In [14]:
soup = BeautifulSoup(html_content, 'html.parser')
# Find all elements with class or tag name containing timestamps

timestamps = []
time_tags = soup.findAll('time')


for time_tag in time_tags:
    timestamp = time_tag['title'] 
    timestamps.append(timestamp)

print(timestamps)

['Monday, April 22, 2024 at 8:43:05 AM PDT', 'Monday, April 22, 2024 at 10:04:32 AM PDT', 'Sunday, April 21, 2024 at 5:44:00 PM PDT', 'Monday, April 22, 2024 at 10:45:27 AM PDT', 'Sunday, April 21, 2024 at 8:55:26 PM PDT', 'Monday, April 22, 2024 at 8:32:54 AM PDT', 'Monday, April 22, 2024 at 1:07:41 AM PDT', 'Monday, April 22, 2024 at 9:14:45 AM PDT', 'Monday, April 22, 2024 at 7:57:45 AM PDT', 'Monday, April 22, 2024 at 8:32:36 AM PDT', 'Monday, April 22, 2024 at 10:17:58 AM PDT', 'Monday, April 22, 2024 at 1:32:17 AM PDT', 'Monday, April 22, 2024 at 7:05:28 AM PDT', 'Monday, April 22, 2024 at 1:53:34 AM PDT', 'Monday, April 22, 2024 at 10:14:25 AM PDT', 'Monday, April 22, 2024 at 12:15:10 AM PDT', 'Monday, April 22, 2024 at 6:03:25 AM PDT', 'Monday, April 22, 2024 at 7:55:14 AM PDT', 'Monday, April 22, 2024 at 6:53:14 AM PDT', 'Monday, April 22, 2024 at 3:54:34 AM PDT', 'Monday, April 22, 2024 at 5:55:37 AM PDT', 'Monday, April 22, 2024 at 3:00:08 AM PDT', 'Monday, April 22, 2024 at

### Q2.B.) Using the functions findChild, descendents, etc. locate the post title, text and post time into a dataframe.

In [15]:
# Step 3: Locate the elements containing the post title, text, and post time
post_titles = []
post_texts = []
post_times = []

for post in soup.find_all('article'):
    # Find post title
    title_element = post.findChild('faceplate-screen-reader-content')
    if title_element:
        post_titles.append(title_element.text.strip())
    else:
        post_titles.append(None)
    
    # Find post text
    text_element = post.findChild('p')
    if text_element:
        post_texts.append(text_element.text.strip())
    else:
        post_texts.append(None)
    
    # Find post time
    time_element = post.findChild('time')
    if time_element:
        post_times.append(time_element['title'])
    else:
        post_times.append(None)

In [16]:
print(len(post_titles),len(post_texts),len(post_times))

28 28 28


In [17]:
# Step 4: Store the extracted data into a dataframe
data = {'Title': post_titles, 'Text': post_texts, 'Time': post_times}
df = pd.DataFrame(data)

# Step 5: Display the dataframe
df

Unnamed: 0,Title,Text,Time
0,Military Dog Requests And Gets 30 Seconds Leav...,Welcome! /r/MadeMeSmile is a place to share th...,"Monday, April 22, 2024 at 8:43:05 AM PDT"
1,One of the vent holes on my HP laptop is drill...,"Aww, cripes. I didn't know I'd have to write a...","Monday, April 22, 2024 at 10:04:32 AM PDT"
2,I asked my friend if they would be interested ...,jugkfmghgug,"Sunday, April 21, 2024 at 5:44:00 PM PDT"
3,Found on r/tragedeigh. This is fucking brutal.,Did you stumble across a unique insult? Lookin...,"Monday, April 22, 2024 at 10:45:27 AM PDT"
4,i had an affair and don’t want my husband to k...,You know how there are people who are genuinel...,"Sunday, April 21, 2024 at 8:55:26 PM PDT"
5,Caught the bastard who's been chewing my bird ...,jugkfmghgug,"Monday, April 22, 2024 at 8:32:54 AM PDT"
6,This is real.,/r/facepalm - please sir can I have some more?,"Monday, April 22, 2024 at 1:07:41 AM PDT"
7,"After The Simpsons episode ""Who Shot Mr. Burns...",For the most interesting things on the internet,"Monday, April 22, 2024 at 9:14:45 AM PDT"
8,Imagine seeing this majestic creature in the w...,I bet you will /r/BeAmazed! \nA place ...,"Monday, April 22, 2024 at 7:57:45 AM PDT"
9,Would you leave your partner of two years for ...,Subreddit for listeners of the Two Hot Takes P...,"Monday, April 22, 2024 at 8:32:36 AM PDT"


# Q3.) RegEx

### Q3.A.) Using RegEx, get all the urls of ladder faculty profiles for UCLA Economics

In [18]:
URL = "https://economics.ucla.edu/faculty/ladder"

In [19]:
import re
page = requests.get(URL)
html_content = page.text

In [20]:
# Regex pattern to match URLs of ladder faculty profiles
pattern = r'<a href="https://economics\.ucla\.edu/person/[a-zA-Z0-9\-]+/">'

# Extracting URLs using regex
urls = set(re.findall(pattern, html_content))

# Printing all unique URLs
for url in urls:
    print(url)

<a href="https://economics.ucla.edu/person/natalie-bau/">
<a href="https://economics.ucla.edu/person/john-asker/">
<a href="https://economics.ucla.edu/person/adriana-lleras-muney/">
<a href="https://economics.ucla.edu/person/yotam-shem-tov/">
<a href="https://economics.ucla.edu/person/andrew-atkeson/">
<a href="https://economics.ucla.edu/person/hugo-hopenhayn/">
<a href="https://economics.ucla.edu/person/michael-rubens/">
<a href="https://economics.ucla.edu/person/jonathan-vogel/">
<a href="https://economics.ucla.edu/person/maurizio-mazzocco/">
<a href="https://economics.ucla.edu/person/michela-giorcelli/">
<a href="https://economics.ucla.edu/person/zhipeng-liao/">
<a href="https://economics.ucla.edu/person/rosa-liliana-matzkin/">
<a href="https://economics.ucla.edu/person/jay-lu/">
<a href="https://economics.ucla.edu/person/denis-chetverikov/">
<a href="https://economics.ucla.edu/person/moritz-meyer-ter-vehn/">
<a href="https://economics.ucla.edu/person/david-baqaee/">
<a href="https:

### Q3.B.) Webcrawl the links from A and use RegEx to get all the emails and phone numbers of ladder faculty profiles

In [21]:
# Function to extract emails and phone numbers from text using regex
def extract_emails(url):
    soup = lovely_soup(url)
    text = soup.get_text()
    
    # Regular expressions to match email addresses and phone numbers
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    if emails:
        print(emails)
    else:
        emails = None
    return emails

In [22]:
extract_emails("https://economics.ucla.edu/person/john-asker/")

['johnasker@econ.ucla.edu']


['johnasker@econ.ucla.edu']

In [23]:
import numpy as np
def extract_phones(url):    
    soup = lovely_soup(url)
    personal_website = soup.find('p', class_='website')
    
    if personal_website:
        website_link = personal_website.find('a')['href']
        person_soup = lovely_soup(website_link)
        phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        text = person_soup.get_text()
        phones = re.findall(phone_pattern, text)
        if phones:
            print(phones)
        else:
            phones = None
    else:
        print("Personal website link not found on the page.")
        phones = None
        website_link = None
    
    return phones, website_link

In [24]:
extract_phones("https://economics.ucla.edu/person/john-asker/")

(None, 'http://www.johnasker.com')

In [25]:
def crawl_and_extract(url):
    data = []
    emails = extract_emails(url)
    phones, website_link = extract_phones(url)
    soup = lovely_soup(url)
    name = soup.find('h1').get_text()
    data.append({'Name': name, 'Emails': emails, 'Phone Numbers': phones, 'URL':website_link })
    
    return data

In [26]:
crawl_and_extract("https://economics.ucla.edu/person/john-asker/")

['johnasker@econ.ucla.edu']


[{'Name': 'John Asker',
  'Emails': ['johnasker@econ.ucla.edu'],
  'Phone Numbers': None,
  'URL': 'http://www.johnasker.com'}]

In [27]:
contract_list = []
link_pattern = r'href="([^"]+)"'

for url in urls:
    matches = re.search(link_pattern, url)
    if matches:
        link = matches.group(1)
    contract_list.extend(crawl_and_extract(link))

['nbau@g.ucla.edu']
['johnasker@econ.ucla.edu']
['alleras@econ.ucla.edu']
['(310) 825-3925', '3108253925', '(310) 825-3925']
['shemtov@econ.ucla.edu']
['andy@atkeson.net']
['hopen@econ.ucla.edu']
['(310) 206-8896', '(310) 825-9528']
['jvogel@econ.ucla.edu']
['mmazzocc@econ.ucla.edu']
['(310) 825-6682', '(310) 825-9528']
['mgiorcelli@econ.ucla.edu']
['6506309648']
['zhipeng.liao@econ.ucla.edu']
['(310) 794-5427']
['matzkin@econ.ucla.edu']
['(310) 825-7371']
['jay@econ.ucla.edu']
['chetverikov@econ.ucla.edu']
['mtv@econ.ucla.edu']
['baqaee@econ.ucla.edu']
['tvwachter@econ.ucla.edu']
['310-825-5665', '310-825-9528']
['iobara@econ.ucla.edu']
['fgoncalves@econ.ucla.edu']
['costa@econ.ucla.edu']
['(310) 825-4249', '(310) 825-9528', '(310) 825-1011']
['abloedel@econ.ucla.edu']
['rodrig@econ.ucla.edu']
['224-595-8758']
['hahn@econ.ucla.edu', 'chair@econ.ucla.edu', 'chair@econ.ucla.edu']
Personal website link not found on the page.
['itskhoki@econ.ucla.edu']
['mcgarry@ucla.edu']
['haanwinckel@e

In [28]:
df = pd.DataFrame(contract_list)
df

Unnamed: 0,Name,Emails,Phone Numbers,URL
0,Natalie Bau,[nbau@g.ucla.edu],,https://sites.google.com/site/nataliebau/
1,John Asker,[johnasker@econ.ucla.edu],,http://www.johnasker.com
2,Adriana Lleras-Muney,[alleras@econ.ucla.edu],"[(310) 825-3925, 3108253925, (310) 825-3925]",https://adriana-llerasmuney.squarespace.com/
3,Yotam Shem-Tov,[shemtov@econ.ucla.edu],,https://yotamshemtov.github.io/
4,Andrew Atkeson,[andy@atkeson.net],,https://sites.google.com/site/andyatkeson/
5,Hugo Hopenhayn,[hopen@econ.ucla.edu],"[(310) 206-8896, (310) 825-9528]",http://hopenhayn.weebly.com
6,Michael Rubens,,,https://michaelrubens.github.io/
7,Jonathan Vogel,[jvogel@econ.ucla.edu],,http://www.econ.ucla.edu/jvogel/
8,Maurizio Mazzocco,[mmazzocc@econ.ucla.edu],"[(310) 825-6682, (310) 825-9528]",http://www.econ.ucla.edu/mazzocco
9,Michela Giorcelli,[mgiorcelli@econ.ucla.edu],[6506309648],http://www.giorcellimichela.com/


# Q4.) Selenium

### Q4.A.) Pick a website that has useful data to a business or economic question. Put your website you plan to scrape here : https://docs.google.com/spreadsheets/d/1PJ2DOTCVCh51fn0ry1yB7qTyccR33_IXFpkYdd58MFs/edit?usp=sharing
### You must have use website that no other group has. First come first serve 

In [2]:
URL = "https://www.way.com/gas/prices/"

### Q4.B.) Use Selenium to scrape valuable information from your website and store in a dataframe.

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import us

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # You need to have chromedriver installed and in PATH

try:
    # Create an empty list to store data
    data = []

    # Iterate through each state
    for state in us.states.STATES:
        state_name = state.name.lower().replace(" ", "-")
        url = URL + state_name

        driver.get(url)

        # Wait for the spinner to disappear
        WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.CLASS_NAME, "spinner")))

        # Find the gas type dropdown
        gas_type_dropdown = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "selb")))

        # Get all gas type options
        gas_type_options = gas_type_dropdown.find_elements(By.TAG_NAME, "option")

        # Iterate through each option and scrape data
        for option in gas_type_options:
            gas_type = option.get_attribute("value")
            option.click()

            # Wait for the page to load the gas prices
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "gpric")))

            # Scrape gas prices for stores
            gas_prices = driver.find_elements(By.CLASS_NAME, "gpric")
            gas_prices = [price.text for price in gas_prices]

            # Scrape store names and addresses
            store_names = driver.find_elements(By.CLASS_NAME, "gname")
            store_addresses = driver.find_elements(By.CLASS_NAME, "adr")
            store_info = []
            for name, address in zip(store_names, store_addresses):
                name_text = name.text.strip()  
                address_text = address.text.strip()
                store_info.append((name_text, address_text))
    
    
            # Append data to the list
            for i in range(len(store_info)):
                data.append([store_info[i][0], store_info[i][1], gas_prices[i], gas_type, state.name])

finally:
    # Close the browser
    driver.quit()


df = pd.DataFrame(data, columns=["Store Name", "Location", "Gas Price", "Gas Type", "State"])

df

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=121.0.6167.162); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF751887012+3522402]
	(No symbol) [0x00007FF7514A8352]
	(No symbol) [0x00007FF751355ABB]
	(No symbol) [0x00007FF75135AD09]
	(No symbol) [0x00007FF75135D110]
	(No symbol) [0x00007FF75135D1D0]
	(No symbol) [0x00007FF7513971FB]
	(No symbol) [0x00007FF7513BF05A]
	(No symbol) [0x00007FF75139120A]
	(No symbol) [0x00007FF7513BF270]
	(No symbol) [0x00007FF7513DBDA3]
	(No symbol) [0x00007FF7513BEE03]
	(No symbol) [0x00007FF75138F4D4]
	(No symbol) [0x00007FF7513905F1]
	GetHandleVerifier [0x00007FF7518B9B9D+3730157]
	GetHandleVerifier [0x00007FF75190F02D+4079485]
	GetHandleVerifier [0x00007FF7519075D3+4048163]
	GetHandleVerifier [0x00007FF7515DA649+718233]
	(No symbol) [0x00007FF7514B4A3F]
	(No symbol) [0x00007FF7514AFA94]
	(No symbol) [0x00007FF7514AFBC2]
	(No symbol) [0x00007FF75149F2E4]
	BaseThreadInitThunk [0x00007FFE923A7344+20]
	RtlUserThreadStart [0x00007FFE935E26B1+33]


### Q4.C.) Write a short paragraph about the businesses or research that would use the data you scraped. Describe it's value and what it can be used for.

We scraped data of top 10 gas stations & cheap fuel prices from way.com, comprising gas station names, locations, gas prices, types, and states, holds significant value for various businesses and research endeavors. Retailers and marketers within the fuel industry could leverage this dataset to conduct competitive analyses, identifying pricing trends and competitor strategies across different regions. Additionally, transportation companies could utilize this information to optimize route planning, budgeting, and fuel purchasing decisions based on real-time pricing data. Policy researchers and economists might find value in analyzing gas price fluctuations to understand broader economic trends, such as inflation or consumer spending patterns. Furthermore, urban planners and environmentalists could utilize this data to assess the availability and adoption rates of alternative fuels like E85, aiding in the development of sustainable transportation infrastructure and policies. Overall, this comprehensive dataset offers insights crucial for market intelligence, strategic decision-making, and scholarly research across various domains related to energy, economics, and sustainability.