# Scraping Data From A Real Website + Pandas

In [1]:
# Importing BeautifulSoup from the bs4 library for web scraping
# Importing the requests library to make http requests to websites
from bs4 import BeautifulSoup
import requests

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"
# Sending an HTTP GET request to the url and storing the response in the 'page' variable.
# The timeout (in seconds) keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Failing loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page,
# which would only surface later as a confusing "list index out of range"
page.raise_for_status()
# Parsing the html content of the page with BeautifulSoup to make it easier to extract data.
# Naming the parser bundled with Python keeps the result independent of which
# optional parsers happen to be installed on the machine.
soup = BeautifulSoup(page.text, "html.parser")

List of the Largest Public (Publicly Traded) Companies

In [5]:
# Extracting the 1st table on the page: find_all returns every <table> element
# in document order, and index 0 picks the first one
table = soup.find_all("table")[0]
# Printing the raw HTML of the table to check that the right one was selected
print(table)

In [31]:
# Extracting the headers (titles) of the table columns
# Finding all header cells ('th' tags) in the table; the result is a list of
# tag objects, not plain strings yet
world_titles = table.find_all("th")

In [33]:
# Turning each header cell into plain text with the surrounding whitespace removed,
# then showing the resulting list of column titles
world_table_titles = [header_cell.get_text().strip() for header_cell in world_titles]
print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [35]:
# Importing Pandas to create a DataFrame for the scraped data
import pandas as pd

In [37]:
# Creating an empty DataFrame with the extracted headers as columns
df = pd.DataFrame(columns = world_table_titles)
# Displaying it to check the column names (there are no rows yet)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [39]:
# Finding all rows ('tr' tags) in the table and storing them in 'column_data'
# (despite the name, each element is one table row, not a column)
column_data = table.find_all("tr")

In [41]:
# Looping through each row in the table, starting from the second row (skipping the header),
# and collecting the cell texts of every row in a plain list first
table_rows = []
for row in column_data[1:]:
    # Extracting all the data cells ('td' tags) in the current row
    row_data = row.find_all("td")
    # Stripping any whitespace and extracting the text from each cell
    individual_row_data = [data.text.strip() for data in row_data]
    # A row without any 'td' cells carries no data, so it is left out
    if not individual_row_data:
        continue
    print(individual_row_data)
    table_rows.append(individual_row_data)

# Building the DataFrame once from all collected rows instead of appending row by row:
# this is faster, and re-running the cell no longer adds the same rows a second time
df = pd.DataFrame(table_rows, columns = world_table_titles)

['1', 'Walmart', 'Retail', '648,125', '6.0%', '2,100,000', 'Bentonville, Arkansas']
['2', 'Amazon', 'Retail and cloud computing', '574,785', '11.9%', '1,525,000', 'Seattle, Washington']
['3', 'Apple', 'Electronics industry', '383,482', '-2.8%', '161,000', 'Cupertino, California']
['4', 'UnitedHealth Group', 'Healthcare', '371,622', '14.6%', '440,000', 'Minnetonka, Minnesota']
['5', 'Berkshire Hathaway', 'Conglomerate', '364,482', '20.7%', '396,500', 'Omaha, Nebraska']
['6', 'CVS Health', 'Healthcare', '357,776', '10.9%', '259,500', 'Woonsocket, Rhode Island']
['7', 'ExxonMobil', 'Petroleum industry', '344,582', '-16.7%', '61,500', 'Spring, Texas']
['8', 'Alphabet', 'Technology and cloud computing', '307,394', '8.7%', '182,502', 'Mountain View, California']
['9', 'McKesson Corporation', 'Health', '276,711', '4.8%', '48,000', 'Irving, Texas']
['10', 'Cencora', 'Pharmacy wholesale', '262,173', '9.9%', '44,000', 'Conshohocken, Pennsylvania']
['11', 'Costco', 'Retail', '242,290', '6.8%', '3

In [43]:
# Displaying the DataFrame to inspect the scraped rows
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


In [45]:
# Exporting the DataFrame to a CSV file.
# A relative file name is used so the notebook runs on any machine: the file is written
# to the current working directory instead of a user-specific absolute path.
# index = False prevents pandas from writing the row indices (numbers) to the CSV file.
df.to_csv("List of the largest public 2024.csv", index = False)

Table of Largest Private Companies

In [50]:
# Extracting the 2nd table on the page (index 1 of all <table> elements)
second_table = soup.find_all("table")[1]

In [58]:
# Finding all header cells ('th' tags) in the second table (tag objects, not text yet)
second_table_titles = second_table.find_all("th")

In [60]:
# Stripping any extra whitespace and storing the text of each header in a list.
# The header cells are read from 'second_table' directly, so this cell can be re-run safely:
# iterating over 'second_table_titles' itself would fail on a second run, because by then
# the name already holds plain strings instead of tags.
second_table_titles = [title.text.strip() for title in second_table.find_all("th")]
print(second_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD billions)', 'Employees', 'Headquarters']


In [62]:
# Creating an empty DataFrame with the second table's headers as columns.
# NOTE: this rebinds 'df', so the DataFrame built from the first table is no longer
# reachable under that name from here on.
df = pd.DataFrame(columns = second_table_titles)
# Displaying it to check the column names (there are no rows yet)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters


In [66]:
# Finding all rows ('tr' tags) in the second table
# (despite the name, each element is one table row, not a column)
second_table_column_data = second_table.find_all("tr")

In [68]:
# Looping through each row of the second table, starting from the second row
# (skipping the header), and collecting the cell texts of every row in a plain list first
second_table_rows = []
for row in second_table_column_data[1:]:
    # Extracting all the data cells ('td' tags) in the current row
    row_data = row.find_all("td")
    # Stripping any whitespace and extracting the text from each cell
    second_table_row_data = [data.text.strip() for data in row_data]
    # A row without any 'td' cells carries no data, so it is left out
    if not second_table_row_data:
        continue
    print(second_table_row_data)
    second_table_rows.append(second_table_row_data)

# Building the DataFrame once from all collected rows instead of appending row by row:
# this is faster, and re-running the cell no longer adds the same rows a second time
df = pd.DataFrame(second_table_rows, columns = second_table_titles)

['1', 'Cargill', 'Food industry', '177', '160,000', 'Minnetonka, Minnesota']
['2', 'Koch Industries', 'Conglomerate', '125', '120,000', 'Wichita, Kansas']
['3', 'Publix Super Markets', 'Retail', '54.5', '250,000', 'Winter Haven, Florida']
['4', 'Mars, Incorporated', 'Food industry', '47', '140,000', 'McLean, Virginia']
['5', 'H-E-B', 'Retail', '43.6', '145,000', 'San Antonio, Texas']
['6', 'Reyes Holdings', 'Wholesaling', '40', '36,000', 'Rosemont, Illinois']
['7', 'Enterprise Holdings', 'Car rental', '35', '90,000', 'Clayton, Missouri']
['8', 'C&S Wholesale Grocers', 'Wholesaling', '34.7', '15,000', 'Keene, New Hampshire']
['9', "Love's", 'Petroleum industry and Retail', '26.5', '40,000', 'Oklahoma City, Oklahoma']
['10', "Southern Glazer's Wine and Spirits", 'Food industry', '26.0', '24,000', 'Miramar, Florida']


In [70]:
# Displaying the DataFrame built from the second table
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD billions),Employees,Headquarters
0,1,Cargill,Food industry,177.0,160000,"Minnetonka, Minnesota"
1,2,Koch Industries,Conglomerate,125.0,120000,"Wichita, Kansas"
2,3,Publix Super Markets,Retail,54.5,250000,"Winter Haven, Florida"
3,4,"Mars, Incorporated",Food industry,47.0,140000,"McLean, Virginia"
4,5,H-E-B,Retail,43.6,145000,"San Antonio, Texas"
5,6,Reyes Holdings,Wholesaling,40.0,36000,"Rosemont, Illinois"
6,7,Enterprise Holdings,Car rental,35.0,90000,"Clayton, Missouri"
7,8,C&S Wholesale Grocers,Wholesaling,34.7,15000,"Keene, New Hampshire"
8,9,Love's,Petroleum industry and Retail,26.5,40000,"Oklahoma City, Oklahoma"
9,10,Southern Glazer's Wine and Spirits,Food industry,26.0,24000,"Miramar, Florida"


In [72]:
# Exporting the DataFrame to a CSV file.
# A relative file name is used so the notebook runs on any machine: the file is written
# to the current working directory instead of a user-specific absolute path.
# index = False prevents pandas from writing the row indices (numbers) to the CSV file.
df.to_csv("List of the largest private companies 2024.csv", index = False)