# Fetching and parsing HTML
To start web scraping, you need to fetch the HTML content of a webpage and parse it using Beautiful Soup. Here's a step-by-step example:

In [16]:
# 用requests来获取网页信息
import requests

In [17]:
# 用BeautifulSoup来翻译获取的网页信息
from bs4 import BeautifulSoup

In [3]:
url = 'https://en.wikipedia.org/wiki/IBM'

In [12]:
# Send an HTTP GET request to the webpage
response = requests.get(url)

In [13]:
# response是一个类
print(type(response))

<class 'requests.models.Response'>


In [14]:
# Store the HTML content in a variable
html_content = response.text

In [15]:
# html_content是一个str变量，存储了网页的所有原始信息
print(type(html_content))

<class 'str'>


In [18]:
# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

In [19]:
# Display a snippet of the HTML content
print(html_content[:500])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-


In [21]:
# soup是一个由BeautifulSoup生成的对象
print(type(soup))

<class 'bs4.BeautifulSoup'>


# Navigating the HTML structure
BeautifulSoup represents HTML content as a tree-like structure, allowing for easy navigation. You can use methods like *find_all* to filter and extract specific HTML elements. For example, to find all anchor tags() and print their text:

In [22]:
# Find all <a> tags (anchor tags) in the HTML
links = soup.find_all('a')
# Iterate through the list of links and print their text
for link in links:
    print(link.text)

Jump to content
Main page
Contents
Current events
Random article
About Wikipedia
Contact us
Help
Learn to edit
Community portal
Recent changes
Upload file








Search

Donate
Create account
Log in
Donate
 Create account
 Log in
learn more
Contributions
Talk

(Top)



1
History




1.1
1910s–1950s




1.2
1960s–1980s




1.3
1990s–2000s




1.4
2010s–present




2
Corporate affairs




2.1
Business trends




2.2
Board and shareholders




2.3
Headquarters and offices




3
Products




3.1
Hardware




3.1.1
Mainframe computers




3.1.2
Microprocessors




3.1.3
Quantum Computing




3.2
Software




3.3
Cloud services




3.4
Artificial intelligence




4
Consulting




5
Research




5.1
Patents




6
Brand and reputation




6.1
Environmental




7
People and culture




7.1
Employees




7.1.1
Notable current and former employees




7.2
Workplace culture




7.3
Labor relations




8
See also




9
Notes




10
References




11
Further reading




12
External links


Afrikaan

In [23]:
import pandas as pd

In [25]:
tables = pd.read_html(url)

In [26]:
print(type(tables))

<class 'list'>


In [29]:
print(tables)

[                                       0  \
0        1972–current logo, by Paul Rand   
1   IBM CHQ in Armonk, New York, in 2014   
2                             Trade name   
3                               Formerly   
4                           Company type   
5                              Traded as   
6                                   ISIN   
7                               Industry   
8                           Predecessors   
9                                Founded   
10                              Founders   
11                          Headquarters   
12                           Area served   
13                            Key people   
14                              Products   
15                                Brands   
16                              Services   
17                               Revenue   
18                      Operating income   
19                            Net income   
20                          Total assets   
21                          Tot

In [44]:
df = tables[2]
print(df)

      Year  Revenue (US$ bn)  Net income (US$ bn)  Employees
0     2014              92.7                 12.0     379592
1     2015              81.7                 13.1     377757
2     2016              79.9                 11.8     380300
3     2017              79.1                  5.7     366600
4     2018              79.5                  8.7     350600
5     2019              77.1                  9.4     352600
6     2020              73.6                  5.5     345900
7  2021[b]              57.3                  5.7     282100
8     2022              60.5                  1.6     288300
9     2023              61.8                  7.5     282200


In [48]:
df.columns = [['Year', 'Revenue($bn)', 'Net_Income($bn)', 'Employees']]

In [50]:
df.iloc[7, 0] = 2021
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Employees
0,2014,92.7,12.0,379592
1,2015,81.7,13.1,377757
2,2016,79.9,11.8,380300
3,2017,79.1,5.7,366600
4,2018,79.5,8.7,350600
5,2019,77.1,9.4,352600
6,2020,73.6,5.5,345900
7,2021,57.3,5.7,282100
8,2022,60.5,1.6,288300
9,2023,61.8,7.5,282200


In [52]:
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Employees
0,2014,92.7,12.0,379592
1,2015,81.7,13.1,377757
2,2016,79.9,11.8,380300
3,2017,79.1,5.7,366600
4,2018,79.5,8.7,350600
5,2019,77.1,9.4,352600
6,2020,73.6,5.5,345900
7,2021,57.3,5.7,282100
8,2022,60.5,1.6,288300
9,2023,61.8,7.5,282200


In [78]:
df['Cost($bn)'] = df['Revenue($bn)'] - df['Net_Income($bn)']
# 由于df['Revenue($bn)']是一个DataFrame数据，无法直接用于运算，需要转换成Series，使用squeeze()方法

ValueError: Cannot set a DataFrame with multiple columns to the single column Cost($bn)

In [76]:
df['Costs'] = df['Revenue($bn)'].squeeze() - df['Net_Income($bn)'].squeeze()

In [77]:
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Employees,Costs
0,2014,92.7,12.0,379592,80.7
1,2015,81.7,13.1,377757,68.6
2,2016,79.9,11.8,380300,68.1
3,2017,79.1,5.7,366600,73.4
4,2018,79.5,8.7,350600,70.8
5,2019,77.1,9.4,352600,67.7
6,2020,73.6,5.5,345900,68.1
7,2021,57.3,5.7,282100,51.6
8,2022,60.5,1.6,288300,58.9
9,2023,61.8,7.5,282200,54.3


In [80]:
df = df[['Year', 'Revenue($bn)', 'Net_Income($bn)', 'Costs', 'Employees']]

In [81]:
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Costs,Employees
0,2014,92.7,12.0,80.7,379592
1,2015,81.7,13.1,68.6,377757
2,2016,79.9,11.8,68.1,380300
3,2017,79.1,5.7,73.4,366600
4,2018,79.5,8.7,70.8,350600
5,2019,77.1,9.4,67.7,352600
6,2020,73.6,5.5,68.1,345900
7,2021,57.3,5.7,51.6,282100
8,2022,60.5,1.6,58.9,288300
9,2023,61.8,7.5,54.3,282200


In [88]:
df['Costs/Empee($k)'] = df['Costs'].squeeze() / df['Employees'].squeeze() * pow(10, 6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Costs/Empee($k)'] = df['Costs'].squeeze() / df['Employees'].squeeze() * pow(10, 6)


In [91]:
df.drop(columns=['Costs/Empee'])

  df.drop(columns=['Costs/Empee'])


Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Costs,Employees,Costs/Empee($k)
0,2014,92.7,12.0,80.7,379592,212.596683
1,2015,81.7,13.1,68.6,377757,181.598223
2,2016,79.9,11.8,68.1,380300,179.069156
3,2017,79.1,5.7,73.4,366600,200.218221
4,2018,79.5,8.7,70.8,350600,201.939532
5,2019,77.1,9.4,67.7,352600,192.002269
6,2020,73.6,5.5,68.1,345900,196.87771
7,2021,57.3,5.7,51.6,282100,182.91386
8,2022,60.5,1.6,58.9,288300,204.301075
9,2023,61.8,7.5,54.3,282200,192.416726


In [96]:
df['Revenue/Empee'] = df.loc[:,'Revenue($bn)'].squeeze() / df.loc[:,'Employees'].squeeze() * pow(10, 6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Revenue/Empee'] = df.loc[:,'Revenue($bn)'].squeeze() / df.loc[:,'Employees'].squeeze() * pow(10, 6)


In [97]:
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Costs,Employees,Costs/Empee,Costs/Empee($k),Net_Incm/Empee,Revenue/Empee
0,2014,92.7,12.0,80.7,379592,0.000213,212.596683,244.209572,244.209572
1,2015,81.7,13.1,68.6,377757,0.000182,181.598223,216.276601,216.276601
2,2016,79.9,11.8,68.1,380300,0.000179,179.069156,210.097292,210.097292
3,2017,79.1,5.7,73.4,366600,0.0002,200.218221,215.766503,215.766503
4,2018,79.5,8.7,70.8,350600,0.000202,201.939532,226.754136,226.754136
5,2019,77.1,9.4,67.7,352600,0.000192,192.002269,218.661373,218.661373
6,2020,73.6,5.5,68.1,345900,0.000197,196.87771,212.77826,212.77826
7,2021,57.3,5.7,51.6,282100,0.000183,182.91386,203.119461,203.119461
8,2022,60.5,1.6,58.9,288300,0.000204,204.301075,209.85085,209.85085
9,2023,61.8,7.5,54.3,282200,0.000192,192.416726,218.993622,218.993622


In [102]:
df = df.drop(columns=['Costs/Empee', 'Net_Incm/Empee', 'Revenue/Empee'])
df

  df = df.drop(columns=['Costs/Empee', 'Net_Incm/Empee', 'Revenue/Empee'])


Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Costs,Employees,Costs/Empee($k)
0,2014,92.7,12.0,80.7,379592,212.596683
1,2015,81.7,13.1,68.6,377757,181.598223
2,2016,79.9,11.8,68.1,380300,179.069156
3,2017,79.1,5.7,73.4,366600,200.218221
4,2018,79.5,8.7,70.8,350600,201.939532
5,2019,77.1,9.4,67.7,352600,192.002269
6,2020,73.6,5.5,68.1,345900,196.87771
7,2021,57.3,5.7,51.6,282100,182.91386
8,2022,60.5,1.6,58.9,288300,204.301075
9,2023,61.8,7.5,54.3,282200,192.416726


In [103]:
df.loc[:,'Revenue/Empee($k)'] = df.loc[:,'Revenue($bn)'].squeeze() / df.loc[:,'Employees'].squeeze() * pow(10, 6)

In [104]:
df

Unnamed: 0,Year,Revenue($bn),Net_Income($bn),Costs,Employees,Costs/Empee($k),Revenue/Empee($k)
0,2014,92.7,12.0,80.7,379592,212.596683,244.209572
1,2015,81.7,13.1,68.6,377757,181.598223,216.276601
2,2016,79.9,11.8,68.1,380300,179.069156,210.097292
3,2017,79.1,5.7,73.4,366600,200.218221,215.766503
4,2018,79.5,8.7,70.8,350600,201.939532,226.754136
5,2019,77.1,9.4,67.7,352600,192.002269,218.661373
6,2020,73.6,5.5,68.1,345900,196.87771,212.77826
7,2021,57.3,5.7,51.6,282100,182.91386,203.119461
8,2022,60.5,1.6,58.9,288300,204.301075,209.85085
9,2023,61.8,7.5,54.3,282200,192.416726,218.993622
