In [131]:
from bs4 import BeautifulSoup
import requests

In [133]:
url = 'http://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html')

In [134]:
print(soup)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of largest companies by revenue - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-cont

In [135]:
soup.find('table')

<table class="wikitable sortable sticky-header-multi sort-under" style="text-align:left;">
<tbody><tr>
<th rowspan="2" scope="col">Rank
</th>
<th rowspan="2" scope="col">Name
</th>
<th rowspan="2" scope="col">Industry
</th>
<th scope="col">Revenue
</th>
<th scope="col">Profit
</th>
<th rowspan="2" scope="col">Employees
</th>
<th rowspan="2" scope="col">Headquarters<sup class="reference" id="cite_ref-4"><a href="#cite_note-4"><span class="cite-bracket">[</span>note 1<span class="cite-bracket">]</span></a></sup>
</th>
<th rowspan="2" scope="col"><a href="/wiki/State-owned_enterprise" title="State-owned enterprise">State-owned</a>
</th>
<th class="unsortable" rowspan="2" scope="col"><abbr title="Reference(s)">Ref.</abbr>
</th></tr>
<tr>
<th colspan="2" scope="col"><small>USD (in millions)</small>
</th></tr>
<tr>
<th scope="col">1
</th>
<td><a href="/wiki/Walmart" title="Walmart">Walmart</a></td>
<td><a href="/wiki/Retail" title="Retail">Retail</a></td>
<td style="text-align:center;"><span

In [136]:
table = soup.find_all('table')[1]

In [137]:
print(table)

<table class="wikitable sortable plainrowheaders" style="text-align: center">
<caption>Breakdown by country
</caption>
<tbody><tr>
<th scope="col">Rank
</th>
<th scope="col">Country
</th>
<th scope="col">Companies
</th></tr>
<tr>
<th scope="row">1
</th>
<td style="text-align: left;"><span class="flagicon nowrap"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/40px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/60px-Flag_of_the_United_States.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/United_States" title="United States">United States of America</a></td>
<td>22
</td></tr>
<tr>
<th scope="row">2
</th>
<td style="text-align: left;"><span class="flagicon nowrap"><span class="mw-image-border" typeof="mw:File

In [167]:
world_titles = table.find_all('th')

In [169]:
world_titles

[<th scope="col">Rank
 </th>,
 <th scope="col">Country
 </th>,
 <th scope="col">Companies
 </th>,
 <th scope="row">1
 </th>,
 <th scope="row">2
 </th>,
 <th scope="row">3
 </th>,
 <th scope="row">4
 </th>,
 <th scope="row">4
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>,
 <th scope="row">5
 </th>]

In [171]:
world_table_titles = [
    title.text.strip() 
    for title in world_titles 
    if not title.text.strip().isdigit()
]

print(world_table_titles)


['Rank', 'Country', 'Companies']


In [173]:
import pandas as pd

In [175]:
df = pd.DataFrame(columns = world_table_titles)

df

Unnamed: 0,Rank,Country,Companies


In [177]:
column_data = table.find_all('tr')

In [179]:
expected_columns = len(df.columns)

for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]

    if len(individual_row_data) < expected_columns:
        individual_row_data += [''] * (expected_columns - len(individual_row_data))  # Pad missing
    elif len(individual_row_data) > expected_columns:
        individual_row_data = individual_row_data[:expected_columns]  # Trim extra

    df.loc[len(df)] = individual_row_data


In [181]:
df

Unnamed: 0,Rank,Country,Companies
0,United States of America,22,
1,China,11,
2,Germany,4,
3,United Kingdom,2,
4,Switzerland,2,
5,Japan,1,
6,France,1,
7,Italy,1,
8,Netherlands,1,
9,South Korea,1,


In [183]:
# Step 1: Rename columns to their correct meaning
df.columns = ['Country', 'Companies', 'drop']

# Step 2: Drop the last column (junk column)
df = df.drop(columns='drop')

# Step 3: Sort the countries by number of companies (descending)
df = df.sort_values(by='Companies', ascending=False).reset_index(drop=True)

# Step 4: Add proper Rank
df.insert(0, 'Rank', df.index + 1)



# Step 5: Show the cleaned DataFrame
print(df)


    Rank                   Country Companies
0      1                   Germany         4
1      2  United States of America        22
2      3            United Kingdom         2
3      4               Switzerland         2
4      5                     China        11
5      6                     Japan         1
6      7                    France         1
7      8                     Italy         1
8      9               Netherlands         1
9     10               South Korea         1
10    11              Saudi Arabia         1
11    12                 Singapore         1
12    13                    Taiwan         1


In [185]:
df.to_csv(r'C:\Users\yashb\OneDrive\Desktop\data\python\comp.csv', index = False)
