### importing essential libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### parsing html content

In [2]:
# Source page: the Wikipedia article listing the largest companies by revenue
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"

In [3]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [4]:
# Parse the response text into a searchable tree. The parser is named
# explicitly: the generic 'html' feature lets BeautifulSoup choose among
# whichever HTML parsers happen to be installed, so the resulting tree could
# differ from one environment to another. 'html.parser' ships with Python.
soup = BeautifulSoup(page.text, 'html.parser')

### extracting table column_names

In [5]:
# Take the first <table> in the document (IndexError if the page has none);
# limit=1 stops the search as soon as that first match is found.
table = soup.find_all('table', limit=1)[0]

In [6]:
# First <tr> of the table; its cells are turned into column labels below
columns = table.tr

In [7]:
# Stripped text of every direct child of the header row. Whitespace between
# cells shows up here as empty strings, which the next step discards.
# NOTE: this rebinds `columns` from the row element to a plain list, so the
# cell cannot be re-run on its own without first re-running the one above.
columns = [node.text.strip() for node in columns.children]

In [8]:
# Keep only the non-empty entries, i.e. the header labels themselves.
# Filtering on content (rather than taking every second element) does not
# depend on whitespace nodes strictly alternating with header cells.
column_names = [label for label in columns if label]

In [9]:
# Show the extracted header labels so they can be checked by eye
column_names

['Rank',
 'Name',
 'Industry',
 'Revenue',
 'Profit',
 'Employees',
 'Headquarters[note 1]',
 'State-owned',
 'Ref.']

### scraping table rows data

In [10]:
# Locate the table's <tbody>, then gather every <tr> found beneath it
table_body = table.tbody
rows = table_body('tr')  # calling a tag is shorthand for find_all

In [11]:
# extracting data

In [12]:
# Build one record per company row. Each record lines up with the header
# labels: rank, the data cells except the last two, a state-owned flag, and
# the href of the row's reference link.

# The first two <tr> elements in the body come before the first company row
# (rank 1), so scraping starts at the third.
HEADER_ROWS = 2

data = []
for row in rows[HEADER_ROWS:]:
    # Text of every <td> in the row; the rank is not among them because it
    # is read from the row's <th> instead.
    record = [cell.text.strip() for cell in row.find_all('td')]

    record.insert(0, row.th.text.strip())  # rank goes first

    # Drop the text of the last two <td> cells (state-owned and reference);
    # both values are rebuilt from the markup below.
    record.pop()
    record.pop()

    # State-owned: 'No' when the row has a <td class="table-no2">, else 'Yes'
    state_owned = 'Yes' if row.find('td', class_="table-no2") is None else 'No'
    record.append(state_owned)

    # Reference: href of the link inside the first <sup class="reference">.
    # A row without such a marker (or without a link in it) gets None rather
    # than aborting the whole scrape with an AttributeError.
    ref = row.find('sup', class_='reference')
    ref_link = ref.a if ref is not None else None
    record.append(ref_link.get('href') if ref_link is not None else None)

    data.append(record)

In [13]:
# Spot-check the first scraped record
data[0]

['1',
 'Walmart',
 'Retail',
 '$611,289',
 '$11,680',
 '2,100,000',
 'United States',
 'No',
 '#cite_note-W-1']

### converting scraped data into pandas dataframe

In [14]:
# Assemble the scraped records into a DataFrame, using the scraped header
# labels as the column names
df = pd.DataFrame(columns=column_names, data=data)

In [15]:
# Preview the first rows only; the full table is written to disk below
df.head(10)

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters[note 1],State-owned,Ref.
0,1,Walmart,Retail,"$611,289","$11,680",2100000,United States,No,#cite_note-W-1
1,2,Saudi Aramco,Oil and gas,"$603,651","$159,069",70496,Saudi Arabia,Yes,#cite_note-5
2,3,State Grid Corporation of China,Electricity,"$530,009","$8,192",870287,China,Yes,#cite_note-6
3,4,Amazon,Retail,"$513,983","−$2,722",1541000,United States,No,#cite_note-7
4,5,Vitol,Commodities,"$505,000","$15,000",1560,Switzerland,No,#cite_note-8
5,6,China National Petroleum Corporation,Oil and gas,"$483,019","$21,080",1087049,China,Yes,#cite_note-10
6,7,China Petrochemical Corporation,Oil and gas,"$471,154","$9,657",527487,China,Yes,#cite_note-11
7,8,ExxonMobil,Oil and gas,"$413,680","$55,740",63000,United States,No,#cite_note-:0-12
8,9,Apple,Electronics,"$394,328","$99,803",164000,United States,No,#cite_note-13
9,10,Shell,Oil and gas,"$386,201","$20,120",93000,United Kingdom,No,#cite_note-14


### saving data as csv file

In [16]:
# Write the frame as CSV to 'wiki_scrap' (relative path), leaving out the
# index column.
# NOTE(review): the file name has no .csv extension — confirm whether
# anything downstream expects this exact name before renaming it.
df.to_csv(path_or_buf='wiki_scrap', index=False)