# WEB SCRAPING WITH REQUESTS AND BEAUTIFUL SOUP

### Installing important libraries

In [78]:
!pip install beautifulsoup4 --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install requests --upgrade --quiet

### Importing useful libraries

In [79]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time

### URL CONTENT PULLING

Fetching the web page to be scraped into the code

In [80]:
# Page listing the companies to scrape.
url_path = "https://www.value.today/world/world-top-500-companies"
# Without a timeout `requests.get` can block forever if the server never answers;
# 30 seconds is generous for a single page and keeps the cell from hanging.
responses = requests.get(url_path, timeout=30)

### STATUS CODE CHECK

It is important to check whether the web page is eligible for scraping, i.e. whether scraping is possible on it.

Note that this can be done before pulling the web content, by consulting the site's robots.txt, OR;

after the web content has been pulled, by checking the status_code.

A status code in the range 200 - 299 means the request succeeded and the web content was retrieved, so it can be scraped; any other status code means the content could not be fetched as requested.

In [81]:
# Display the HTTP status code carried by the response object.
responses.status_code

200

### INSPECTION AND EXTRACTION

This is the stage where we inspect the web content we just pulled, AND;

extract the useful data / information we wish to document for further research and analysis.

In [82]:
# Parse the downloaded HTML with Python's built-in parser, then collect every
# <div> matching the 'row well views-row' class filter (iterated later as
# one entry per company).
soup = BeautifulSoup(markup=responses.text, features='html.parser')
companies = soup.find_all(name='div', attrs={'class': 'row well views-row'})

### Data Extraction Operation

In [83]:
# Every field is looked up through a <div> whose class attribute follows this
# pattern; only the field-specific slug in the middle differs.
FIELD_CLASS_TEMPLATE = 'views-field views-field-field-{} clearfix col-sm-12'
# Attribute filter for the fields whose value is read from <span class="field-content">.
FIELD_CONTENT_SPAN = {'class': 'field-content'}


def _field_text(company_tag, field_slug, span_attrs=None):
    """Return the text of one field inside a company block, or None if it is missing.

    Parameters
    ----------
    company_tag : bs4 tag
        One element of ``companies``.
    field_slug : str
        Field-specific part of the <div> class (inserted into FIELD_CLASS_TEMPLATE).
    span_attrs : dict, optional
        Attribute filter for the <span> to read. When omitted, the first <span>
        inside the field's <div> is used.

    Returns
    -------
    str or None
        The span's text, or None when the <div> or the <span> is absent.
    """
    container = company_tag.find('div', {'class': FIELD_CLASS_TEMPLATE.format(field_slug)})
    if container is None:
        return None
    if span_attrs is None:
        span = container.find('span')
    else:
        span = container.find('span', span_attrs)
    return None if span is None else span.text


def _to_int_or_none(text):
    """Convert scraped text to int; return None for missing or non-numeric text."""
    if text is None:
        return None
    try:
        return int(text)
    except ValueError:
        # A malformed rank is recorded as missing instead of aborting the whole scrape.
        return None


# One list per column, each aligned with ``companies`` (None marks a missing value).
world_rank = [_to_int_or_none(_field_text(tag, 'world-rank-jan-2020')) for tag in companies]
market_capital = [_field_text(tag, 'market-value-jan-2020', FIELD_CONTENT_SPAN) for tag in companies]
head_quarters = [_field_text(tag, 'headquarters-of-company') for tag in companies]
sectors = [_field_text(tag, 'company-category-primary') for tag in companies]
annual_revenue = [_field_text(tag, 'annual-revenue') for tag in companies]
annual_net_income = [_field_text(tag, 'annual-net-income-lc') for tag in companies]
total_assets = [_field_text(tag, 'total-assets') for tag in companies]
total_employees = [_field_text(tag, 'employee-count') for tag in companies]
ceos = [_field_text(tag, 'ceo', FIELD_CONTENT_SPAN) for tag in companies]


### PARSING
This is the stage where we save the extracted data as a .csv file for safekeeping

### CREATING DICTIONARY OF THE EXTRACTED DATA

In [84]:
companies = {
    "World Rank": world_rank,
    "Market Capital": market_capital,
    "Headquarters": head_quarters,
    "Sectors": sectors,
    "Annual Revenue": annual_revenue,
    "Annual Net Income": annual_net_income,
    "Total Assests": total_assets,
    "Total Employees": total_employees,
    "CEOs": ceos
}

### CONVERTING DICTIONARY INTO DATAFRAME

In [85]:
company_data_frame = pd.DataFrame(companies, dtype=object)

### SAVING THE DATAFRAME AS A CSV FILE

In [91]:
# Write the CSV relative to the kernel's working directory instead of a
# machine-specific absolute path, so the cell also runs on other machines.
company_data_frame.to_csv("Top 500 Companies.csv")

In [92]:
# Display the resulting table.
company_data_frame

Unnamed: 0,World Rank,Market Capital,Headquarters,Sectors,Annual Revenue,Annual Net Income,Total Assests,Total Employees,CEOs
0,1,1898.10 Billion USD,Saudi Arabia,"Energy, Oil and Gas, Chemicals, Oil Refining, ...",Saudi Aramco Annual Revenues for December-2020...,Saudi Aramco Annual Net Income for December-20...,Total Assets of Saudi Aramco as on December-20...,66800,Amin H. Al-Nasser
1,2,1323.00 Billion USD,USA,"Technology, Mobiles & Accessories, Electronics...",Apple Annual Revenues for September-2020 endin...,Apple Annual Net Income for September-2020 end...,"Apple Total Assets as on June-27-2020 is 317,3...",147000,Tim Cook
2,3,1215.00 Billion USD,USA,"Technology, Software and IT, Laptops, Video Ga...",Microsoft Annual Revenues for June-2020 ending...,Microsoft Annual Net Income for June-2020 endi...,"Total Assets of Microsoft Corporation is 301,3...",156439,Satya Nadella
3,4,943.90 Billion USD,USA,"Technology, Internet or Mobile App Based Busin...",Alphabet Annual Revenues for December-2020 end...,Alphabet Annual Net Income for December-2020 e...,299.243 Billion USD as on Sep-2020,156500,Sundar Pichai
4,5,941.03 Billion USD,USA,"eCommerce, Internet or Mobile App Based Busine...",Amazon Annual Revenues for December-2020 endin...,Amazon Annual Net Income for December-2020 end...,"Total Assets of Amazon as on Jun-2020 is 110,9...",1335000,Andy Jassy
...,...,...,...,...,...,...,...,...,...
495,471,32.65 Billion USD,Japan,"Consumer Defensive, Retail, Super Markets, Con...",SEVEN & I HOLDINGS Annual Revenues as on Decem...,SEVEN & I HOLDINGS Annual Net Income as on Dec...,,98039,Ryuichi Isaka
496,472,32.61 Billion USD,Italy,"Financial Services, Insurance",ASSICURAZIONI GENERALI Annual Revenues as on D...,ASSICURAZIONI GENERALI Annual Net Income as on...,,72000,Philippe Donnet
497,473,32.53 Billion USD,USA,"Technology, Electronics, Cables and Wires, Ele...",AMPHENOL CORPORATION Annual Revenues as on Dec...,AMPHENOL CORPORATION Annual Net Income as on D...,,80000,Richard Adam Norwitt
498,474,32.52 Billion USD,USA,"Consumer Defensive, Food Products, FMCG, Dairy...",GENERAL MILLS Annual Revenues as on December-2...,GENERAL MILLS Annual Net Income as on December...,GENERAL MILLS Total Assets as on August-2020 i...,35000,Jeff Harmening
