# Extract, Transform, Load with Web Scraping


This Python script scrapes information about the largest banks by market capitalization from a Wikipedia page and stores it both as a CSV file and as a JSON file.

## Import Libraries

In [1]:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd

## Extract Data Using Web Scraping

In [2]:
# Connect to website and download site information.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_largest_banks', timeout=30)
response.raise_for_status()
html_data = response.text

# Parse website information using BeautifulSoup module
soup_html = soup(html_data, "lxml")

In [4]:
# Collect one record per usable table row and build the dataframe once at the
# end. DataFrame.append was removed in pandas 2.0, and it copied the whole
# frame on every call, so a plain list of dicts is both portable and cheaper.
records = []

# Find information of interest, in this case, the bank names and their market cap.
# The table is taken from the fourth <tbody> element on the page (index 3).
for row in soup_html.find_all('tbody')[3].find_all('tr'):
    col = row.find_all('td')
    try:
        name = col[1].text.strip()
        cap = float(col[2].text.strip())
    except (IndexError, ValueError):
        # Skip rows that have fewer than three <td> cells or whose third
        # cell is not a plain number; any other error is left to surface.
        continue
    records.append({"Name": name, "Market Cap (US$ Billion)": cap})

# Passing the column list explicitly keeps both columns even if no row matched
data = pd.DataFrame(records, columns=["Name", "Market Cap (US$ Billion)"])

# Print dataframe
data

Unnamed: 0,Name,Market Cap (US$ Billion)
0,JPMorgan Chase,488.470
1,Bank of America,379.250
2,Industrial and Commercial Bank of China,246.500
3,Wells Fargo,308.013
4,China Construction Bank,257.399
...,...,...
65,Ping An Bank,37.993
66,Standard Chartered,37.319
67,United Overseas Bank,35.128
68,QNB Group,33.560


### Load Data to a CSV File

In [5]:
# Write the scraped table to a CSV file; the relative path resolves against
# the current working directory.
data.to_csv(path_or_buf='LargestBanksByCap.csv')

### Load Data to a JSON File

In [6]:
# Write the scraped table to a JSON file; the relative path resolves against
# the current working directory.
data.to_json(path_or_buf='LargestBanksByCap.json')

## Author


Taiwo Fawumi
taiwo.fawumi@yahoo.com