In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

### Task 1: Logging function

In [3]:
def log_progress(message):
    ''' Append one timestamped entry for `message` to the run log.

    The entry has the form '<datetime.now()>: <message>' followed by a
    newline and is written to 'code_log.txt' in the current working
    directory. The file is opened in append mode, so earlier entries are
    kept. Function returns nothing. '''
    # Build the full log line first so the file is only touched once.
    entry = f'{datetime.now()}: {message}\n'
    with open('code_log.txt', mode='a') as log_file:
        log_file.write(entry)

In [4]:
# Web-archive snapshot (timestamp 20230908091635) of the Wikipedia
# "List of largest banks" page that extract() downloads.
url = (
    'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
)

# Column names for the data frame returned by extract(), in order:
# first the bank name, then the market-capitalisation value.
table_attribs = [
    "Bank_name",
    "Market_cap_USD_billions",
]

In [32]:
def extract(url, table_attribs, timeout=30):
    ''' Scrape the first HTML table body of the page at `url` into a
    data frame and return it for further processing.

    Parameters
    ----------
    url : str
        Address of the page to download.
    table_attribs : list of str
        Column names of the returned frame. The first name receives the
        text of each row's second cell (used here for the bank name) and
        the second name receives the text of the third cell (used here
        for the market capitalisation).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least three 'td' cells and a
        hyperlink in its second cell. Cell text is whitespace-stripped and
        kept as text (no numeric conversion is done here); an empty third
        cell is stored as a missing value.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no 'tbody' element.
    '''
    # Download the page. The timeout keeps a stalled server from hanging
    # the notebook, and raise_for_status() stops an error page from being
    # parsed as if it were data.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML and collect every table body on the page.
    data = BeautifulSoup(response.text, 'html.parser')
    tables = data.find_all('tbody')
    if not tables:
        # Same exception type the bare tables[0] lookup would raise, but
        # with a message that says what went wrong.
        raise IndexError(f'no <tbody> element found at {url}')

    # Only the first table body (index 0) is read.
    rows_to_add = []
    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')

        # Skip rows without 'td' cells (e.g. header rows), rows with fewer
        # than three cells, and rows whose second cell has no hyperlink.
        if len(cells) < 3 or not cells[1].find('a'):
            continue

        bank = cells[1].text.strip()
        mc_usd_billions = cells[2].text.strip()
        rows_to_add.append({
            table_attribs[0]: bank,
            table_attribs[1]: mc_usd_billions if mc_usd_billions else None,
        })

    # Build the frame in a single step; passing `columns` keeps the
    # requested column layout even when no row qualified.
    return pd.DataFrame(rows_to_add, columns=table_attribs)

In [34]:
# Run the extraction step and print the scraped table as plain text.
df = extract(url=url, table_attribs=table_attribs)
print(df)

                                 Bank_name Market_cap_USD_billions
0                           JPMorgan Chase                  432.92
1                          Bank of America                  231.52
2  Industrial and Commercial Bank of China                  194.56
3               Agricultural Bank of China                  160.68
4                                HDFC Bank                  157.91
5                              Wells Fargo                  155.87
6                        HSBC Holdings PLC                  148.90
7                           Morgan Stanley                  140.83
8                  China Construction Bank                  139.82
9                            Bank of China                  136.81
