# Largest Banks Project

## Import Librabies

In [20]:
#!python3 -m pip install -r requirement.txt

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

In [22]:
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
table_attbts_extract = ["Name", "MC_USD_Billion"]
table_attbts_final = ["Name", "MC_USD_Billion", "MC_GBP_Billion", "MC_EUR_Billion", "MC_INR_Billion"]
csv_path = "./Largest_banks_data.csv"
db_name = "Banks.db"
table_name = "Largest_banks"
log_file = "code_log.txt"

### Download Exchange Rate

#### Windows

In [23]:
!curl -O https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100    45  100    45    0     0     25      0  0:00:01  0:00:01 --:--:--    25


#### Linux

In [24]:
#!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv

In [25]:
rate_df = pd.read_csv("exchange_rate.csv")
key = rate_df['Currency']
value = rate_df['Rate']
rate_dict = dict(zip(key, value))

## Task 1

In [26]:
def log_progress(message):
    time_format = '%Y-%h-%d-%H:%M:%S'
    now = datetime.now()
    timestamp = now.strftime(time_format)
    with open(log_file, 'a') as f:
        f.write(timestamp + " : " + message + "\n")

## Task 2

In [27]:
def extract(url, table_attbts):
    page = requests.get(url)
    data = BeautifulSoup(page.text, 'html.parser')
    tables = data.find_all('tbody')
    # tables[0] is the table we need to find
    rows = tables[0].find_all('tr')
    df = pd.DataFrame(columns=table_attbts_extract)
    for row in rows:
        col = row.find_all('td')
        if(len(col) != 0):
            name, market_cap = col[1].text.strip(), col[2].text.strip()
            # print(name, market_cap, sep = "\t")
            data_dict = {table_attbts_extract[0]: name, table_attbts_extract[1]: market_cap}
            per_df = pd.DataFrame(data_dict, index=[0])
            df = pd.concat([df, per_df], ignore_index=True)

    return df

In [34]:
banks_df = extract(url, table_attbts_extract)

In [35]:
banks_df

Unnamed: 0,Name,MC_USD_Billion
0,JPMorgan Chase,432.92
1,Bank of America,231.52
2,Industrial and Commercial Bank of China,194.56
3,Agricultural Bank of China,160.68
4,HDFC Bank,157.91
5,Wells Fargo,155.87
6,HSBC Holdings PLC,148.9
7,Morgan Stanley,140.83
8,China Construction Bank,139.82
9,Bank of China,136.81


## Task 3

In [28]:
def transform(df):
    market_cap_list = [float(x) for x in df['MC_USD_Billion'].to_list()] # convert from str to float
    format_str = "MC_{}_Billion"
    for key, value in rate_dict.items():
        df[format_str.format(key)] = [np.round(x * value, 2) for x in  market_cap_list]
    return df

In [36]:
banks_df = transform(banks_df)
banks_df

Unnamed: 0,Name,MC_USD_Billion,MC_EUR_Billion,MC_GBP_Billion,MC_INR_Billion
0,JPMorgan Chase,432.92,402.62,346.34,35910.71
1,Bank of America,231.52,215.31,185.22,19204.58
2,Industrial and Commercial Bank of China,194.56,180.94,155.65,16138.75
3,Agricultural Bank of China,160.68,149.43,128.54,13328.41
4,HDFC Bank,157.91,146.86,126.33,13098.63
5,Wells Fargo,155.87,144.96,124.7,12929.42
6,HSBC Holdings PLC,148.9,138.48,119.12,12351.26
7,Morgan Stanley,140.83,130.97,112.66,11681.85
8,China Construction Bank,139.82,130.03,111.86,11598.07
9,Bank of China,136.81,127.23,109.45,11348.39


## Task 4

In [29]:
def load_to_csv(df, csv_path):
    df.to_csv(csv_path)

## Task 5

In [30]:
def load_to_db(df, sql_connection):
    df.to_sql(table_name, sql_connection, if_exists='replace', index=False)

## Task 6

In [31]:
def run_query(query_statement, sql_connection):
    print(query_statement)
    query_output = pd.read_sql(query_statement, sql_connection)
    print(query_output)

## Task 7

In [32]:
sql_connection = sqlite3.connect(db_name)
log_progress("Connected to Database! Ready for process.")

banks_df = extract(url, table_attbts_extract)
log_progress("Data Extracted Completely, ready for tranformation.")

print(banks_df)

banks_df = transform(banks_df)
log_progress("Data Transformated Completely, loading to csv.")

load_to_csv(banks_df, csv_path)
log_progress("Loaded to csv, loading to database.")

load_to_db(banks_df, sql_connection)
log_progress("Data Loading Succesfully, loading query statement.")

sql_statement = f"SELECT * FROM {table_name} WHERE MC_USD_Billion > 150"
run_query(sql_statement, sql_connection)

log_progress("Process End.")

sql_connection.close()

   Name MC_USD_Billion
0   SEP            Oct
1    08               
2  2023           2024


ValueError: could not convert string to float: 'Oct'

In [None]:
sql_connection = sqlite3.connect(db_name)
statement = f"SELECT AVG(MC_GBP_Billion) FROM Largest_banks"
run_query(statement, sql_connection)

SELECT AVG(MC_GBP_Billion) FROM Largest_banks
   AVG(MC_GBP_Billion)
0              151.987
