# <span style="color:darkblue"> Project 1 Zeyuan Wang </span>

In [1]:
# import packages 


import pandas as pd
from lxml import html
# Work with time data
import time 

# Conduct HTTP requests
import requests

# Construct tree structure of HTML data
import html5lib

# Parse HTML data obtained from scraping
from bs4 import BeautifulSoup

# Import webdriver for chrome
from webdriver_manager.chrome import ChromeDriverManager



# Automate navigating within browser (SELENIUM)
#------ Key: Manage keys
#------ Select: Obtain features from website
#------ WebDriverWait: Add wait times implicitly
#------ By: Use common information locator strategies
#------ EC and Options: Browser configuration
#------ remote.command: Check whether browser is active

from selenium import webdriver #to automate the navigating within the browser
from selenium.webdriver.chrome.service import service
from selenium.webdriver.common.keys    import Keys
from selenium.webdriver.support.ui     import Select
from selenium.webdriver.support.ui     import WebDriverWait 
from selenium.webdriver.common.by      import By
from selenium.webdriver.support        import expected_conditions as EC
from selenium.webdriver.chrome.options import Options #to use properties of the chrome webbrowser
from selenium.webdriver.remote.command import Command # Use to check whether the web driver is active


In [8]:
# Initialize the web driver.

# Open a visible (non-headless) Chrome window to start web scraping.
# Chrome runs with a window by default, so no headless setting is needed here.
# The `Options.headless` attribute is deprecated in Selenium 4.8+ (and removed in
# later releases), so it is deliberately not assigned. To run without a window,
# use `options.add_argument("--headless=new")` instead.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

# Navigate to the landing page of the Chinese Loans to Africa database
starting_url = 'https://www.bu.edu/gdp/chinese-loans-to-africa-database/'
driver.get(starting_url)


In [9]:
# The landing page shows a button that must be clicked to reach the main data page.

# Upper bound, in seconds, for each explicit wait below. The waits return as soon
# as their condition holds, so this is a timeout rather than a fixed delay.
PAGE_WAIT_SECONDS = 15
wait = WebDriverWait(driver, PAGE_WAIT_SECONDS)

# Find the button using its XPath, waiting until it is actually clickable
# (a fixed sleep is either too short on a slow connection or wastefully long).
explore_button = wait.until(
    EC.element_to_be_clickable((By.XPATH, "/html/body/div/div/div/div[4]/button"))
)

explore_button.click()

# Wait to see the result: block until a loan row is visible on the data page.
# NOTE(review): assumes all "loanRow" elements are rendered together once the
# first one is visible; confirm against the live page.
wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "loanRow")))


<font size = "5">
From this page, we want to extract data from the first table, which contains project information on Chinese loans to Africa in the Energy sector. 

<font size = "5">
First, extract all information needed from the first table. 

In [11]:
# Reference XPath of one cell in the header of the first table:
# /html/body/div/div/div/div[4]/table[1]/thead/tr[3]/td[1]

# Absolute XPath for the first table on the page
table_xpath = '/html/body/div/div/div/div[4]/table[1]'

# Collect the element(s) matching the table XPath.
# NOTE(review): despite its name, `rows` holds the matched <table> element(s),
# not the table's rows, and it is not read by the cells that follow before it is
# reassigned.
rows = driver.find_elements(By.XPATH, table_xpath )

# Extract all project rows: every element with the class name "loanRow".
# NOTE(review): this lookup searches the whole page rather than only the element
# at `table_xpath` -- confirm that only the first table contributes "loanRow" rows.
project_rows = driver.find_elements(By.CLASS_NAME, "loanRow")


In [12]:
# Column names for the project table
columns = ["Country Project No.", "Project Name", "Year", "Lender", "Allocation (USD)"]

# For every project row, gather the text of each of its <td> cells;
# the result is one inner list per row
data = [
    [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
    for row in project_rows
]

# Assemble the extracted records into a DataFrame
df = pd.DataFrame(data, columns=columns)

# Show the resulting table
print(df)

    Country Project No.                                       Project Name  \
0                     1       Luanda Electrical Network Expansion, Phase I   
1                     2                        Lubango Electricity Network   
2                     3                     Namibe and Tombowa Electricity   
3                     4  Luanda Electrical Network Rehabilitation and E...   
4                     5       Quifangondo-Mabubas Transmission Line (60kV)   
..                  ...                                                ...   
202                   1                    Donsin Solar Power Plant (25MW)   
203                   1               Boali No.3 Hydropower Project (10MW)   
204                   2        Sakai Solar Photovoltaic Power Plant (15MW)   
205                   1  Addis-Djibouti Railway Electrification, Nagad-...   
206                   1            Financing for Central Electricity Board   

     Year  Lender Allocation (USD)  
0    2002  CHEXIM         

In [13]:
# The country name is also in the table, in the first row of each country's group.
# Extract it together with the loan-count text shown in that same row.

# Locate the first table again
table_xpath = "/html/body/div/div/div/div[4]/table[1]"
table = driver.find_element(By.XPATH, table_xpath)

# Initialize a list to store country names and loan numbers
country_data = []

# Rows with country information are the body rows whose inline style
# starts with 'font-weight: 600'
rows = table.find_elements(By.XPATH, "./tbody/tr[starts-with(@style, 'font-weight: 600')]")

for row in rows:
    # Country name is read from the 2nd cell, loan information from the 3rd
    country_name = row.find_element(By.XPATH, ".//td[2]").text
    loan_info = row.find_element(By.XPATH, ".//td[3]").text
    country_data.append([country_name, loan_info])

# Create a DataFrame for country name and number of loans
df2 = pd.DataFrame(country_data, columns=["Country", "Number of Loans"])

# Make the second variable numerical: keep the first run of digits in the text.
# The pattern is a raw string so that `\d` reaches the regex engine unchanged;
# in a normal string literal `\d` is an invalid escape sequence and Python
# emits a warning for it.
df2['Number of Loans'] = df2['Number of Loans'].str.extract(r'(\d+)').astype(float)

# Display the DataFrame
print(df2)




  df2['Number of Loans'] = df2['Number of Loans'].str.extract('(\d+)').astype(float)


                              Country  Number of Loans
0                              Angola             40.0
1                        South Africa              3.0
2                            Ethiopia             20.0
3                              Zambia             16.0
4                               Sudan             20.0
5                              Uganda              7.0
6                               Ghana             14.0
7                   Equatorial Guinea              8.0
8                               Kenya             16.0
9                       Côte d’Ivoire              5.0
10                             Guinea              3.0
11                           Zimbabwe              5.0
12                            Nigeria              3.0
13                           Tanzania              2.0
14  Congo, Democratic Republic of the              4.0
15                           Cameroon              6.0
16             Congo, Republic of the              4.0
17        

In [14]:
# To merge country names into the project data, we need a new column that
# repeats each country name once per loan listed for that country.

# Series.repeat expects integer repeat counts, but 'Number of Loans' is stored
# as float; cast explicitly instead of relying on an implicit float -> int
# conversion inside pandas/numpy.
loan_counts = df2['Number of Loans'].astype(int)

# Create a new Series repeating the country names
repeated_categories = df2['Country'].repeat(loan_counts).reset_index(drop=True)

# Convert to DataFrame and tag every row with its sector
df3 = pd.DataFrame(repeated_categories, columns=['Country'])
df3['Loan_Type'] = 'Energy'

# The concat below pairs rows purely by position, so the repeated country labels
# must cover exactly the scraped project rows; otherwise rows would be silently
# mislabelled or padded with NaN.
if len(df3) != len(df):
    raise ValueError(
        f"Loan counts sum to {len(df3)} but {len(df)} project rows were scraped; "
        "country labels cannot be aligned with projects."
    )

# Lastly, merge the data to have the complete list of Energy loans
merged_Energy = pd.concat([df, df3], axis=1)

# Display result
print(merged_Energy)


    Country Project No.                                       Project Name  \
0                     1       Luanda Electrical Network Expansion, Phase I   
1                     2                        Lubango Electricity Network   
2                     3                     Namibe and Tombowa Electricity   
3                     4  Luanda Electrical Network Rehabilitation and E...   
4                     5       Quifangondo-Mabubas Transmission Line (60kV)   
..                  ...                                                ...   
202                   1                    Donsin Solar Power Plant (25MW)   
203                   1               Boali No.3 Hydropower Project (10MW)   
204                   2        Sakai Solar Photovoltaic Power Plant (15MW)   
205                   1  Addis-Djibouti Railway Electrification, Nagad-...   
206                   1            Financing for Central Electricity Board   

     Year  Lender Allocation (USD)                   Country Lo