# <span style="color:darkblue"> Project 1 Zeyuan Wang </span>

In [1]:
# import packages 


import pandas as pd
from lxml import html
# Work with time data
import time 

# Conduct HTTP requests
import requests

# Construct tree structure of HTML data
import html5lib

# Parse HTML data obtained from scraping
from bs4 import BeautifulSoup

# Import webdriver for chrome
from webdriver_manager.chrome import ChromeDriverManager



# Automate navigating within browser (SELENIUM)
#------ Key: Manage keys
#------ Select: Obtain features from website
#------ WebDriverWait: Add wait times implicitly
#------ By: Use common information locator strategies
#------ EC and Options: Browser configuration
#------ remote.command: Check whether browser is active

from selenium import webdriver #to automate the navigating within the browser
from selenium.webdriver.chrome.service import service
from selenium.webdriver.common.keys    import Keys
from selenium.webdriver.support.ui     import Select
from selenium.webdriver.support.ui     import WebDriverWait 
from selenium.webdriver.common.by      import By
from selenium.webdriver.support        import expected_conditions as EC
from selenium.webdriver.chrome.options import Options #to use properties of the chrome webbrowser
from selenium.webdriver.remote.command import Command # Use to check whether the web driver is active


In [2]:
# Initialize the web driver.

# Open a visible (non-headless) Chrome window to start web scraping.
# Chrome runs with a visible window by default, so no flag is required. The
# old `options.headless` attribute is deprecated in Selenium 4.8+ (and removed
# in later releases), so it is intentionally not set here.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

# Navigate to the landing page of the database
starting_url = 'https://www.bu.edu/gdp/chinese-loans-to-africa-database/'
driver.get(starting_url)


In [3]:
# There is a button that must be clicked to get to the main data page, so we
# need to navigate the website.

# Wait (up to 10 s) until the button is clickable instead of sleeping for a
# fixed time: this continues as soon as the page is ready and raises a
# TimeoutException if the button never shows up.
explore_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "/html/body/div/div/div/div[4]/button"))
)

explore_button.click()

# Wait until the first table shows at least one country header row (the same
# locator the scraping cells use) before moving on.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((
        By.XPATH,
        "/html/body/div/div/div/div[4]/table[1]"
        "/tbody/tr[starts-with(@style, 'font-weight: 600')]",
    ))
)


<font size = "5">
Scraping Algorithm

<font size = "4">
From this page, we want to extract the first and second tables, which contain project-level information on Chinese loans to Africa in the energy and transportation sectors.

<font size = "4">
First, extract all information needed from the first table. 

In [10]:
# The table collapses its rows; to find the relevant HTML and extract the data, we need to expand the first table.
# Note: the website did not make this HTML element interactable, but the table can be expanded manually in the browser.

#the following code does not work: 
# explore_button = driver.find_element(By.XPATH, "/html/body/div/div/div/div[4]/table[1]/thead/tr[3]/td[1]")
    
# explore_button.click()

#maybe there are other implicit ways of doing the interaction which I did not find


In [4]:

# Now we can find all rows with project information.

# XPath for the first table
table_xpath = '/html/body/div/div/div/div[4]/table[1]'

# Locate the first table and collect its project rows. The search is run on
# the table element rather than on the whole page, so rows that belong to any
# other table cannot end up in the result.
table = driver.find_element(By.XPATH, table_xpath)
project_rows = table.find_elements(By.CLASS_NAME, "loanRow")


In [5]:
# Column labels for the scraped project table
columns = ["Country Project No.", "Project Name", "Year", "Lender", "Allocation (USD)"]

# Collect the text of every <td> cell, one list per project row
data = [
    [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
    for row in project_rows
]

# Build the DataFrame from the scraped rows
df = pd.DataFrame(data, columns=columns)

# Show the result
print(df)

    Country Project No.                                       Project Name  \
0                     1       Luanda Electrical Network Expansion, Phase I   
1                     2                        Lubango Electricity Network   
2                     3                     Namibe and Tombowa Electricity   
3                     4  Luanda Electrical Network Rehabilitation and E...   
4                     5       Quifangondo-Mabubas Transmission Line (60kV)   
..                  ...                                                ...   
202                   1                    Donsin Solar Power Plant (25MW)   
203                   1               Boali No.3 Hydropower Project (10MW)   
204                   2        Sakai Solar Photovoltaic Power Plant (15MW)   
205                   1  Addis-Djibouti Railway Electrification, Nagad-...   
206                   1            Financing for Central Electricity Board   

     Year  Lender Allocation (USD)  
0    2002  CHEXIM         

In [6]:
# The country name is also in the table, as the first row for each country,
# so we extract it together with the loan count shown in that row.

# Locate the first table again
table_xpath = "/html/body/div/div/div/div[4]/table[1]"
table = driver.find_element(By.XPATH, table_xpath)

# Initialize a list to store country names and loan numbers
country_data = []

# Loop through the rows with country information (the bold header rows)
rows = table.find_elements(By.XPATH, "./tbody/tr[starts-with(@style, 'font-weight: 600')]")

for row in rows:
    # Find country name and loan number from the row
    country_name = row.find_element(By.XPATH, ".//td[2]").text
    loan_info = row.find_element(By.XPATH, ".//td[3]").text
    country_data.append([country_name, loan_info])

# Create a DataFrame for country name and number of loans
df2 = pd.DataFrame(country_data, columns=["Country", "Number of Loans"])

# Make the second variable numerical.
# - The pattern is a raw string so that `\d` is not treated as an (invalid)
#   string escape, which is what triggered the warning on this line.
# - expand=False returns a Series instead of a one-column DataFrame.
# - The counts are stored as integers because they are later used as repeat
#   counts; a row without any digits makes the cast fail loudly here.
df2['Number of Loans'] = (
    df2['Number of Loans'].str.extract(r'(\d+)', expand=False).astype(int)
)




  df2['Number of Loans'] = df2['Number of Loans'].str.extract('(\d+)').astype(float)


<font size = "4"> 

Merge Data and Create Data Frame


In [7]:
# To merge country names into the project data, we need a new column that
# repeats each country name according to its number of loans.

# Repeat counts must be integers, so cast explicitly before repeating
loan_counts = df2['Number of Loans'].astype(int)
repeated_categories = df2['Country'].repeat(loan_counts).reset_index(drop=True)

# Convert to DataFrame and tag the sector
df3 = pd.DataFrame(repeated_categories, columns=['Country'])
df3['Loan_Type'] = 'Energy'

# The merge below is purely positional, so the two frames must have the same
# number of rows; otherwise countries would silently be attached to the wrong
# projects (or padded with NaN).
if len(df3) != len(df):
    raise ValueError(
        f"Country labels ({len(df3)}) do not match project rows ({len(df)})"
    )

# Lastly, merge the data to have the complete list of Energy loans
merged_Energy = pd.concat([df, df3], axis=1)

# Display result
print(merged_Energy)


    Country Project No.                                       Project Name  \
0                     1       Luanda Electrical Network Expansion, Phase I   
1                     2                        Lubango Electricity Network   
2                     3                     Namibe and Tombowa Electricity   
3                     4  Luanda Electrical Network Rehabilitation and E...   
4                     5       Quifangondo-Mabubas Transmission Line (60kV)   
..                  ...                                                ...   
202                   1                    Donsin Solar Power Plant (25MW)   
203                   1               Boali No.3 Hydropower Project (10MW)   
204                   2        Sakai Solar Photovoltaic Power Plant (15MW)   
205                   1  Addis-Djibouti Railway Electrification, Nagad-...   
206                   1            Financing for Central Electricity Board   

     Year  Lender Allocation (USD)                   Country Lo

<font size = "4">

To extract data for different items (from table 2), we turn to the second table, which involves projects in the transportation sector.

In [8]:
# Here, I basically repeat the scraping algorithm used in the first section.

# The table collapses its rows; to find the relevant HTML and extract the
# data, we need to expand the second table. This time, however, the expand
# control is interactable.
explore_button = driver.find_element(By.XPATH, "/html/body/div/div/div/div[4]/table[2]/thead/tr[3]/td[1]")

explore_button.click()

# Wait (up to 10 s) until the second table actually contains project rows, so
# that the next cell does not scrape a table that is still being expanded.
# `until` returns as soon as the lambda yields a non-empty list.
WebDriverWait(driver, 10).until(
    lambda d: d.find_element(
        By.XPATH, "/html/body/div/div/div/div[4]/table[2]"
    ).find_elements(By.CLASS_NAME, "loanRow")
)





In [9]:


# Now we can find all rows with project information.

# XPath for the second table
table_xpath_2 = '/html/body/div/div/div/div[4]/table[2]'

# Locate the second table and collect its project rows. The search is run on
# the table element rather than on the whole page, so rows that belong to the
# first table cannot end up in the transportation data.
table = driver.find_element(By.XPATH, table_xpath_2)
project_rows = table.find_elements(By.CLASS_NAME, "loanRow")

# Collect the text of every <td> cell, one list per project row
data = [
    [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
    for row in project_rows
]

# Define column names
columns = ["Country Project No.", "Project Name", "Year", "Lender", "Allocation (USD)"]

# Create a DataFrame with the extracted data
df4 = pd.DataFrame(data, columns=columns)

# Display the DataFrame
print(df4)


    Country Project No.                                       Project Name  \
0                     1  Luanda Railway, Rehabilitation, Phase I, Luand...   
1                     2                             Road Transport Support   
2                     3          Benguela Railway, Huambo Railway (1344km)   
3                     4         Kifangondo-Caxito-Uíge-Negage Road (371km)   
4                     5          Caxito-Nzeto Road, Rehabilitation (208km)   
..                  ...                                                ...   
331                   1      Roberts International Airport, Rehabilitation   
332                   1        Access Road to Mugere Hydroelectric Project   
333                   2                                MA60 Plane Purchase   
334                   1       Purchase of Y-12E Utility Aircraft (2 Units)   
335                   1  Moroni, Prince Said Ibrahim International Airport   

     Year  Lender Allocation (USD)  
0    2002  CHEXIM         

In [10]:
# The country name is also in the table, as the first row for each country,
# so we extract it together with the loan count shown in that row.

# Locate the second table again
table_xpath = "/html/body/div/div/div/div[4]/table[2]"
table = driver.find_element(By.XPATH, table_xpath)

# Initialize a list to store country names and loan numbers
country_data = []

# Loop through the rows with country information (the bold header rows)
rows = table.find_elements(By.XPATH, "./tbody/tr[starts-with(@style, 'font-weight: 600')]")

for row in rows:
    # Find country name and loan number from the row
    country_name = row.find_element(By.XPATH, ".//td[2]").text
    loan_info = row.find_element(By.XPATH, ".//td[3]").text
    country_data.append([country_name, loan_info])

# Create a DataFrame for country name and number of loans
df5 = pd.DataFrame(country_data, columns=["Country", "Number of Loans"])

# Make the second variable numerical.
# - The pattern is a raw string so that `\d` is not treated as an (invalid)
#   string escape, which is what triggered the warning on this line.
# - expand=False returns a Series instead of a one-column DataFrame.
# - The counts are stored as integers because they are used as repeat counts
#   below; a row without any digits makes the cast fail loudly here.
df5['Number of Loans'] = (
    df5['Number of Loans'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Merge country names into the project data:
# repeat each country name once per loan listed for that country
repeated_categories = df5['Country'].repeat(df5['Number of Loans']).reset_index(drop=True)

# Convert to DataFrame and tag the sector
df6 = pd.DataFrame(repeated_categories, columns=['Country'])
df6['Loan_Type'] = 'Transportation'

# The merge below is purely positional, so the two frames must have the same
# number of rows; otherwise countries would silently be attached to the wrong
# projects (or padded with NaN).
if len(df6) != len(df4):
    raise ValueError(
        f"Country labels ({len(df6)}) do not match project rows ({len(df4)})"
    )

# Lastly, merge the data to have the complete list of Transportation loans
merged_Trans = pd.concat([df4, df6], axis=1)






  df5['Number of Loans'] = df5['Number of Loans'].str.extract('(\d+)').astype(float)


<font size = "4">

In the end, we merge all data for the first and the second table. 

In [11]:
# Stack the energy and transportation loans into one frame with a fresh index
merged_df = pd.concat((merged_Energy, merged_Trans), axis=0, ignore_index=True)

# Show the combined result
print(merged_df)


    Country Project No.                                       Project Name  \
0                     1       Luanda Electrical Network Expansion, Phase I   
1                     2                        Lubango Electricity Network   
2                     3                     Namibe and Tombowa Electricity   
3                     4  Luanda Electrical Network Rehabilitation and E...   
4                     5       Quifangondo-Mabubas Transmission Line (60kV)   
..                  ...                                                ...   
538                   1      Roberts International Airport, Rehabilitation   
539                   1        Access Road to Mugere Hydroelectric Project   
540                   2                                MA60 Plane Purchase   
541                   1       Purchase of Y-12E Utility Aircraft (2 Units)   
542                   1  Moroni, Prince Said Ibrahim International Airport   

     Year  Lender Allocation (USD)     Country       Loan_Type 

<font size = "5">
Results

In [12]:
# Quality check: per-column count of non-null observations and missing values
observed_per_column = merged_df.count()
missing_per_column = merged_df.isna().sum()

summary_table = pd.DataFrame({
    'Total Observations': observed_per_column,
    'Missing Values': missing_per_column,
})

print(summary_table)

                     Total Observations  Missing Values
Country Project No.                 543               0
Project Name                        543               0
Year                                543               0
Lender                              543               0
Allocation (USD)                    543               0
Country                             543               0
Loan_Type                           543               0


In [13]:

#the main number in the data is the allocation amount, but we need to transform that into numerical variables



# Create a function to convert money string to numerical values
def money_to_num(money_str):
    """Convert a money string such as ``'$1.5M'`` into a float amount.

    A trailing ``K``, ``M`` or ``B`` (upper or lower case) scales the number
    by 1e3, 1e6 or 1e9 respectively. Dollar signs, thousands separators and
    surrounding whitespace are ignored.

    Parameters
    ----------
    money_str : str or number or None
        The value to convert, e.g. ``'$300K'``, ``'$1,250M'`` or ``'$42'``.

    Returns
    -------
    float
        The amount as a plain number. ``None`` and blank strings yield NaN;
        values that are already numeric are returned as ``float``.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed as a number.
    """
    # Missing or already-numeric cells: nothing to parse
    if money_str is None:
        return float('nan')
    if not isinstance(money_str, str):
        return float(money_str)

    # Drop the currency sign, thousands separators and padding
    cleaned = money_str.replace('$', '').replace(',', '').strip()
    if not cleaned:
        return float('nan')

    # Only a *trailing* letter is treated as a magnitude suffix
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    suffix = cleaned[-1].upper()
    if suffix in multipliers:
        return float(cleaned[:-1]) * multipliers[suffix]
    return float(cleaned)

# Convert every allocation string to a number and keep it as a new column
merged_df['Amount_Numeric'] = merged_df['Allocation (USD)'].map(money_to_num)

# Summary statistics of the numeric allocation
amounts = merged_df['Amount_Numeric']
mean_value = amounts.mean()
variance_value = amounts.var()

print("Allocation Mean:", mean_value)
print("Allocation Variance:", variance_value)


Allocation Mean: 212508655.6169429
Allocation Variance: 2.9922518234191635e+17


<font size = "4">

Store dataset in a different file

In [25]:

# Write the combined dataset (including its row index) to a CSV file
output_path = "project_1.csv"
merged_df.to_csv(output_path)
