In [None]:
# import libraries
from datetime import date
import numpy as np
import requests
from bs4 import BeautifulSoup
import lxml
import re
import pandas as pd
import os
import io
from datetime import datetime, timedelta
import functions # specific module for additional functions for this code
from glob import glob

In [None]:
# Download the HTML edition of the UK Sanctions List (preferred here over the XML edition).
url = 'https://docs.fcdo.gov.uk/docs/UK-Sanctions-List.html'
# requests applies no timeout unless one is given, so without it a stalled
# connection would block this cell forever.
response = requests.get(url, timeout=60)
# Stop here on an HTTP error status instead of parsing an error page further down.
response.raise_for_status()
html = response.text

In [None]:
# Split the whole page into a list of text chunks, one per entity id.
# The text is cut right before every occurrence of the literal label
# "Unique ID" (zero-width look-ahead split, supported since Python 3.7).
# Cutting on the label itself, rather than on an inserted '##' marker,
# means a '##' that already occurs in the page cannot break a chunk apart.
string_html = html.replace('\n', ' ').strip()
pattern = r'(?=Unique ID)'
list_html = re.split(pattern, string_html)
# drop chunks that are empty or whitespace-only
list_html = [x.strip() for x in list_html if x.strip()]

In [None]:
# Extract the data of every entity chunk. Each chunk has three sections:
# 1. basic info on the sanctioned subject
# 2. names and aliases of the sanctioned subject
# 3. addresses and countries of the sanctioned subject


def _labelled_value(soup, label, required=False):
    """Return the text of the first <span> after the <b> whose string matches `label`.

    `label` is searched as a case-insensitive regular expression.
    Returns None when the label or its <span> is missing, unless `required`
    is true, in which case a ValueError naming the label is raised.
    """
    label_tag = soup.find('b', string=re.compile(label, re.IGNORECASE))
    value_tag = label_tag.find_next('span') if label_tag is not None else None
    if value_tag is None:
        if required:
            raise ValueError(f"'{label}' not found in entity chunk")
        return None
    return value_tag.text.strip()


def _paired_records(soup, sanctioned_id, label, paired_label, value_key, paired_key):
    """Build one dict per <b> whose string is exactly `label`.

    Each dict holds `sanctioned_id`, the text of the <span> following the
    label (under `value_key`) and the text of the <span> following the next
    <b> whose string is exactly `paired_label` (under `paired_key`).
    A value that cannot be found is stored as None.
    """
    records = []
    for label_tag in soup.find_all('b', string=label):
        value_tag = label_tag.find_next('span')
        paired_tag = label_tag.find_next('b', string=paired_label)
        paired_value_tag = paired_tag.find_next('span') if paired_tag is not None else None
        records.append({
            'sanctioned_id': sanctioned_id,
            value_key: value_tag.get_text(strip=True) if value_tag is not None else None,
            paired_key: paired_value_tag.get_text(strip=True) if paired_value_tag is not None else None,
        })
    return records


# The first element of list_html only contains file generation info, so it is skipped.
entity_chunks = list_html[1:]
r = len(entity_chunks)

# Plain lists of dicts; the DataFrames are built once, after the loop.
dfs_basic_info = []
dfs_names = []
dfs_addresses = []
for counter, chunk in enumerate(entity_chunks, start=1):

    soup = BeautifulSoup(chunk, 'lxml')

    # The first <span> of the chunk is used as the entity id. It is computed
    # once so that all three tables share exactly the same merge key.
    unique_id = soup.find('span')
    sanctioned_id = unique_id.text.strip()

    # Progress indicator; the carriage return makes each message overwrite the previous one.
    print(f"{sanctioned_id} - status {counter}/{r}", end='\r')

    # --- 1. basic info ---
    # The <span> right after the id holds the subject type.
    type_tag = unique_id.find_next('span')
    subject_type = type_tag.text.strip() if type_tag is not None else None

    # Regime name and designation date are mandatory (a missing one raises);
    # the other fields are optional and become None when absent.
    # 'Designation Source:' is not part of the output, so it is not read.
    dfs_basic_info.append({
        'sanctioned_id': sanctioned_id,
        'sanctioned_type': subject_type,
        'sanction_listing_date': _labelled_value(soup, 'Date Designated:', required=True),
        'sanction_text': _labelled_value(soup, 'Regime Name:', required=True),
        'sanction_imposed': _labelled_value(soup, 'Sanctions Imposed:'),
        'sanctioned_ofsi_grp': _labelled_value(soup, 'OFSI Group ID:'),
        'sanctioned_un_ref': _labelled_value(soup, 'UN Reference Number:'),
        'sanction_body': 'UK',
    })

    # --- 2. names and aliases ---
    dfs_names.extend(_paired_records(
        soup, sanctioned_id,
        ' Name: ', ' Name Type: ',
        'sanctioned_alias', 'sanctioned_alias_type',
    ))

    # --- 3. addresses and countries ---
    dfs_addresses.extend(_paired_records(
        soup, sanctioned_id,
        ' Address: ', ' Address Country: ',
        'sanctioned_address', 'sanctioned_country',
    ))


df_basic_info = pd.DataFrame(dfs_basic_info)
df_names = pd.DataFrame(dfs_names)
df_addresses = pd.DataFrame(dfs_addresses)

In [None]:
# A little bit of data manipulation.
# regex=False makes every replacement in this cell a literal substring
# replacement regardless of the installed pandas version.
df_basic_info['sanctioned_type'] = df_basic_info['sanctioned_type'].str.replace('- ', '', regex=False)
df_basic_info['sanction_listing_date'] = pd.to_datetime(df_basic_info['sanction_listing_date'], format='%d/%m/%Y')

# Only the addresses DataFrame needs to be merged, because it contains 'sanctioned_country'.
df_final = pd.merge(df_basic_info, df_addresses, on='sanctioned_id', how='outer')

analysis_columns = ['sanctioned_id', 'sanctioned_country', 'sanctioned_type', 'sanction_text', 'sanction_listing_date', 'sanction_body']
df_uk_analysis = df_final[analysis_columns].drop_duplicates()
df_uk_analysis["sanctioned_country_iso3"] = df_uk_analysis["sanctioned_country"].apply(functions.descr_to_iso3)

# Adjustments for country names in the file that the lookup function does not handle.
# Applied in this order as literal substring replacements. In regex mode
# (the default before pandas 2.0) the parentheses in
# 'Congo (Democratic Republic)' would form a group and the literal name
# would never match.
iso3_fixes = {
    'Russia': 'RUS',
    'Congo (Democratic Republic)': 'COD',
    'DPRK': 'PRK',
    'Kosovo': 'XKX',
    'Occupied Palestinian Territories': 'PSE',
    'The Gambia': 'GMB',
    'Turkey': 'TUR',
}
iso3_column = df_uk_analysis['sanctioned_country_iso3']
for country_name, iso3_code in iso3_fixes.items():
    iso3_column = iso3_column.str.replace(country_name, iso3_code, regex=False)
df_uk_analysis['sanctioned_country_iso3'] = iso3_column

In [None]:
# Colab-only export: mount Google Drive and save the DataFrame there as a
# semicolon-separated CSV. The DataFrame can also simply be saved to a local directory.
# The google.colab import is kept inside this cell and guarded, so that
# outside Colab the cell is skipped with a message instead of stopping a full run.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available - skipping the Google Drive export')
else:
    drive.mount('/content/drive')
    df_uk_analysis.to_csv('/content/drive/My Drive/df_uk_analysis.csv', sep=';', index=False)

In [None]:
# Local export: write the same semicolon-separated CSV (no index column)
# to a fixed absolute path on the local disk.
local_csv_path = 'C:/Users/valer/OneDrive/Desktop/python/input/df_uk_analysis.csv'
df_uk_analysis.to_csv(local_csv_path, sep=';', index=False)