### SDN List : Data Processing

In [1]:
import pandas as pd
import json
import re

file_path = '../original/sdn.csv'
df1 = pd.read_csv(file_path)
print(df1.columns)
df1.head()

Index(['ent_num', 'SDN_name', 'SDN_type', 'Program', 'Title', 'Call_Sign',
       'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner', 'Remarks'],
      dtype='object')


Unnamed: 0,ent_num,SDN_name,SDN_type,Program,Title,Call_Sign,Vess_type,Tonnage,GRT,Vess_flag,Vess_owner,Remarks
0,36,AEROCARIBBEAN AIRLINES,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
1,173,"ANGLO-CARIBBEAN CO., LTD.",-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
2,306,BANCO NACIONAL DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,a.k.a. 'BNC'.
3,424,BOUTIQUE LA MAISON,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
4,475,CASA DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-


In [2]:
# COUNTRY

# read add.csv which is the helper file to add the country column to sdn.csv
add_file_path = '../original/add.csv'
df2 = pd.read_csv(add_file_path)
df2 = df2[['ent_num', 'country']]
if 'country' in df1.columns:
    df1.drop(columns=['country'], inplace=True)

# add a new column country in sdn.csv and add the values from add.csv when ent_num matches then concatanate the values
merged_df = pd.merge(df1, df2[['ent_num', 'country']], on='ent_num', how='left')
grouped_df = merged_df.groupby('ent_num')['country'].apply(lambda x: ';'.join(x.dropna())).reset_index()
result_df = pd.merge(df1, grouped_df, on='ent_num', how='left')
result_df.columns


Index(['ent_num', 'SDN_name', 'SDN_type', 'Program', 'Title', 'Call_Sign',
       'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner', 'Remarks',
       'country'],
      dtype='object')

In [3]:
col = ['Program', 'Title', 'Call_Sign',
       'Vess_type', 'Tonnage', 'GRT', 'Vess_flag', 'Vess_owner']
columns_to_drop = [col_name for col_name in col if col_name in result_df.columns]
result_df.drop(columns=columns_to_drop, inplace=True)
result_df['SDN_type'].value_counts()

-0-           7252
individual    6915
vessel         861
aircraft       374
Name: SDN_type, dtype: int64

In [4]:

# ETH ADDRESSES  
# TODO : Get bitcoin and othe addresses as well                    
pattern = r'ETH\s+(0x[0-9a-fA-F]{40})(?=[\s;])'
def extract_eth_addresses(remark):
    if isinstance(remark, str):
        return re.findall(pattern, remark)
    return []

eth_addresses = result_df['Remarks'].apply(extract_eth_addresses).explode().dropna().tolist()
eth_addresses_dict = [{'Eth_address': addr} for addr in eth_addresses]
print(len(eth_addresses))
json_result = json.dumps(eth_addresses_dict, indent=4)
with open('eth_addresses.json', 'w') as f:
    f.write(json_result)


49


In [5]:
result_df = result_df[result_df['SDN_type'] == 'individual']
print("Cleaned",result_df['SDN_type'].value_counts())
result_df.drop(columns="SDN_type", inplace=True)
result_df.head()

Cleaned individual    6915
Name: SDN_type, dtype: int64


Unnamed: 0,ent_num,SDN_name,Remarks,country
53,2674,"ABBAS, Abu",DOB 10 Dec 1948; Director of PALESTINE LIBERAT...,-0-
54,2675,"AL RAHMAN, Shaykh Umar Abd",DOB 03 May 1938; POB Egypt; Chief Ideological ...,-0-
55,2676,"AL ZAWAHIRI, Dr. Ayman","DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...",-0-
56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan","DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...",Egypt
57,2678,"AWDA, Abd Al Aziz",DOB 1946; Chief Ideological Figure of PALESTIN...,-0-


Individual have proper names and some vessels are right name, but most of them are not. Aircrafts seem to be aircraft manufacturers and have codes in names, hence not of any use. -0- seems to be names of company or groups, hence disregarded.

In [6]:
len(result_df) #total individuals

6915

In [7]:
# DOB

# for format dd mmm yyyy
result_df['DOB'] = result_df['Remarks'].str.extract(r'(\d{2} \w{3} \d{4})')
result_df['day'] = result_df['DOB'].str.extract(r'(\d{2})')
result_df['month'] = result_df['DOB'].str.extract(r'(\w{3})')
result_df['year'] = result_df['DOB'].str.extract(r'(\d{4})')
# for yyyy only format
result_df['year'] = result_df['Remarks'].str.extract(r'(\d{4})')
result_df.head()

print(result_df['year'].count()) # total individuals with at least year in dob
print(result_df['DOB'].count()) # total individuals with whole dob


6855
5924


In [8]:
# GENDER
def extract_gender(text):
    pattern = r'gender (male|female)'
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return None
    
# Apply the function to extract the gender
result_df["Gender"] = result_df['Remarks'].apply(extract_gender)
result_df.head()
result_df["Gender"].count() 
# result_df[result_df['ent_num'] == "12610"]

0

In [9]:
def extract_nationality(remark):
    pattern = r'nationality ([A-Za-z]+);'
    match = re.search(pattern, remark)
    if match:
        return match.group(1)
    else:
        return None

def extract_citizen(remark):
    pattern = r'citizen ([A-Za-z]+);'
    match = re.search(pattern, remark)
    if match:
        return match.group(1)
    else:
        return None


# Apply the extract_nationality function to the 'remarks' column
result_df['Citizen'] = result_df['Remarks'].apply(extract_citizen)
result_df['Nationality'] = result_df['Remarks'].apply(extract_nationality)
result_df['Nationality'].count()

filtered_df = result_df.dropna(subset=['Citizen', 'Nationality'])
diff_values_df = filtered_df[filtered_df['Citizen'] != filtered_df['Nationality']]
count_diff_values = diff_values_df.shape[0]
print(count_diff_values) # 20 instances where in remark both citizen <country1> and nationality <country2> are mentioned, hence seperated
result_df.head(5)

20


Unnamed: 0,ent_num,SDN_name,Remarks,country,DOB,day,month,year,Gender,Citizen,Nationality
53,2674,"ABBAS, Abu",DOB 10 Dec 1948; Director of PALESTINE LIBERAT...,-0-,10 Dec 1948,10.0,Dec,1948,,,
54,2675,"AL RAHMAN, Shaykh Umar Abd",DOB 03 May 1938; POB Egypt; Chief Ideological ...,-0-,03 May 1938,3.0,May,1938,,,
55,2676,"AL ZAWAHIRI, Dr. Ayman","DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...",-0-,19 Jun 1951,19.0,Jun,1951,,,
56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan","DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...",Egypt,19 Apr 1947,19.0,Apr,1947,,,Egypt
57,2678,"AWDA, Abd Al Aziz",DOB 1946; Chief Ideological Figure of PALESTIN...,-0-,,,,1946,,,


In [10]:
result_df['Remarks'] = result_df['Remarks'].str.lower()
pattern = r'passport ([^\(]+) \(([^)]+)\)'

def extract_passport_info(remark):
    match = re.search(pattern, remark)
    if match:
        return match.group(1), match.group(2)
    else:
        return None, None


result_df[['Pass_No', 'Pass_Country']] = result_df['Remarks'].apply(lambda x: pd.Series(extract_passport_info(x)))
result_df['Pass_No'].count() # total individuals with passport number

1549

In [11]:
result_df.head(10)

Unnamed: 0,ent_num,SDN_name,Remarks,country,DOB,day,month,year,Gender,Citizen,Nationality,Pass_No,Pass_Country
53,2674,"ABBAS, Abu",dob 10 dec 1948; director of palestine liberat...,-0-,10 Dec 1948,10.0,Dec,1948,,,,,
54,2675,"AL RAHMAN, Shaykh Umar Abd",dob 03 may 1938; pob egypt; chief ideological ...,-0-,03 May 1938,3.0,May,1938,,,,,
55,2676,"AL ZAWAHIRI, Dr. Ayman","dob 19 jun 1951; pob giza, egypt; passport 108...",-0-,19 Jun 1951,19.0,Jun,1951,,,,1084010.0,egypt
56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan","dob 19 apr 1947; pob nahia, giza, egypt; natio...",Egypt,19 Apr 1947,19.0,Apr,1947,,,Egypt,,
57,2678,"AWDA, Abd Al Aziz",dob 1946; chief ideological figure of palestin...,-0-,,,,1946,,,,,
58,2679,"FADLALLAH, Shaykh Muhammad Husayn",dob 1938; alt. dob 1936; pob najf al ashraf (n...,-0-,,,,1938,,,,,
59,2681,"HAWATMA, Nayif",dob 1933; secretary general of democratic fron...,-0-,,,,1933,,,,,
60,2682,"ISLAMBOULI, Mohammad Shawqi",dob 15 jan 1955; pob egypt; passport 304555 (e...,-0-,15 Jan 1955,15.0,Jan,1955,,,,304555.0,egypt
61,2683,"JABRIL, Ahmad","dob 1938; pob ramleh, israel; secretary genera...",-0-,,,,1938,,,,,
62,2685,"NAJI, Talal Muhammad Rashid","dob 1930; pob al nasiria, palestine; principal...",-0-,,,,1930,,,,,


In [12]:
result_df.columns

Index(['ent_num', 'SDN_name', 'Remarks', 'country', 'DOB', 'day', 'month',
       'year', 'Gender', 'Citizen', 'Nationality', 'Pass_No', 'Pass_Country'],
      dtype='object')

In [13]:
# the sdn_name column has in the format "LAST NAME, first name", so split it up into 2 columns if you find the comma in the string
result_df[['Last_Name', 'First_Name']] = result_df['SDN_name'].str.split(', ', expand=True, n=1)
result_df['Last_Name'] = result_df['Last_Name'].str.upper()
result_df['First_Name'] = result_df['First_Name'].str.upper()
result_df.head()

Unnamed: 0,ent_num,SDN_name,Remarks,country,DOB,day,month,year,Gender,Citizen,Nationality,Pass_No,Pass_Country,Last_Name,First_Name
53,2674,"ABBAS, Abu",dob 10 dec 1948; director of palestine liberat...,-0-,10 Dec 1948,10.0,Dec,1948,,,,,,ABBAS,ABU
54,2675,"AL RAHMAN, Shaykh Umar Abd",dob 03 may 1938; pob egypt; chief ideological ...,-0-,03 May 1938,3.0,May,1938,,,,,,AL RAHMAN,SHAYKH UMAR ABD
55,2676,"AL ZAWAHIRI, Dr. Ayman","dob 19 jun 1951; pob giza, egypt; passport 108...",-0-,19 Jun 1951,19.0,Jun,1951,,,,1084010.0,egypt,AL ZAWAHIRI,DR. AYMAN
56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan","dob 19 apr 1947; pob nahia, giza, egypt; natio...",Egypt,19 Apr 1947,19.0,Apr,1947,,,Egypt,,,AL-ZOMOR,ABBOUD ABDUL LATIF HASSAN
57,2678,"AWDA, Abd Al Aziz",dob 1946; chief ideological figure of palestin...,-0-,,,,1946,,,,,,AWDA,ABD AL AZIZ


In [14]:
result_df.to_csv('cleaned_sdn.csv', index=False)
result_df = result_df.where(pd.notnull(result_df), None)

filtered_df = result_df.dropna(subset=['Pass_No', 'Pass_Country'], how='all')
passport_df = filtered_df[['Pass_No', 'Pass_Country']]
name_dob = result_df[['First_Name', 'Last_Name', 'day', 'month', 'year']]

passport_list = passport_df.to_dict(orient='records')
with open('passports.json', 'w') as json_file:
    json.dump(passport_list, json_file, indent=4)
    
name_list = name_dob.to_dict(orient='records')
with open('names.json', 'w') as json_file:
    json.dump(name_list, json_file, indent=4)