In [1]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Add the project
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
raw_dir =Path(r"C:\Users\n740789\Documents\Projects_local\datasets\datafeeds\raw_dataset\2025")

In [30]:
govs_04_path = raw_dir / "20250401_Production" / "20250401_Governments_feed_permId.csv"
govs_05_path = raw_dir / "20250501_Production" / "20250501_Governments_feed_permId.csv"
govs_06_path = raw_dir / "20250601_Production" / "20250601_Governments_feed_permId.csv"

In [31]:
gov_04 = pd.read_csv(govs_04_path, dtype=str, low_memory=False)
gov_05 = pd.read_csv(govs_05_path, dtype=str, low_memory=False)
gov_06 = pd.read_csv(govs_06_path, dtype=str, low_memory=False)

In [32]:
df_list = [
    gov_04,
    gov_05,
    gov_06,
]

In [33]:
for df_name, df in zip(["gov_feed_04","gov_feed_05", "gov_feed_06"],df_list):
    # normalize column names
    df.columns = [col.strip().lower().replace(" ","_") for col in df.columns]
    # replace "-" with NaN
    df['isin'].replace("-", np.nan, inplace=True)
    # print number of empyt ISINs
    empty_isin_count = df['isin'].isna().sum()
    print(f"\nNumber of empty ISINs in {df_name}: {empty_isin_count}\n")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['isin'].replace("-", np.nan, inplace=True)



Number of empty ISINs in gov_feed_04: 60


Number of empty ISINs in gov_feed_05: 60


Number of empty ISINs in gov_feed_06: 59



In [40]:
for df in df_list:
    print(df.shape[0])
    #print("\nempty isins\n")
    ## head df
    #print(df[df['isin'].isna()].head(10))

222880
223126
224148


In [43]:
# empty isins gov_04
gov_04[gov_04["isin"].notna()][["isin","permid","government"]].sort_values(by="government").head(60)

Unnamed: 0,isin,permid,government
220548,CH0353428052,4296321571,Aargau
220542,CH0305954528,4296321571,Aargau
220553,CH0017340024,4296321571,Aargau
220552,CH0011566004,4296321571,Aargau
220551,CH0008283787,4296321571,Aargau
220550,CH0373476396,4296321571,Aargau
220549,CH0119858659,4296321571,Aargau
220543,CH0353428045,4296321571,Aargau
220547,CH0305954569,4296321571,Aargau
220546,CH0116453322,4296321571,Aargau


In [47]:
countries_empty_isin_04 = gov_04[gov_04["isin"].notna()]["government"].unique().tolist()
countries_empty_isin_05 = gov_05[gov_05["isin"].notna()]["government"].unique().tolist()
countries_empty_isin_06 = gov_06[gov_06["isin"].notna()]["government"].unique().tolist()


In [46]:
for country in countries_empty_isin_04:
    country_rows = gov_04[gov_04['government'] == country].shape[0]
    if country_rows < 2:
        print(f"{country}: {country_rows}")

Schwyz: 1
Maldives: 1
Saint Vincent and the Grenadines: 1
Chad: 1


In [48]:
empty_country_isin_list = [
    countries_empty_isin_04,
    countries_empty_isin_05,
    countries_empty_isin_06,
]

In [49]:
for cl, df in zip(empty_country_isin_list, df_list):
    for country in cl:
        country_rows = df[df['government'] == country].shape[0]
        if country_rows < 2:
            print(f"{country}: {country_rows}")

Schwyz: 1
Maldives: 1
Saint Vincent and the Grenadines: 1
Chad: 1
Chad: 1
Maldives: 1
Schwyz: 1
Saint Vincent and the Grenadines: 1
Chad: 1
Schwyz: 1
Valais: 1
Maldives: 1
Saint Vincent and the Grenadines: 1


In [11]:
gov_04_missing_isin = gov_04[gov_04['isin'] =="-"].copy()

In [12]:
gov_04_missing_isin

Unnamed: 0,isin,government,country_of_subnational,economic_peer_group\n(market_type),geographical_peer_group\n(granular),geographical_peer_group\n(high_level),economic_peer_group,type_of_government,e_score,s_score,g_score,esg_score,relevance,esg_rating,sustainability_rating:_within_government_type,sustainability_rating:_within_government_type_+_economic_peer_group,sustainability_rating:_within_government_type_+_geography,security_name,clarityid,permid
222820,-,Afghanistan,-,-,South Asia,Southeast Asia & Pacific,Low-income,National,51,15,22,28,93,C,Poor,Limited,Poor,-,159005547,5000891033
222821,-,Bhutan,-,-,South Asia,Southeast Asia & Pacific,Lower-middle-income,National,49,41,51,48,87,B,Average,Outstanding,Average,-,154650409,5001078058
222822,-,Brunei,-,-,East Asia & Pacific,Southeast Asia & Pacific,High-income,National,50,57,61,57,77,A-,Good,Sufficient,Good,-,163372987,5001426944
222823,-,Burundi,-,-,Sub-Saharan Africa,Middle East & Africa,Low-income,National,45,29,24,31,92,C,Poor,Sufficient,Limited,-,162725989,5000697990
222824,-,Cambodia,-,-,East Asia & Pacific,Southeast Asia & Pacific,Lower-middle-income,National,43,43,35,39,97,C+,Sufficient,Sufficient,Limited,-,161831120,4296404976
222825,-,Central African Republic,-,-,Sub-Saharan Africa,Middle East & Africa,Low-income,National,53,20,24,30,90,C,Poor,Sufficient,Limited,-,166161056,4296996354
222826,-,Comoros,-,-,Sub-Saharan Africa,Middle East & Africa,Low-income,National,50,26,33,36,79,C+,Limited,Average,Sufficient,-,169202356,5039193927
222827,-,Cuba,-,-,Latin America & Caribbean,Latin America & Caribbean,Upper-middle-income,National,52,53,27,40,83,C+,Sufficient,Limited,Limited,-,143407,4296846057
222828,-,Democratic Republic of the Congo,-,-,Sub-Saharan Africa,Middle East & Africa,Low-income,National,49,28,26,32,97,C,Poor,Sufficient,Sufficient,-,143513,4297238542
222829,-,Djibouti,-,-,Middle East & North Africa,Middle East & Africa,Lower-middle-income,National,43,25,32,33,86,C,Limited,Poor,Sufficient,-,159005551,5000891277


In [18]:
# replace "-" with nan in column isin
gov_04['isin'].replace("-", np.nan, inplace=True)

In [19]:
gov_04.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222880 entries, 0 to 222879
Data columns (total 20 columns):
 #   Column                                                               Non-Null Count   Dtype 
---  ------                                                               --------------   ----- 
 0   isin                                                                 222820 non-null  object
 1   government                                                           222880 non-null  object
 2   country_of_subnational                                               222880 non-null  object
 3   economic_peer_group
(market_type)                                    222880 non-null  object
 4   geographical_peer_group
(granular)                                   222880 non-null  object
 5   geographical_peer_group
(high_level)                                 222880 non-null  object
 6   economic_peer_group                                                  222880 non-null  object
 7   ty

In [20]:
print(gov_04['isin'].value_counts(dropna=False).head(10))

isin
NaN             60
KRC0350C4096     1
KRC035FP2732     1
KRC035AP1895     1
KRC0350C2769     1
USP68788AD37     1
SM000A3K4YC0     1
SM000A3KUWQ0     1
SM000A3LP8W0     1
XS2239061927     1
Name: count, dtype: int64


In [None]:
gov_04[gov_04['isin'].isna()][]