In [2]:
import re
import pandas as pd

from fuzzywuzzy import fuzz


from collections import defaultdict 

In [3]:
# Excel workbooks of administrative-boundary tabular data, one per country
# (file name prefixes: afg, ukr, tur, syr). `files` is the list the loader
# iterates over; f1..f4 remain available as individual names.
files = [
    'D:\\projects\\_external_files\\cod_files\\afg_adminboundaries_tabulardata.xlsx',
    'D:\\projects\\_external_files\\cod_files\\ukr_adminboundaries_tabulardata.xlsx',
    'D:\\projects\\_external_files\\cod_files\\tur_adminboundaries_tabulardata.xlsx',
    'D:\\projects\\_external_files\\cod_files\\syr_adminboundaries_tabulardata.xlsx',
]
f1, f2, f3, f4 = files

In [4]:
def get_all_cols_for_level(df, lvl=0):
    """Flatten the admin-level columns of one boundary table into a long table.

    Walks the admin levels from `lvl` down to 0. For each level it looks for
    one ``adm<N>_pcode`` column and one or more ``adm<N>_<suffix>`` name
    columns (suffix = 2-3 letters, normally a language code; note that a
    column such as ``adm2_ref`` also fits this pattern and is treated as a
    name column). ``adm<N>alt<M>_<suffix>`` columns are ignored for now.
    The name columns are melted so that each (pcode, language) pair becomes
    one row.

    Parameters
    ----------
    df : pandas.DataFrame
        One sheet of a boundary workbook. Column names are matched
        case-insensitively.
    lvl : int, default 0
        Highest admin level to look for; lower levels down to 0 are included.

    Returns
    -------
    pandas.DataFrame
        Columns: country, pcode_prefix, location_name, location_normalized,
        pcode, adm_lvl, lang_code. Rows without a location name are dropped
        and exact duplicates are removed. An empty frame with these columns
        is returned when no usable level is found.

    Notes
    -----
    A level that lacks a pcode column or has no name column is reported via
    ``print`` and skipped instead of raising.
    """
    output_columns = ['country', 'pcode_prefix', 'location_name',
                      'location_normalized', 'pcode', 'adm_lvl', 'lang_code']

    # fullmatch is used for pcode/name columns so that the matched text is
    # always the real column name (e.g. 'adm2_en.1' is not mistaken for 'adm2_en').
    pcode_pattern = re.compile(r'adm(\d+)_pcode')
    name_pattern = re.compile(r'adm(\d+)_([a-z]{2,3})')
    alt_pattern = re.compile(r'adm(\d+)alt(\d+)_([a-z]{2,3})')

    level_frames = []

    for current_lvl in range(lvl, -1, -1):
        # cheap pre-filter: only columns mentioning this level's digit(s)
        column_levels = [c for c in df.columns if str(current_lvl) in str(c)]
        if len(column_levels) == 0:
            continue

        pcodes = []
        names = []
        others = []

        for col in column_levels:
            print(col)
            col_lower = str(col).lower()
            pcode_match = pcode_pattern.fullmatch(col_lower)
            name_match = name_pattern.fullmatch(col_lower)
            alt_match = alt_pattern.match(col_lower)
            if pcode_match:
                # the digit pre-filter also lets e.g. 'adm12_pcode' through for
                # level 1 or 2; only keep columns of exactly this level
                if int(pcode_match.group(1)) == current_lvl:
                    pcodes.append(col)
            elif name_match:
                if int(name_match.group(1)) == current_lvl:
                    names.append(col)
            elif alt_match:
                #throw away alts for now
                pass
            else:
                others.append(col)

        #sanity checks
        if len(pcodes) != 1:
            print(f"unexpected condition - pcodes len should be 1. {pcodes}")
        if len(names) < 1:
            print(f"unexpected condition - names len should be at least 1. {names}")
        if len(others) > 0:
            print(f"unexpected condition, but not fatal - others len should be empty. {others}")

        # nothing to melt without a pcode column and at least one name column
        if len(pcodes) == 0 or len(names) == 0:
            continue

        pcode_col = pcodes[0]

        # Melting the DataFrame to combine columns into rows
        melted_df = df[names + [pcode_col]].melt(
            id_vars=pcode_col, var_name='adm', value_name='location_name')
        melted_df['adm'] = melted_df['adm'].astype(str).str.lower()
        melted_df['adm_lvl'] = current_lvl
        melted_df = melted_df.rename(columns={pcode_col: 'pcode'})

        level_frames.append(melted_df)

    if not level_frames:
        return pd.DataFrame(columns=output_columns)

    # one concat for all levels (no empty seed frame, so dtypes are preserved)
    return_df = pd.concat(level_frames)

    # first two characters of the pcode; lang code is the part after 'adm<N>_'
    return_df['pcode_prefix'] = return_df['pcode'].str[:2]
    return_df['lang_code'] = return_df['adm'].str.split('_').str[1]

    # country = English level-0 name, falling back to any level-0 name
    level0 = return_df[return_df['adm_lvl'] == 0]
    english_names = level0.loc[level0['lang_code'] == 'en', 'location_name'].dropna()
    any_names = level0['location_name'].dropna()
    if len(english_names) > 0:
        country = english_names.iloc[0]
    elif len(any_names) > 0:
        country = any_names.iloc[0]
    else:
        country = ''
    return_df['country'] = country

    return_df = return_df.drop_duplicates().reset_index(drop=True)
    # .copy() so the column added below is not written to a view
    return_df = return_df[return_df['location_name'].notna()].copy()

    #remove common variations in names that can cause misses
    return_df['location_normalized'] = (
        return_df['location_name']
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-zA-Z]', '', regex=True)
    )

    return return_df[output_columns]



In [5]:
def standardize_column_names(df, case='lower'):
    """Return a copy of `df` whose column names are all lower- or upper-cased.

    `case` selects the conversion: 'lower' (default) or 'upper'. Any other
    value leaves the names unchanged. The input frame is not modified.
    """
    if case == 'lower':
        renamed = {name: name.lower() for name in df.columns}
    elif case == 'upper':
        renamed = {name: name.upper() for name in df.columns}
    else:
        renamed = {}

    return df.rename(columns=renamed)
    


    

In [6]:
def process_cods(f, preferred_level='ADM3', backup_level='ADM2'):
    """Load the most detailed admin-level sheet from a boundary workbook.

    Parameters
    ----------
    f : str or path-like
        Path of the Excel workbook.
    preferred_level : str, default 'ADM3'
        Sheet to read when present.
    backup_level : str, default 'ADM2'
        Sheet to read when `preferred_level` is absent.

    Returns
    -------
    pandas.DataFrame
        Contents of the selected sheet.

    Raises
    ------
    ValueError
        If the workbook contains neither sheet.
    """
    # the context manager closes the workbook handle even if reading fails
    with pd.ExcelFile(f) as xls:
        sheet_names = xls.sheet_names

        if preferred_level in sheet_names:
            sheet = preferred_level
        elif backup_level in sheet_names:
            sheet = backup_level
        else:
            raise ValueError(
                f"{f}: neither '{preferred_level}' nor '{backup_level}' sheet found "
                f"(available sheets: {sheet_names})"
            )

        return pd.read_excel(xls, sheet_name=sheet)


        

In [7]:
# Build the combined location table from every workbook in `files`.
# Frames are collected in a list and concatenated once: concatenating onto an
# empty seed frame inside the loop is quadratic, upcasts every column to
# object dtype and triggers a pandas FutureWarning about empty entries.
location_columns = ['country','pcode_prefix','location_name','pcode','adm_lvl','lang_code','location_normalized']

location_frames = []
loaded_rows = 0
for f in files:
    print(f)

    #load the most detailed admin sheet (ADM3, else ADM2) to a df
    # NOTE: `df` keeps the last workbook's sheet after the loop; later scratch
    # cells in this notebook read it.
    df = process_cods(f)
    df = standardize_column_names(df)
    df_new_loc = get_all_cols_for_level(df, lvl=4)

    location_frames.append(df_new_loc)

    # running shape of the combined table so far
    loaded_rows += len(df_new_loc)
    print((loaded_rows, len(location_columns)))

if location_frames:
    # per-file row labels are kept (no ignore_index), as before
    df_location = pd.concat(location_frames)[location_columns]
else:
    df_location = pd.DataFrame(columns=location_columns)

df_location


D:\projects\_external_files\cod_files\afg_adminboundaries_tabulardata.xlsx
adm2_en
adm2_da
adm2_pcode
adm2_ref
adm2alt1_en
adm2alt2_en
adm2alt1_da
adm2alt2_da
adm2alt1_en
adm2alt1_da
adm1_en
adm1_da
adm1_pcode
adm0_en
adm0_da
adm0_pcode
(872, 7)
D:\projects\_external_files\cod_files\ukr_adminboundaries_tabulardata.xlsx
adm3_en
adm3_ua
adm3_ru
adm3_pcode
adm3_ref
adm3alt1_en
adm3alt2_en
adm3alt1_ua
adm3alt2_ua
adm3alt1_ru
adm3alt2_ru
adm3alt2_en
adm3alt2_ua
adm3alt2_ru
adm2_en
adm2_ua
adm2_ru
adm2_pcode
adm3alt1_en
adm3alt1_ua
adm3alt1_ru
adm1_en
adm1_ua
adm1_ru
adm1_pcode
adm0_en
adm0_ua
adm0_ru
adm0_pcode
(6680, 7)
D:\projects\_external_files\cod_files\tur_adminboundaries_tabulardata.xlsx
adm2_tr
adm2_en
adm2_pcode
adm1_tr
adm1_en
adm1_pcode
adm0_tr
adm0_en
adm0_pcode
(8790, 7)
D:\projects\_external_files\cod_files\syr_adminboundaries_tabulardata.xlsx
adm3_en
adm3_ar
adm3_pcode
admin3refname_en
unexpected condition, but not fatal - others len should be empty. ['admin3refname_en']
adm2

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized
0,Afghanistan,AF,Khash,AF1705,2,en,khash
1,Afghanistan,AF,Hazrat-e-Sultan,AF2002,2,en,hazratesultan
2,Afghanistan,AF,Pul-e-Alam,AF0501,2,en,pulealam
3,Afghanistan,AF,Mata Khan,AF1202,2,en,matakhan
4,Afghanistan,AF,Gosfandi,AF2206,2,en,gosfandi
...,...,...,...,...,...,...,...
693,Syrian Arab Republic,SY,ريف دمشق,SY03,1,ar,
694,Syrian Arab Republic,SY,طرطوس,SY10,1,ar,
695,Syrian Arab Republic,SY,دمشق,SY01,1,ar,
696,Syrian Arab Republic,SY,Syrian Arab Republic,SY,0,en,syrianarabrepublic


In [8]:
# Record the length of every pcode, then tabulate it per country prefix (rows)
# and admin level (columns). The lengths are cumulative: each level's pcode
# extends its parent's.
df_location['lvl_pcode_len'] = df_location['pcode'].map(len)
df_pcode_struct = (
    df_location[['pcode_prefix','adm_lvl','lvl_pcode_len']]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)
pivot_df = df_pcode_struct.pivot(index='pcode_prefix', columns='adm_lvl', values='lvl_pcode_len')

# Spot check: level boundaries for the 'AF' prefix (levels without data are NaN and dropped).
segment_total_len = [int(length) for length in pivot_df.loc['AF'].tolist() if length>0]
segment_total_len

def split_pcode_struct(row, pcode_struct=None):
    """Render a pcode as dot-separated per-level segments (AF1705 -> AF.17.05).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'pcode' and 'pcode_prefix'.
    pcode_struct : pandas.DataFrame, optional
        Table indexed by pcode prefix whose row values are the cumulative
        pcode length at each admin level (NaN where a level is absent).
        Defaults to the notebook-level `pivot_df`.

    Returns
    -------
    str
        The pcode cut at every level boundary that fits inside it, joined
        with '.'. Characters after the last fitting boundary are kept as a
        final segment, so removing the dots always gives back the pcode.
    """
    if pcode_struct is None:
        pcode_struct = pivot_df

    pcode = row['pcode']
    # NaN > 0 is False, so levels missing for this prefix are skipped
    boundaries = [int(i) for i in pcode_struct.loc[row['pcode_prefix']].tolist() if i > 0]

    segments = []
    start = 0
    for end in boundaries:
        if end > len(pcode):
            # this and deeper levels lie beyond the end of the pcode
            break
        if end > start:
            segments.append(pcode[start:end])
            start = end

    # keep any tail that does not end on a known boundary instead of dropping it
    if start < len(pcode):
        segments.append(pcode[start:])

    return '.'.join(segments)

# Add the dotted, level-by-level form of every pcode as a new column
# (row-wise apply: split_pcode_struct receives one row at a time).
df_location['split_pcode'] = df_location.apply(split_pcode_struct, axis=1)
df_location

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode
0,Afghanistan,AF,Khash,AF1705,2,en,khash,6,AF.17.05
1,Afghanistan,AF,Hazrat-e-Sultan,AF2002,2,en,hazratesultan,6,AF.20.02
2,Afghanistan,AF,Pul-e-Alam,AF0501,2,en,pulealam,6,AF.05.01
3,Afghanistan,AF,Mata Khan,AF1202,2,en,matakhan,6,AF.12.02
4,Afghanistan,AF,Gosfandi,AF2206,2,en,gosfandi,6,AF.22.06
...,...,...,...,...,...,...,...,...,...
693,Syrian Arab Republic,SY,ريف دمشق,SY03,1,ar,,4,SY.03
694,Syrian Arab Republic,SY,طرطوس,SY10,1,ar,,4,SY.10
695,Syrian Arab Republic,SY,دمشق,SY01,1,ar,,4,SY.01
696,Syrian Arab Republic,SY,Syrian Arab Republic,SY,0,en,syrianarabrepublic,2,SY


In [9]:
# Write the combined table to both export locations. 'utf-8-sig' adds a BOM
# to the file; the row index is not written.
for csv_target in (
    "c://temp//locations.csv",
    "D://projects//_external_files//cod_files//combined_locations//locations.csv",
):
    df_location.to_csv(csv_target, encoding='utf-8-sig', index=False)






# Using the combined locations DataFrame (`df_location`)

In [10]:
def get_pcode_from_location(loc, country_prefix='XX', locations=None):
    """Look up the pcode(s) of a place name.

    The search runs in three stages and stops at the first that yields a result:
    1. case-insensitive match on 'location_name';
    2. match on 'location_normalized' after stripping everything except
       ASCII letters from `loc` (skipped when nothing is left, e.g. for
       names written in a non-Latin script);
    3. fuzzy comparison (fuzz.ratio > 70) against all known names.

    Parameters
    ----------
    loc : str
        Place name to search for.
    country_prefix : str, default 'XX'
        Value of 'pcode_prefix' to restrict the search to; 'XX' means no
        restriction.
    locations : pandas.DataFrame, optional
        Table to search; needs the columns 'pcode_prefix', 'location_name',
        'location_normalized' and 'pcode'. Defaults to the notebook-level
        `df_location`.

    Returns
    -------
    list
        The distinct matching pcodes in table order (stages 1-2); or a
        one-element list holding a "No exact match ..." message naming all
        fuzzy candidates (stage 3); or an empty list when nothing is found.
    """
    df_loc = df_location if locations is None else locations

    if country_prefix != 'XX': #if the country prefix is set, limit search to that
        df_loc = df_loc[df_loc['pcode_prefix'] == country_prefix]

    matches = df_loc['pcode'][df_loc['location_name'].str.lower() == loc.lower()].tolist()

    #if the match fails, try again on the normalized name
    if len(matches) == 0:
        #remove common variations in names that can cause misses
        n_loc = re.sub(r'[^a-zA-Z]', '', loc)

        #this will cause problems for non-English.. so if then len is 0, exit
        if len(n_loc) == 0:
            return []

        matches = df_loc['pcode'][df_loc['location_normalized'].str.lower() == n_loc.lower()].tolist()

    # de-duplicate (a pcode appears once per language) while keeping table order
    matches = list(dict.fromkeys(matches))

    #now check results
    if len(matches) > 1:
        print(f"more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) {matches}")
        return matches
    if len(matches) == 1:
        return matches

    #couldn't find a match, do a fuzzy search over every distinct name
    possible_matches = [
        name
        for name in dict.fromkeys(df_loc['location_name'].tolist())
        if isinstance(name, str) and fuzz.ratio(loc, name) > 70
    ]
    if possible_matches:
        return [f"No exact match to '{loc}'. see if these alternative spellings are correct: {possible_matches}"]

    return []
                
        

# Smoke test 1: names written in different scripts and casings
# (Latin, Arabic, Cyrillic), searched across all loaded countries.
script_check = ['Sharak-e-Hayratan','حضرت سلطان','Кальчикская','Tekirdagi','homs']
for r in script_check:
    res = get_pcode_from_location(r)
    print(f"searched for {r:<20} ->   {res}")

print()

# Smoke test 2: Afghan place names, including spellings that are not expected
# to match exactly and therefore exercise the fuzzy fallback.
afg_refs = ['Aliabad','Injil','Gulran','zindajan','kohsan'
            ,'Shakiban','Nazir Abad','Sanjab'
            ,'nazir','Ghar Moshak','Botan'
           ,'Hirat']
for r in afg_refs:
    res = get_pcode_from_location(r)
    print(f"searched for {r:20} : -> {res}")

searched for Sharak-e-Hayratan    ->   ['AF2116']
searched for حضرت سلطان           ->   ['AF2002']
searched for Кальчикская          ->   ['UA1414001']
searched for Tekirdagi            ->   []
more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['SY04', 'SY040100', 'SY0401']
searched for homs                 ->   ['SY04', 'SY040100', 'SY0401']

searched for Aliabad              : -> ['AF1903']
searched for Injil                : -> ['AF3202']
searched for Gulran               : -> ['AF3208']
searched for zindajan             : -> ['AF3205']
searched for kohsan               : -> ['AF3213']
searched for Shakiban             : -> ["No exact match to 'Shakiban'. see if these alternative spellings are correct: ['Thiban']"]
searched for Nazir Abad           : -> ["No exact match to 'Nazir Abad'. see if these alternative spellings are correct: ['Ghazi Abad']"]
searched for Sanjab               : -> ["No exact match to

In [11]:
# Re-display the combined table, now including lvl_pcode_len and split_pcode.
df_location

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode
0,Afghanistan,AF,Khash,AF1705,2,en,khash,6,AF.17.05
1,Afghanistan,AF,Hazrat-e-Sultan,AF2002,2,en,hazratesultan,6,AF.20.02
2,Afghanistan,AF,Pul-e-Alam,AF0501,2,en,pulealam,6,AF.05.01
3,Afghanistan,AF,Mata Khan,AF1202,2,en,matakhan,6,AF.12.02
4,Afghanistan,AF,Gosfandi,AF2206,2,en,gosfandi,6,AF.22.06
...,...,...,...,...,...,...,...,...,...
693,Syrian Arab Republic,SY,ريف دمشق,SY03,1,ar,,4,SY.03
694,Syrian Arab Republic,SY,طرطوس,SY10,1,ar,,4,SY.10
695,Syrian Arab Republic,SY,دمشق,SY01,1,ar,,4,SY.01
696,Syrian Arab Republic,SY,Syrian Arab Republic,SY,0,en,syrianarabrepublic,2,SY


In [12]:
def get_adm_lvl_from_pcode(pcode):
    """Return the distinct admin levels recorded in df_location for `pcode`.

    The result is a list (empty when the pcode is unknown); its order is not
    defined because duplicates are removed through a set.
    """
    has_pcode = df_location['pcode'] == pcode
    levels = df_location.loc[has_pcode, 'adm_lvl']
    return list(set(levels.tolist()))

print(get_adm_lvl_from_pcode('TUR033'))

[1]


In [13]:
def get_name_in_lang(pcode, lang='en'):
    """Return the distinct names df_location holds for `pcode` in language `lang`.

    `lang` is compared with the 'lang_code' column. The result is a list
    (empty when there is no such pcode/language pair) in no defined order.
    """
    wanted = (df_location['pcode'] == pcode) & (df_location['lang_code'] == lang)
    names = df_location.loc[wanted, 'location_name']
    return list(set(names.tolist()))

print(get_name_in_lang('UA1414001', 'en'))
print(get_name_in_lang('UA1414001', 'ru'))
print(get_name_in_lang('UA1414001', 'ua'))
print(get_name_in_lang('UA85', 'en'))

['Kalchytska']
['Кальчикская']
['Кальчицька']
['Sevastopol']


In [14]:
# Look up a list of Turkish place names; each name is printed with the
# pcode list returned for it, followed by a blank line.
locations = ['Adana', 'Osmaniye', 'Hatay', 'Kilis', 'Gaziantep', 'Sanliurfa', 'Adiyaman', 'Kahramanmaras', 'Malatya', 'Elazig', 'Diyarbakir']
for l in locations:
    print(f"{l} -- {get_pcode_from_location(l)}")
    print()


Adana -- ['TUR001']

more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['TUR080', 'TUR080005']
Osmaniye -- ['TUR080', 'TUR080005']

Hatay -- ['TUR031']

more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['TUR079', 'TUR079002']
Kilis -- ['TUR079', 'TUR079002']

Gaziantep -- ['TUR027']

Sanliurfa -- ['TUR063']

more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['TUR002001', 'TUR002']
Adiyaman -- ['TUR002001', 'TUR002']

Kahramanmaras -- ['TUR046']

Malatya -- ['TUR044']

more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['TUR023005', 'TUR023']
Elazig -- ['TUR023005', 'TUR023']

Diyarbakir -- ['TUR021']



In [16]:
#get descendents of a pcode
def get_descendents_of(pcode, lang='en', include_self=True, locations=None):
    """Return the rows whose pcode lies under `pcode` in the admin hierarchy.

    A row is a descendant when its pcode *starts with* `pcode`; the text is
    compared literally, not as a regular expression, so codes that merely
    contain `pcode` somewhere in the middle are not returned.

    Parameters
    ----------
    pcode : str
        Ancestor pcode.
    lang : str, default 'en'
        Only rows with this 'lang_code' are returned.
    include_self : bool, default True
        Whether the row(s) for `pcode` itself are included.
    locations : pandas.DataFrame, optional
        Table to search (columns 'pcode' and 'lang_code' are used).
        Defaults to the notebook-level `df_location`.

    Returns
    -------
    pandas.DataFrame
        The matching rows of the table, all columns.
    """
    df_loc = df_location if locations is None else locations

    under_pcode = df_loc['pcode'].astype(str).str.startswith(pcode)
    mask = under_pcode & (df_loc['lang_code'] == lang)
    if not include_self:
        mask = mask & (df_loc['pcode'] != pcode)

    return df_loc[mask]

        
# Example: English-language rows below TUR001, excluding TUR001 itself.
get_descendents_of('TUR001', include_self=False)


Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode
973,TURKEY,TU,ALADAG,TUR001001,2,en,aladag,9,TUR.001.001
974,TURKEY,TU,CEYHAN,TUR001002,2,en,ceyhan,9,TUR.001.002
975,TURKEY,TU,CUKUROVA,TUR001003,2,en,cukurova,9,TUR.001.003
976,TURKEY,TU,FEKE,TUR001004,2,en,feke,9,TUR.001.004
977,TURKEY,TU,IMAMOGLU,TUR001005,2,en,imamoglu,9,TUR.001.005
978,TURKEY,TU,KARAISALI,TUR001006,2,en,karaisali,9,TUR.001.006
979,TURKEY,TU,KARATAS,TUR001007,2,en,karatas,9,TUR.001.007
980,TURKEY,TU,KOZAN,TUR001008,2,en,kozan,9,TUR.001.008
981,TURKEY,TU,POZANTI,TUR001009,2,en,pozanti,9,TUR.001.009
982,TURKEY,TU,SAIMBEYLI,TUR001010,2,en,saimbeyli,9,TUR.001.010


In [16]:
def get_admin_chain(pcode, lang='en', locations=None):
    """Return the names of `pcode` and all its ancestors, top level first.

    The pcode's 'split_pcode' value (e.g. 'UA.01.02.013') is used to rebuild
    the pcode of every ancestor by joining the segments one level at a time.

    Parameters
    ----------
    pcode : str
        Pcode whose chain is wanted.
    lang : str, default 'en'
        'lang_code' of the names to return.
    locations : pandas.DataFrame, optional
        Table to search; needs 'pcode', 'split_pcode', 'lang_code' and
        'location_name'. Defaults to the notebook-level `df_location`.

    Returns
    -------
    list
        One entry per level, from the top level down to `pcode`. An entry is
        None when that level has no name in `lang`.

    Raises
    ------
    IndexError
        If `pcode` is not present in the table.
    """
    df_loc = df_location if locations is None else locations

    split_values = df_loc.loc[df_loc['pcode'] == pcode, 'split_pcode'].tolist()
    if not split_values:
        raise IndexError(f"pcode {pcode!r} not found in the locations table")

    # filter by language once instead of on every level
    in_lang = df_loc[df_loc['lang_code'] == lang]

    admin_chain = []
    pc = ''
    #rebuild the pcode one level at a time
    for segment in split_values[0].split("."):
        pc = pc + segment
        names = in_lang.loc[in_lang['pcode'] == pc, 'location_name'].tolist()
        # keep the position even when this level has no name in `lang`
        admin_chain.append(names[0] if names else None)

    return admin_chain
# Example call (another pcode to try: UA1414001)
get_admin_chain('UA0102013', 'en')

['Ukraine', 'Autonomous Republic of Crimea', 'Bakhchysaraiskyi', 'Holubynska']

In [17]:
# NOTE(review): search_loc is assigned but not read by this cell or any later
# visible cell; only the hard-coded pcode below is looked up.
search_loc = 'Kalchytska'
get_admin_chain('UA0102011')


['Ukraine', 'Autonomous Republic of Crimea', 'Bakhchysaraiskyi', 'Vilinska']

In [18]:
# Example: English-language rows below UA0116, excluding UA0116 itself.
get_descendents_of('UA0116', include_self=False)

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode
217,Ukraine,UA,Hvardiiska,UA0116001,3,en,hvardiiska,9,UA.01.16.001
218,Ukraine,UA,Hresivska,UA0116003,3,en,hresivska,9,UA.01.16.003
219,Ukraine,UA,Dobrivska,UA0116005,3,en,dobrivska,9,UA.01.16.005
220,Ukraine,UA,Donska,UA0116007,3,en,donska,9,UA.01.16.007
221,Ukraine,UA,Zhuravlivska,UA0116009,3,en,zhuravlivska,9,UA.01.16.009
222,Ukraine,UA,Kolchuhynska,UA0116011,3,en,kolchuhynska,9,UA.01.16.011
223,Ukraine,UA,Mazanska,UA0116013,3,en,mazanska,9,UA.01.16.013
224,Ukraine,UA,Mykolaivska,UA0116015,3,en,mykolaivska,9,UA.01.16.015
225,Ukraine,UA,Myrnivska,UA0116017,3,en,myrnivska,9,UA.01.16.017
226,Ukraine,UA,Molodizhnenska,UA0116019,3,en,molodizhnenska,9,UA.01.16.019


In [19]:
# Test loading the assertions workbook: per the displayed output, one row per
# observation, keyed by pcode.

df_assertions = pd.read_excel("D:\\projects\\_external_files\\surveyor\\assertions.xlsx")
df_assertions


Unnamed: 0,pcode,attribute,numeric_value,date_of_observation,date_of_event,source_sentence
0,UA0102005,bicycle_bought,50,2023-11-18,2023-11-10,blah blah bla
1,UA0116021,bicycle_bought,500,2023-11-18,2023-11-10,blah blah bla
2,UA01,bicycle_bought,15000,2023-11-19,2023-11-10,a different source sentence


In [20]:
# Attach each assertion to the location rows sharing its pcode (inner join,
# one result row per language of the location), then show only the English rows.
df_joined = pd.merge(df_location, df_assertions, on='pcode')
df_joined.loc[df_joined['lang_code'] == 'en']

Unnamed: 0,country,pcode_prefix,location_name,pcode,adm_lvl,lang_code,location_normalized,lvl_pcode_len,split_pcode,attribute,numeric_value,date_of_observation,date_of_event,source_sentence
0,Ukraine,UA,Bakhchysaraiska,UA0102005,3,en,bakhchysaraiska,9,UA.01.02.005,bicycle_bought,50,2023-11-18,2023-11-10,blah blah bla
3,Ukraine,UA,Novoandriivska,UA0116021,3,en,novoandriivska,9,UA.01.16.021,bicycle_bought,500,2023-11-18,2023-11-10,blah blah bla
6,Ukraine,UA,Autonomous Republic of Crimea,UA01,1,en,autonomousrepublicofcrimea,4,UA.01,bicycle_bought,15000,2023-11-19,2023-11-10,a different source sentence


## END

In [21]:
# Scratch: index the raw sheet by its four pcode levels.
# NOTE: `df` is whatever the loader loop assigned last (the final entry of
# `files`), and this requires that sheet to have adm0..adm3 pcode columns.
dfx = df.set_index(['adm0_pcode','adm1_pcode','adm2_pcode','adm3_pcode'])
dfx

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,adm0_en,adm0_ar,adm1_en,adm1_ar,adm2_en,adm2_ar,adm3_en,adm3_ar,admin3refname_en,lastupdatedate,validon,validto
adm0_pcode,adm1_pcode,adm2_pcode,adm3_pcode,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
SY,SY07,SY0705,SY070500,Syrian Arab Republic,الجمهورية العربية السورية,Idleb,إدلب,Ariha,أريحا,Ariha,مركز أريحا,Ariha,2016-09-05,2016-09-05,
SY,SY07,SY0705,SY070501,Syrian Arab Republic,الجمهورية العربية السورية,Idleb,إدلب,Ariha,أريحا,Ehsem,احسم,Ehsem,2016-09-05,2016-09-05,
SY,SY07,SY0705,SY070502,Syrian Arab Republic,الجمهورية العربية السورية,Idleb,إدلب,Ariha,أريحا,Mhambal,محمبل,Mhambal,2016-09-05,2016-09-05,
SY,SY07,SY0704,SY070400,Syrian Arab Republic,الجمهورية العربية السورية,Idleb,إدلب,Jisr-Ash-Shugur,جسر الشغور,Jisr-Ash-Shugur,مركز جسر الشغور,Jisr-Ash-Shugur,2016-09-05,2016-09-05,
SY,SY07,SY0704,SY070401,Syrian Arab Republic,الجمهورية العربية السورية,Idleb,إدلب,Jisr-Ash-Shugur,جسر الشغور,Badama,بداما,Badama,2016-09-05,2016-09-05,
SY,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SY,SY03,SY0309,SY030900,Syrian Arab Republic,الجمهورية العربية السورية,Rural Damascus,ريف دمشق,Darayya,داريا,Markaz Darayya,مركز داريا,Markaz Darayya,2016-09-05,2016-09-05,
SY,SY03,SY0309,SY030902,Syrian Arab Republic,الجمهورية العربية السورية,Rural Damascus,ريف دمشق,Darayya,داريا,Hajar Aswad,الحجر الأسود,Hajar Aswad,2016-09-05,2016-09-05,
SY,SY01,SY0100,SY010000,Syrian Arab Republic,الجمهورية العربية السورية,Damascus,دمشق,Damascus,دمشق,Damascus,دمشق,Damascus,2016-09-05,2016-09-05,
SY,SY03,SY0301,SY030102,Syrian Arab Republic,الجمهورية العربية السورية,Rural Damascus,ريف دمشق,Rural Damascus,مركز ريف دمشق,Babella,ببيلا,Babella,2016-09-05,2016-09-05,


In [22]:
# Scratch: index the same leftover `df` by its English names at levels 0-3,
# sorted so the MultiIndex can be sliced.
dfx_nm = df.set_index(['adm0_en','adm1_en','adm2_en','adm3_en'])
dfx_nm = dfx_nm.sort_index()
# constant placeholder value on every row
dfx_nm['num_dead'] = 10
# Commented-out attempts to overwrite the value for part of the hierarchy
# (they assign through chained indexing):
#dfx_nm.loc[('Ukraine','Autonomous Republic of Crimea','Bakhchysaraiskyi','Aromatnenska')]['num_dead'] = 20
#dfx_nm.loc[('Ukraine','Autonomous Republic of Crimea','Bakhchysaraiskyi')][['num_dead']] = 200


In [23]:
# Show only the placeholder column, keeping the name-based MultiIndex.
dfx_nm[['num_dead']] #.loc[('Ukraine','Autonomous Republic of Crimea','Bakhchysaraiskyi')]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,num_dead
adm0_en,adm1_en,adm2_en,adm3_en,Unnamed: 4_level_1
Syrian Arab Republic,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,10
Syrian Arab Republic,Al-Hasakeh,Al-Hasakeh,Areesheh,10
Syrian Arab Republic,Al-Hasakeh,Al-Hasakeh,Be'r Al-Hulo Al-Wardeyyeh,10
Syrian Arab Republic,Al-Hasakeh,Al-Hasakeh,Hole,10
Syrian Arab Republic,Al-Hasakeh,Al-Hasakeh,Markada,10
Syrian Arab Republic,...,...,...,...
Syrian Arab Republic,Tartous,Tartous,Kareemeh,10
Syrian Arab Republic,Tartous,Tartous,Kherbet Elma'aza,10
Syrian Arab Republic,Tartous,Tartous,Safsafa,10
Syrian Arab Republic,Tartous,Tartous,Soda Khawabi,10


In [24]:
# NOTE(review): this cell raises KeyError: 'Ukraine' (see the recorded output).
# Plain [] on a DataFrame selects a column, not an index entry, and the
# displayed dfx_nm rows are all 'Syrian Arab Republic'.
dfx_nm['Ukraine']['Autonomous Republic of Crimea']['Bakhchysaraiskyi']['Aromatnenska']

KeyError: 'Ukraine'

In [None]:
# Scratch: print every index entry of dfx, one per line
# (each entry is a tuple of the four pcode levels).
for i in dfx.index:
    print(i)

In [32]:
# Candidate place strings to label with a country; the list also contains
# non-place fragments, which are expected to find no pcode.
x = ['Hama', 'Harim', 'Idleb', 'Afrin', 'Aleppo', 'Kahramanmara', 'Malatya', 'Hatay', 'Syria', 'Adana', 'Gaziantep', 'Sanliurfa', 'Zonguldak', 'Ordu', 'Adiyaman', 'Sinop', 'ENAR(httpsreliefweb', 'Lattakia', 'Bingl', 'Kayseri', 'Mardin', 'Tunceli', 'Batman', 'Turkey', 'Samsun', 'Amasya', 'Jandairis', 'intsitesdefaultfilesstylessmallpublicpreviews75d275d203f6e46e4f6d8ade8c77a85cb739', 'Kastamonu', 'Homs', 'afadTasarimafadlogoen', 'Tartous', 'intnode3976432', 'intnode3934516', 'Provinces of', 'intnode3941882', 'Mersin', 'intnode3963175', 'intnode3938340', 'Sakarya', 'Bolu', 'Bartin', 'Karabk', 'Giresun', 'intnode3969031', 'intnode3944550', 'Naqaa']

# display name for each pcode prefix of interest
country_by_prefix = {'TU': 'Turkiye', 'SY': 'Syria'}

for l in x:
    r = get_pcode_from_location(l)

    if len(r) == 0:
        continue

    # Decide the country from the first result only. Anything that is not a
    # TU/SY pcode (another country's pcode, or the "No exact match ..."
    # message returned by the fuzzy fallback) is skipped, so a country left
    # over from a previous iteration is never printed.
    country = country_by_prefix.get(r[0][:2])
    if country is None:
        continue

    print(f"{l};{l}, {country}")

more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['SY050100', 'SY05', 'SY0501']
Hama;Hama, Syria
more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['SY0703', 'SY070300']
Harim;Harim, Syria
more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['SY07', 'SY0700', 'SY070000']
Idleb;Idleb, Syria
more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['SY020300', 'SY0203']
Afrin;Afrin, Syria
Aleppo;Aleppo, Syria
Kahramanmara;Kahramanmara, Turkiye
Malatya;Malatya, Turkiye
Hatay;Hatay, Turkiye
Adana;Adana, Turkiye
Gaziantep;Gaziantep, Turkiye
Sanliurfa;Sanliurfa, Turkiye
more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) ['TUR067008'