# proGres v4 Data Quality Script

# Section 1: Setup / Import data

## 1.1 Notebook setup

#### Import libraries needed

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime as datetime
import pyodbc
import urllib
import sqlalchemy
import requests
import time
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 0)

## 1.2 Import proGres v4 Individual and Location Level Tables

### Connect to database

In [3]:
# text1 = ''
# text2 = ''
saveFile = open('pwd.txt', 'r+')
# saveFile.write(text1 + ',' + text2)
# saveFile.seek(0)
uap = saveFile.read()

In [4]:
ip_of_server = "**.***.**.**"
driver = 'SQL Server'
ip = ip_of_server
db_connection = pyodbc.connect(
    driver = driver,
    Server = ip,
    Port = "1433",
    Database = "QualityCheck_Egypt",
    UID = uap.split(',')[0],
    PWD = uap.split(',')[1])

In [5]:
# checking
pyodbc.drivers()

['SQL Server']

In [6]:
# checking
type(db_connection)

pyodbc.Connection

### 1.1.1 Import v4 Individual table

In [7]:
dfi = pd.read_sql_query("SELECT * FROM dbo.Filteredprogres_individual", db_connection)
dfi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290220 entries, 0 to 290219
Data columns (total 55 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   modifiedon                         290220 non-null  datetime64[ns]
 1   createdon                          290220 non-null  datetime64[ns]
 2   progres_registrationgroupidname    290220 non-null  object        
 3   progres_placeofbirthidname         290163 non-null  object        
 4   createdbyname                      290220 non-null  object        
 5   progres_indvidualid                290220 non-null  object        
 6   progres_sexname                    290220 non-null  object        
 7   progres_id                         290220 non-null  object        
 8   progres_religionidname             290213 non-null  object        
 9   progres_isdeceased                 290220 non-null  bool          
 10  modifiedbyname      

#### Filter for active and hold and Business Unit in MENA

In [8]:
dfi[["statuscode" , "statuscodename", "progres_individualid"]].groupby(["statuscode" , "statuscodename"], dropna=False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,progres_individualid
statuscode,statuscodename,Unnamed: 2_level_1
1,Active,290178
125080000,Hold,42


In [9]:
# Filter for statuscode [1, 125080000] 
# which correspond to statuscodename [Active, Hold] respectively
dfi = dfi[dfi.statuscode.isin([1, 125080000])]
dfi.shape

(290220, 55)

In [10]:
# Only keep MENA business units

In [11]:
list_mena_bu = ['Mauritania - Bassikounou',
                'Mauritania - Urban',
                'Tunisia - CO',
                'Egypt - CO',
                'Lebanon - North',
                'Lebanon - BML',
                'Lebanon - South',
                'Lebanon - Bekaa',
                'Algeria - CO',
                'Iraq - CO',
                'Israel - CO',
                'Jordan - Camps',
                'Jordan - Urban',
                'Kuwait - CO',
                'Morocco - CO',
                'RO Riyadh',
                'Syria - CO',
                'UAE - CO']

In [12]:
# dfi[["progres_businessunit" , 
#      "progres_businessunitname", 
#      "progres_individualid"]].groupby(["progres_businessunit" , 
#                                        "progres_businessunitname"], dropna=False).count()

In [13]:
dfi = dfi[dfi.progres_businessunitname.isin(list_mena_bu)]
dfi.shape

(290220, 55)

#### Check for and remove any duplicated entries

In [14]:
dfi[dfi.duplicated(keep=False)].head()

Unnamed: 0,modifiedon,createdon,progres_registrationgroupidname,progres_placeofbirthidname,createdbyname,progres_indvidualid,progres_sexname,progres_id,progres_religionidname,progres_isdeceased,modifiedbyname,statuscodename,progres_coolocationlevel1name,progres_familyname,progres_fathersname,progres_nationalitylookupname,progres_registrationgroupid,progres_countryoforiginid,progres_maritalstatusname,progres_coolocationlevel2name,progres_registrationreason,progres_agecohortname,progres_relationshiptofp,progres_businessunit,progres_registrationdate,progres_individualid,progres_coalocationlevel1name,progres_refugeestatuscategoryname,progres_age,progres_sex,progres_biometricstatus,progres_dateofbirth,progres_ethnicityidname,progres_registrationreasonname,progres_countryofasylumidname,progres_coolocationlevel3name,progres_givenname,progres_refugeestatus,progres_countryofasylumid,progres_maritalstatus,statuscode,progres_primaryphonenumber,progres_coalocationlevel2name,progres_coalocationlevel3name,progres_businessunitname,progres_arrivaldate,progres_refugeestatusname,progres_relationshiptofpname,progres_deceaseddate,progres_countryoforiginidname,progres_nationalitylookup,progres_placeofbirthcity,progres_fleddate,progres_hasphoto,progres_educationlevelname


### 1.1.2 Import v4 Admin Location Levels Table

In [15]:
admin = pd.read_sql_query("SELECT * FROM dbo.Filteredprogres_locationlevel", db_connection)

In [16]:
admin.statuscodename.value_counts()

Active    442999
Name: statuscodename, dtype: int64

In [17]:
# admin = admin[admin.statecodename=="Active"]
admin.shape

(442999, 41)

In [18]:
admin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442999 entries, 0 to 442998
Data columns (total 41 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   createdby                     442999 non-null  object        
 1   createdbyname                 442999 non-null  object        
 2   createdbyyominame             442999 non-null  object        
 3   createdon                     442999 non-null  datetime64[ns]
 4   createdonutc                  442999 non-null  datetime64[ns]
 5   createdonbehalfby             0 non-null       object        
 6   createdonbehalfbyname         0 non-null       object        
 7   createdonbehalfbyyominame     0 non-null       object        
 8   importsequencenumber          165503 non-null  float64       
 9   modifiedby                    442999 non-null  object        
 10  modifiedbyname                442999 non-null  object        
 11  modifiedbyyom

In [19]:
admin = admin[(admin.progres_countryidname=="Egypt")|(admin.progres_countryidname=="Syrian Arab Republic")]\
        [[ 'progres_code',
       'progres_countryid', 'progres_countryidname',
       'progres_description_1025', 'progres_description_1036',
       'progres_description_3082', 'progres_filtercode', 'progres_level',
       'progres_locationlevel', 'progres_locationlevelid']]
admin.shape

(13599, 10)

In [20]:
# not sure if we should be using progres_filtercode or progres_code; they don't always match
# most of these are at level 4, one at level 3
admin[admin.progres_filtercode!=admin.progres_code].shape

(38, 10)

In [21]:
# ahmed
# record(s) at level 3 where the two pcode fields in v4 don't match
# Al Makhrim is a Level 2 and Level 3 Name but pcode here is wrong, in the OCHA list, the pcode is SY040600 (NOT SY040650 nor SY040660)
admin[(admin.progres_filtercode!=admin.progres_code)&
     (admin.progres_level==3)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid
252167,21SYR004006060,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Al Makhrim,F_Al Makhrim,S_Al Makhrim,21SYR004006050,3.0,Al Makhrim,F15D0D53-6F8F-EC11-8140-001DD8B71FE3


In [22]:
# ahmed
# record(s) at level 4 where the two pcode fields in v4 don't match
admin[(admin.progres_filtercode!=admin.progres_code)&
     (admin.progres_level==4)].head(2)

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid
77334,21SYR001000008009,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Tabbaleh,F_Tabbaleh,S_Tabbaleh,21SYR001000008001,4.0,Tabbaleh,3EFF797F-718F-EC11-811F-00155D5CC4A0
77351,21SYR001000008010,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Damascus City,F_Damascus City,S_Damascus City,21SYR001000008001,4.0,Damascus City,4FFF797F-718F-EC11-811F-00155D5CC4A0


In [23]:
admin[admin.progres_countryidname=="Egypt"].progres_level.value_counts()

3.0    893
2.0    373
1.0    29 
Name: progres_level, dtype: int64

In [24]:
admin[admin.progres_countryidname=="Syrian Arab Republic"].progres_level.value_counts()

4.0    6595
3.0    5633
2.0    62  
1.0    14  
Name: progres_level, dtype: int64

In [25]:
admin["len_pcode_v1"] = admin["progres_code"].apply(lambda x: len(x))
admin["len_pcode_v2"] = admin["progres_filtercode"].apply(lambda x: len(x))
admin.groupby(["progres_level", "len_pcode_v1", "len_pcode_v2"]).progres_filtercode.count()

progres_level  len_pcode_v1  len_pcode_v2
1.0            6             6               1   
               8             8               42  
2.0            11            11              435 
3.0            14            14              6526
4.0            17            17              6595
Name: progres_filtercode, dtype: int64

In [26]:
# ahmed
# There is one entry where the length of unhcr pcode is wrong so we'll omit this one
admin[(admin.len_pcode_v1==6)&(admin.progres_level==1)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2
412042,EGY000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_-,F_-,S_-,EGY000,1.0,-,DCA29815-C25F-E711-80C5-001DD8B71FE4,6,6


In [27]:
# remove addresses at level 4 and those with UNCHR pcodes of length 6
admin = admin[(admin.len_pcode_v1!=6) & (admin.progres_level!=4)]

In [28]:
admin.loc[:,"len_pcode_v1"] = admin["progres_code"].apply(lambda x: len(x))
admin.loc[:,"len_pcode_v2"] = admin["progres_filtercode"].apply(lambda x: len(x))
admin.groupby(["progres_level", "len_pcode_v1", "len_pcode_v2"]).progres_filtercode.count()

progres_level  len_pcode_v1  len_pcode_v2
1.0            8             8               42  
2.0            11            11              435 
3.0            14            14              6526
Name: progres_filtercode, dtype: int64

In [29]:
# Convert UNHCR pcodes to OCHA pcodes
admin.loc[:,"ocha_pcode"] = admin["progres_code"].apply(lambda x: x[2:4] + x[6:8] + x[9:11] + x[12:14] if len(x)==14  
                                                           else(x[2:4] + x[6:8] + x[9:11] if len(x)==11
                                                                else(x[2:4] + x[6:8])))

In [30]:
admin.loc[:,"len_pcode_ocha"] = admin["ocha_pcode"].apply(lambda x: len(x))
admin.groupby(["progres_level", "len_pcode_ocha"], dropna=False).progres_filtercode.count()

progres_level  len_pcode_ocha
1.0            4                 42  
2.0            6                 435 
3.0            8                 6526
Name: progres_filtercode, dtype: int64

In [31]:
admin[admin.progres_countryidname=="Egypt"].progres_level.value_counts()

3.0    893
2.0    373
1.0    28 
Name: progres_level, dtype: int64

In [32]:
admin[admin.progres_countryidname=="Syrian Arab Republic"].progres_level.value_counts()

3.0    5633
2.0    62  
1.0    14  
Name: progres_level, dtype: int64

In [33]:
admin[admin.progres_filtercode!=admin.progres_code]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha
252167,21SYR004006060,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Al Makhrim,F_Al Makhrim,S_Al Makhrim,21SYR004006050,3.0,Al Makhrim,F15D0D53-6F8F-EC11-8140-001DD8B71FE3,14,14,SY040660,8


In [34]:
admin[admin.progres_level==1].sample(1)

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha
299779,20EGY031,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Red Sea,F_Red Sea,S_Red Sea,20EGY031,1.0,Red Sea,B0F0AC72-E114-E911-80DE-001DD8B71FE4,8,8,EG31,4


## 1.3 Import official OCHA names for Egypt

In [35]:
# Source: https://data.humdata.org/dataset/cod-ab-egy # no location data provided so only taking admin3, admin2, and admin1 
egypt_ocha3 = pd.read_excel(r'egy_adminboundaries_tabulardata.xlsx', sheet_name='Admin3')
egypt_ocha2 = pd.read_excel(r'egy_adminboundaries_tabulardata.xlsx', sheet_name='Admin2')
egypt_ocha1 = pd.read_excel(r'egy_adminboundaries_tabulardata.xlsx', sheet_name='Admin1')

In [36]:
# Label admin levels and add extra column needed for joins with v4 data
egypt_ocha3["progres_level"] = 3
egypt_ocha3["ocha_pcode"] = egypt_ocha3["admin3Pcode"]
egypt_ocha2["progres_level"] = 2
egypt_ocha2["ocha_pcode"] = egypt_ocha2["admin2Pcode"]
egypt_ocha1["progres_level"] = 1
egypt_ocha1["ocha_pcode"] = egypt_ocha1["admin1Pcode"]
# concantenate data from levels 1 2 and 3
egypt_ocha = pd.concat([egypt_ocha3, egypt_ocha2, egypt_ocha1]).reset_index().drop(columns=["index"])

In [37]:
egypt_ocha.head(2)

Unnamed: 0,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,progres_level,ocha_pcode,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
0,1,Polygon,<Null>,الجزيره,EG120918,<Null>,<Null>,<Null>,<Null>,<Null>,Dikirnis,مركز دكرنس,EG1209,Dakahlia,الدقهلية,EG12,Egypt,مِصر,EG,2006-01-01,2017-04-21,<Null>,0.055274,0.000157,3,EG120918,,,,,,,,,,,
1,2,Polygon,<Null>,الجهاد 4,EG240525,<Null>,<Null>,<Null>,<Null>,<Null>,Markz Al Idwa,مركز العدوة,EG2405,Menia,المنيا,EG24,Egypt,مِصر,EG,2006-01-01,2017-04-21,<Null>,0.101889,0.000594,3,EG240525,,,,,,,,,,,


## 1.4 Import official OCHA names for Syria

In [38]:
syria_ocha_loc = pd.read_excel(r'syr_humanitarian_locations_20200816.xlsx', sheet_name='Location')

In [39]:
# Source: https://data.humdata.org/dataset/cod-ab-syr
syria_ocha1 = pd.read_excel(r'syr_humanitarian_locations_20200816.xlsx', sheet_name='admin1')
syria_ocha2= pd.read_excel(r'syr_humanitarian_locations_20200816.xlsx', sheet_name='admin2')
syria_ocha3 = pd.read_excel(r'syr_humanitarian_locations_20200816.xlsx', sheet_name='admin3')
# add details on level and create new column to join with v4 on
syria_ocha3["progres_level"] = 3
syria_ocha3["ocha_pcode"] = syria_ocha3["admin3Pcode"]
syria_ocha2["progres_level"] = 2
syria_ocha2["ocha_pcode"] = syria_ocha2["admin2Pcode"]
syria_ocha1["progres_level"] = 1
syria_ocha1["ocha_pcode"] = syria_ocha1["admin1Pcode"]
# concantenate data from levels 1 2 and 3
syria_ocha = pd.concat([syria_ocha3, syria_ocha2, syria_ocha1]).reset_index().drop(columns=["index"])

In [40]:
syria_ocha.head(2)

Unnamed: 0,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,progres_level,ocha_pcode,admin2RefName,admin1RefName
0,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Ariha,أريحا,SY0705,Ariha,مركز أريحا,SY070500,Ariha,2016-09-05,2016-09-05,,3,SY070500,,
1,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Ariha,أريحا,SY0705,Ehsem,احسم,SY070501,Ehsem,2016-09-05,2016-09-05,,3,SY070501,,


#### Compare between OCHA's location sheet (which exists only for Syria) and the combined admin 1 2 3 list for OCHA

In [41]:
# pcodes that exist in OCHA combined list but not in OCHA's location level list
[ x for x in syria_ocha[syria_ocha.progres_level==3].ocha_pcode.unique().tolist()
 if x not in syria_ocha_loc.admin3Pcode.unique().tolist()]

['SY140201', 'SY140003']

In [42]:
# pcodes that exist in OCHA combined list but not in location level list
[ x for x in syria_ocha[syria_ocha.progres_level==2].ocha_pcode.unique().tolist()
 if x not in syria_ocha_loc.admin2Pcode.unique().tolist()]

[]

In [43]:
# pcodes that exist in OCHA combined list but not in location level list
[ x for x in syria_ocha[syria_ocha.progres_level==1].ocha_pcode.unique().tolist()
 if x not in syria_ocha_loc.admin1Pcode.unique().tolist()]

[]

In [44]:
# ahmed kaleem
# Example: these two exist in levels sheets of ocha spreadsheet but not the location list
syria_ocha[syria_ocha.ocha_pcode.isin(['SY140201', 'SY140003'])]

Unnamed: 0,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,progres_level,ocha_pcode,admin2RefName,admin1RefName
65,Syrian Arab Republic,الجمهورية العربية السورية,SY,Quneitra,القنيطرة,SY14,Al Fiq,فيق,SY1402,Al-Butayhah,البطيحة,SY140201,Al-Butayhah,2016-09-05,2016-09-05,,3,SY140201,,
69,Syrian Arab Republic,الجمهورية العربية السورية,SY,Quneitra,القنيطرة,SY14,Quneitra,مركز القنيطرة,SY1400,Masaada,مسعدة,SY140003,Masaada,2016-09-05,2016-09-05,,3,SY140003,,


In [45]:
syria_ocha_loc[syria_ocha_loc.admin3Pcode.isin(['SY140201', 'SY140003'])]

Unnamed: 0,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin4Name_en,admin4Name_ar,admin4Pcode,LocationName_en,LocationName_ar,Location_Pcode,location_type,LastUpdateDate,Latitude_y,Longitude_x


In [46]:
# this means that the separate admin lists are more comprehensive than the location sheet in the OCHA list

In [47]:
[ x for x in syria_ocha_loc.admin1Pcode.unique().tolist()  if x not in syria_ocha[syria_ocha.progres_level==1].ocha_pcode.unique().tolist()]

[]

In [48]:
[ x for x in syria_ocha_loc.admin2Pcode.unique().tolist()  if x not in syria_ocha[syria_ocha.progres_level==2].ocha_pcode.unique().tolist()]

[]

In [49]:
[ x for x in syria_ocha_loc.admin3Pcode.unique().tolist()  if x not in syria_ocha[syria_ocha.progres_level==3].ocha_pcode.unique().tolist()]

[]

# Section 2: Combine v4 data with OCHA data

In [50]:
# v4 location level data from Egypt
egypt_v4 = admin[admin.progres_countryidname=="Egypt"]
syria_v4 = admin[admin.progres_countryidname=="Syrian Arab Republic"]

In [51]:
# Merge v4 data with ocha data
# could do left join instead if we decide are only interested in the ocha list 
# that and will tag all other addresses as erroneous if their respective pcodes exist in v4 but not in ocha
egypt = pd.merge(egypt_v4, egypt_ocha, how="outer", on=["progres_level", "ocha_pcode"])
syria = pd.merge(syria_v4, syria_ocha, how="outer", on=["progres_level", "ocha_pcode"])

In [52]:
print("Egypt", egypt_ocha.shape, egypt_v4.shape, egypt.shape)
print("Syria", syria_ocha.shape, syria_v4.shape, syria.shape)

Egypt (6108, 37) (1294, 14) (6216, 49)
Syria (348, 20) (5709, 14) (5746, 32)


## 2.1 But first: Check between dfi table names and v4 location level table

In [53]:
# Lists of level 1 level 2 and level 3 addresses in the individual tables for Egypt and Syria 

eg1 = dfi[dfi.progres_countryofasylumidname=="Egypt"].progres_coalocationlevel1name.unique().tolist()
eg2 = dfi[dfi.progres_countryofasylumidname=="Egypt"].progres_coalocationlevel2name.unique().tolist()
eg3 = dfi[dfi.progres_countryofasylumidname=="Egypt"].progres_coalocationlevel3name.unique().tolist()

sy1 = dfi[dfi.progres_countryoforiginidname=="Syrian Arab Republic"].progres_coolocationlevel1name.unique().tolist()
sy2 = dfi[dfi.progres_countryoforiginidname=="Syrian Arab Republic"].progres_coolocationlevel3name.unique().tolist()
sy3 = dfi[dfi.progres_countryoforiginidname=="Syrian Arab Republic"].progres_coolocationlevel3name.unique().tolist()


### Egypt: Are all names in dfi in v4 location level table?
not many issues just Abu Durba in Egypt

In [54]:
# bad level 1
len([x for x in eg1 if x not in egypt[egypt.progres_level==1].progres_locationlevel.unique().tolist()])

2

In [55]:
# bad level 1 details 
[x for x in eg1 if x not in egypt[egypt.progres_level==1].progres_locationlevel.unique().tolist()][:5]

[None, '-']

In [56]:
# good level 1
len([x for x in eg1 if x in egypt[egypt.progres_level==1].progres_locationlevel.unique().tolist()])

27

In [57]:
# bad level 2
len([x for x in eg2 if x not in egypt[egypt.progres_level==2].progres_locationlevel.unique().tolist()])

2

In [58]:
# bad level 2 details
# Because Abu Durba exists in the location level table at level 3, not 2! See below
[x for x in eg2 if x not in egypt[egypt.progres_level==2].progres_locationlevel.unique().tolist()]

[None, 'Abu Durba']

In [59]:
# ahmed
# this exists in dfi individual table at level 2 but not in the location levels table at level 2 (in location levels table, it's listed as a level 3 location)
# could this be because of open text field?
dfi[dfi.progres_coalocationlevel2name=="Abu Durba"][["progres_coalocationlevel1name",
                                                     "progres_coalocationlevel2name",
                                                     "progres_coalocationlevel3name"]]

Unnamed: 0,progres_coalocationlevel1name,progres_coalocationlevel2name,progres_coalocationlevel3name
14112,South Sinai,Abu Durba,
14114,South Sinai,Abu Durba,
14116,South Sinai,Abu Durba,
14118,South Sinai,Abu Durba,
17456,South Sinai,Abu Durba,
179218,South Sinai,Abu Durba,
205906,South Sinai,Abu Durba,
205909,South Sinai,Abu Durba,
205912,South Sinai,Abu Durba,


In [60]:
admin[admin.progres_locationlevel=="Abu Durba"].head()

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha
323247,20EGY035001001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Abu Durba,F_Abu Durba,S_Abu Durba,20EGY035001001,3.0,Abu Durba,74FAAC72-E114-E911-80DE-001DD8B71FE4,14,14,EG350101,8


In [61]:
# good level 2
len([x for x in eg2 if x in egypt[egypt.progres_level==2].progres_locationlevel.unique().tolist()])

283

In [62]:
# bad level 3 
len([x for x in eg3 if x not in egypt[egypt.progres_level==3].progres_locationlevel.unique().tolist()])

1

In [63]:
# bad level 3 details
[x for x in eg3 if x not in egypt[egypt.progres_level==3].progres_locationlevel.unique().tolist()]

[None]

In [64]:
# good level 3
len([x for x in eg3 if x in egypt[egypt.progres_level==3].progres_locationlevel.unique().tolist()])

75

### Syria: Are all names in dfi in v4 location level table?
More issues in Syria, including names that don't even exist in Syria are listed as location level 1 

In [65]:
# bad level 1
len([x for x in sy1 if x not in syria[syria.progres_level==1].progres_locationlevel.unique().tolist()])

11

In [66]:
# ahmed
# bad level 1 details
# names that don't even exist in Syria are listed as location level 1 : eg. Cairo, Baghdad, Khartoum etc.
[x for x in sy1 if x not in syria[syria.progres_level==1].progres_locationlevel.unique().tolist()][:12]

[None,
 'Awdal',
 'Al-hasakeh',
 '-',
 'Khartoum',
 'North Kordofan',
 'Tigray',
 'Bursa',
 'Cairo',
 'Baghdad',
 'Erbil']

In [67]:
# good level 1
len([x for x in sy1 if x in syria[syria.progres_level==1].progres_locationlevel.unique().tolist()])

14

In [68]:
# ahmed
# bad level 2
# names must be typed up because we see Damascus (Joubar) ? 
[x for x in sy2 if x not in syria[syria.progres_level==2].progres_locationlevel.unique().tolist()][:15]

['Babella',
 None,
 'El-maidaan',
 'Hajar Aswad',
 'Tadhamon',
 'Damascus (Joubar)',
 'Harasta',
 'Jobar',
 'El-shaghoor',
 'Arbin',
 'El-mazzeh',
 'Masaken Barzeh',
 'Sahnaya',
 'Ein El-Bayda',
 'Kisweh']

In [69]:
# bad level 2
len([x for x in sy2 if x not in syria[syria.progres_level==2].progres_locationlevel.unique().tolist()])

155

In [70]:
# good level 2
len([x for x in sy2 if x in syria[syria.progres_level==2].progres_locationlevel.unique().tolist()])

42

In [71]:
# bad level 3
len([x for x in sy3 if x not in syria[syria.progres_level==3].progres_locationlevel.unique().tolist()])

1

In [72]:
# bad level 3
[x for x in sy3 if x not in syria[syria.progres_level==3].progres_locationlevel.unique().tolist()][:10]

[None]

In [73]:
# good level 3
len([x for x in sy3 if x in syria[syria.progres_level==3].progres_locationlevel.unique().tolist()])

196

##  2.2 Explore how many matched between v4 and ocha, how many didn't based on pcodes
they didn't match because the pcodes didn't exist in the right table or didn't exist at all 

#### Egypt issues with v4 and ocha matches

In [74]:
# pcodes that exist in OCHA but not in v4 (this is not so much of a problem? maybe people did not come from these locations?) 
egypt[(egypt.len_pcode_v1.isna())].shape

(4922, 49)

In [75]:
# pcodes that exist in OCHA but not in v4 (this is not so much of a problem? maybe people did not come from these locations?) 
egypt[egypt.len_pcode_v1.isna()].progres_level.value_counts()

3.0    4921
2.0    1   
Name: progres_level, dtype: int64

In [76]:
# ahmed
# pcodes that exist in OCHA but not in v4 (this is not so much of a problem? maybe people did not come from these locations?) 
# Egypt "Halayib" with pcode EG3107
egypt[(egypt.len_pcode_v1.isna()) & (egypt.progres_level==2)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
6215,,,,,,,,2.0,,,,,EG3107,,167.0,Polygon,,,,,,,,,Halayib,قسم حلايب,EG3107,Red Sea,البحر الأحمر,EG31,Egypt,مِصر,EG,2006-01-01,2017-04-21,,3.902031,0.662337,Halayib,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,


In [77]:
# egypt_v4[(egypt_v4.progres_level==2)&(egypt_v4.ocha_pcode=="EG3107")]

In [78]:
# pcodes that exist in  v4 but not in OCHA list
egypt[egypt.admin0Pcode.isna()].shape

(100, 49)

In [79]:
# pcodes that exist in v4 but not in OCHA
egypt[egypt.admin0Pcode.isna()].progres_level.value_counts()

3.0    98
1.0    1 
2.0    1 
Name: progres_level, dtype: int64

In [80]:
# ahmed
# pcodes that exist in v4 but not in OCHA
# This is a disputed admin2 location: Shallatin - Disputed
egypt[(egypt.admin0Pcode.isna()) &(egypt.progres_level==2)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
400,20EGY040001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Shallatin - Disputed,F_Shallatin - Disputed,S_Shallatin - Disputed,20EGY040001,2.0,Shallatin - Disputed,92F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG4001,6.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,


In [81]:
# ahmed
# pcodes that exist in v4 but not in OCHA
# This is a disputed admin1 location: Ma'tan al-Sarra - Disputed
egypt[(egypt.admin0Pcode.isna()) &(egypt.progres_level==1)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
31,20EGY040,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ma'tan al-Sarra - Disputed,F_Ma'tan al-Sarra - Disputed,S_Ma'tan al-Sarra - Disputed,20EGY040,1.0,Ma'tan al-Sarra - Disputed,A0F0AC72-E114-E911-80DE-001DD8B71FE4,8.0,8.0,EG40,4.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,


In [82]:
# ahmed
# pcodes that exist in v4 but not in OCHA

# Example: ocha_pcode EG250003 or EG240003	both at level 3 and names Al Itmaniyah and Gabal el Teir respectively did not exist in the OCHA list

# another example see for progres_level = 3: 
# EG020001 Kharj Al-Zamam and EG400101 Shallatin
# Kharj Al-Zamam doesn't exist in the OCHA list and
# Shallatin is at admin2 level in OCHA (EG3106) 
# egypt[egypt.admin0Pcode.isna()].iloc[:,24:].head(20)
egypt[egypt.admin0Pcode.isna()].head(20)

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
31,20EGY040,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ma'tan al-Sarra - Disputed,F_Ma'tan al-Sarra - Disputed,S_Ma'tan al-Sarra - Disputed,20EGY040,1.0,Ma'tan al-Sarra - Disputed,A0F0AC72-E114-E911-80DE-001DD8B71FE4,8.0,8.0,EG40,4.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
400,20EGY040001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Shallatin - Disputed,F_Shallatin - Disputed,S_Shallatin - Disputed,20EGY040001,2.0,Shallatin - Disputed,92F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG4001,6.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
438,20EGY002000001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Kharj Al-Zamam,F_Kharj Al-Zamam,S_Kharj Al-Zamam,20EGY002000001,3.0,Kharj Al-Zamam,DEF3AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG020001,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
524,20EGY040001001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Shallatin,F_Shallatin,S_Shallatin,20EGY040001001,3.0,Shallatin,8AF4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG400101,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
525,20EGY014000001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ezbet el-Insha,F_Ezbet el-Insha,S_Ezbet el-Insha,20EGY014000001,3.0,Ezbet el-Insha,8CF4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG140001,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
526,20EGY001000001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ash Shabasiyah,F_Ash Shabasiyah,S_Ash Shabasiyah,20EGY001000001,3.0,Ash Shabasiyah,8EF4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG010001,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
527,20EGY001000002,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Kafr ash Shara inah,F_Kafr ash Shara inah,S_Kafr ash Shara inah,20EGY001000002,3.0,Kafr ash Shara inah,90F4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG010002,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
528,20EGY001000003,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Kafr Mas ud,F_Kafr Mas ud,S_Kafr Mas ud,20EGY001000003,3.0,Kafr Mas ud,92F4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG010003,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
560,20EGY004005002,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ezbet Farhan Eid,F_Ezbet Farhan Eid,S_Ezbet Farhan Eid,20EGY004005002,3.0,Ezbet Farhan Eid,D2F4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG040502,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,
561,20EGY004005003,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Ma diyah,F_Ma diyah,S_Ma diyah,20EGY004005003,3.0,Ma diyah,D4F4AC72-E114-E911-80DE-001DD8B71FE4,14.0,14.0,EG040503,8.0,,,,,,,,,,,,,,,,,,,,NaT,NaT,,,,,,,,,,,,,,


#### Syria issues with v4 and ocha matches

In [83]:
# pcodes that exist in OCHA but not in v4 (this is not so much of a problem? maybe people did not come from these locations?) 
syria[syria.len_pcode_v1.isna()].shape

(37, 32)

In [84]:
# ahmed 
# pcodes that exist in OCHA but not in v4
syria[syria.len_pcode_v1.isna()].head()

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,admin2RefName,admin1RefName
5709,,,,,,,,3.0,,,,,SY070500,,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Ariha,أريحا,SY0705,Ariha,مركز أريحا,SY070500,Ariha,2016-09-05,2016-09-05,,,
5710,,,,,,,,3.0,,,,,SY070400,,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Jisr-Ash-Shugur,جسر الشغور,SY0704,Jisr-Ash-Shugur,مركز جسر الشغور,SY070400,Jisr-Ash-Shugur,2016-09-05,2016-09-05,,,
5711,,,,,,,,3.0,,,,,SY070300,,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Harim,حارم,SY0703,Harim,مركز حارم,SY070300,Harim,2016-09-05,2016-09-05,,,
5712,,,,,,,,3.0,,,,,SY110300,,Syrian Arab Republic,الجمهورية العربية السورية,SY,Ar-Raqqa,الرقة,SY11,Ath-Thawrah,الثورة,SY1103,Al-Thawrah,مركز الثورة,SY110300,Al-Thawrah,2016-09-05,2016-09-05,,,
5713,,,,,,,,3.0,,,,,SY130300,,Syrian Arab Republic,الجمهورية العربية السورية,SY,As-Sweida,السويداء,SY13,Shahba,شهبا,SY1303,Shahba,مركز شهبا,SY130300,Shahba,2016-09-05,2016-09-05,,,


In [85]:
# pcodes that exist in OCHA but not in v4
syria[syria.len_pcode_v1.isna()].progres_level.value_counts()

3.0    37
Name: progres_level, dtype: int64

In [86]:
# pcodes that exist in v4 but not in the OCHA list
syria[syria.admin0Pcode.isna()].shape

(5277, 32)

In [87]:
# ahmed
# pcodes that exist in v4 but not in the OCHA list

# Some are listed under level 3 in v4, when they probably should be in level 4 according to the OCHA list 
# Examples: 
# Southern Tal Arish is in Abu Qalqal (SY020501 not SY020532), 
# Western Sweida is in Hama (SY050100 not SY050181), 
# Zenbaq in in Ein Issa (SY110202 not SY110252)

# Others simply have the wrong pcode (eg. Nabul, which has pcode SY040404, not SY020477 as listed in progres v4
# syria[syria.admin0Pcode.isna()].sample(7)
syria[syria.progres_locationlevel.isin(["Southern Tal Arish", "Abu Qalqal", "Western Sweida", "Nabul"])]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,admin2RefName,admin1RefName
526,21SYR002004077,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Nabul,F_Nabul,S_Nabul,21SYR002004077,3.0,Nabul,035F097B-628F-EC11-8128-001DD8B71FE5,14.0,14.0,SY020477,8.0,,,,,,,,,,,,,,,,,,
2004,21SYR002005007,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Abu Qalqal,F_Abu Qalqal,S_Abu Qalqal,21SYR002005007,3.0,Abu Qalqal,00DEA778-628F-EC11-8120-00155D5CC4A1,14.0,14.0,SY020507,8.0,,,,,,,,,,,,,,,,,,
2151,21SYR002005232,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Southern Tal Arish,F_Southern Tal Arish,S_Southern Tal Arish,21SYR002005232,3.0,Southern Tal Arish,7FBA6678-628F-EC11-811F-00155D5CC4A0,14.0,14.0,SY020532,8.0,,,,,,,,,,,,,,,,,,
3090,21SYR005001181,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Western Sweida,F_Western Sweida,S_Western Sweida,21SYR005001181,3.0,Western Sweida,7E03B87E-628F-EC11-811F-00155D5CC4A0,14.0,14.0,SY050181,8.0,,,,,,,,,,,,,,,,,,


## 2.3 Filtering for 100% matched pcodes: how do names differ between v4 location level table and OCHA table
If the pcodes matched but the names didn't match

### Egypt: Examine table matched with v4 and OCHA codes

In [88]:
egypt_matched = egypt[(~egypt.admin1Pcode.isna()) & (~egypt.len_pcode_v1.isna())]
eg_1_match = egypt_matched[(egypt_matched.progres_level==1)].progres_locationlevel.unique().tolist()
eg_2_match = egypt_matched[(egypt_matched.progres_level==2)].progres_locationlevel.unique().tolist()
eg_3_match = egypt_matched[(egypt_matched.progres_level==3)].progres_locationlevel.unique().tolist()

In [89]:
# good ones level 3 : this is fine because location level 3 is not provided in english in the ocha list
egypt_matched[(egypt_matched.progres_level==3) & (egypt_matched.admin3Name_en== egypt_matched.progres_locationlevel)].shape

(0, 49)

In [90]:
# bad ones level 3: this is fine because location level 3 is not provided in english in the ocha list
egypt_matched[(egypt_matched.progres_level==3) & (egypt_matched.admin3Name_en!= egypt_matched.progres_locationlevel)].shape

(795, 49)

In [91]:
# good ones level 2: this is fine because location level 3 is not provided in english in the ocha list
egypt_matched[(egypt_matched.progres_level==2) & (egypt_matched.admin2Name_en== egypt_matched.progres_locationlevel)].shape

(342, 49)

In [92]:
# bad ones level 2: this is fine because location level 3 is not provided in english in the ocha list
egypt_matched[(egypt_matched.progres_level==2) & (egypt_matched.admin2Name_en!= egypt_matched.progres_locationlevel)].shape

(30, 49)

In [93]:
# ahmed
# bad ones level 2 explore
# this includes Meet Abou Ghaleb (which should actually be listed as Kafr Sad at location 2 level, according to OCHA, but pcodes match so it's okay)
# These are the ones where it doesn't look like there's a match, but when we match by pcodes there is a match
egypt_matched[(egypt_matched.progres_level==2) & (egypt_matched.admin2Name_en!= egypt_matched.progres_locationlevel)][["ocha_pcode", "admin2Name_en", "progres_locationlevel"]]

Unnamed: 0,ocha_pcode,admin2Name_en,progres_locationlevel
0,EG1101,Dumyat 1,El Sroo
2,EG1102,Dumyat,Izbet Elbarj
4,EG1103,Fariskur,Kafr Batikh
6,EG1104,Kafr Sad,Meet Abou Ghaleb
8,EG1201,El Mansora 1,Al Kordi
10,EG1501,Kafr Al-Shaykh,Fouh
12,EG1901,Ismailiyya 1,New Alkasaseen
14,EG3101,Hurghada 1,Gulf Of Sueiz
76,EG0305,Al-Dawahy,Al-Dawahy
88,EG2606,Al-Maragha,Al-Maragha


In [94]:
# NOTE: Zemam Out is now matched (after using the combined table for 1 2 3 instead of just level 3 for cascading)
egypt_matched[egypt_matched.progres_locationlevel=="Zemam Out"]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,validTo,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,ValidTo,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
385,20EGY001000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY001000,2.0,Zemam Out,74F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG0100,6.0,351.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG0100,Cairo,القاهرة,EG01,Egypt,مِصر,EG,2006-01-01,2017-04-21,,2.752948,0.142263,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
386,20EGY002000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY002000,2.0,Zemam Out,76F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG0200,6.0,352.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG0200,Alexandria,الاسكندرية,EG02,Egypt,مِصر,EG,2006-01-01,2017-04-21,,1.559879,0.07636,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
387,20EGY013000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY013000,2.0,Zemam Out,78F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG1300,6.0,353.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG1300,Sharkia,الشرقية,EG13,Egypt,مِصر,EG,2006-01-01,2017-04-21,,1.707252,0.068287,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
388,20EGY014000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY014000,2.0,Zemam Out,7AF3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG1400,6.0,354.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG1400,Kalyoubia,القليوبية,EG14,Egypt,مِصر,EG,2006-01-01,2017-04-21,,0.701487,0.011721,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
389,20EGY018000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY018000,2.0,Zemam Out,7CF3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG1800,6.0,355.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG1800,Behera,البحيرة,EG18,Egypt,مِصر,EG,2006-01-01,2017-04-21,,3.028037,0.323599,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
390,20EGY021000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY021000,2.0,Zemam Out,7EF3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG2100,6.0,356.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG2100,Giza,الجيزة,EG21,Egypt,مِصر,EG,2006-01-01,2017-04-21,,16.02557,3.234751,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
391,20EGY022000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY022000,2.0,Zemam Out,80F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG2200,6.0,357.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG2200,Beni Suef,بنى سويف,EG22,Egypt,مِصر,EG,2006-01-01,2017-04-21,,7.585411,0.833681,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
392,20EGY023000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY023000,2.0,Zemam Out,82F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG2300,6.0,358.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG2300,Fayoum,الفيوم,EG23,Egypt,مِصر,EG,2006-01-01,2017-04-21,,6.920109,0.328431,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
393,20EGY024000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY024000,2.0,Zemam Out,84F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG2400,6.0,359.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG2400,Menia,المنيا,EG24,Egypt,مِصر,EG,2006-01-01,2017-04-21,,12.318939,2.643485,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,
394,20EGY028000,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Zemam Out,F_Zemam Out,S_Zemam Out,20EGY028000,2.0,Zemam Out,86F3AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG2800,6.0,360.0,Polygon,,,,,,,,,Zemam Out,خارج الزمام,EG2800,Aswan,أسوان,EG28,Egypt,مِصر,EG,2006-01-01,2017-04-21,,40.565933,4.981986,Zemam Out,<Null>,<Null>,<Null>,<Null>,<Null>,,,,,


### Egypt: Examine table matched with v4 and OCHA codes

In [95]:
syria_matched = syria[(~syria.admin1Pcode.isna()) & (~syria.len_pcode_v1.isna())]
sy_1_match = syria_matched[(syria_matched.progres_level==1)].progres_locationlevel.unique().tolist()
sy_2_match = syria_matched[(syria_matched.progres_level==2)].progres_locationlevel.unique().tolist()
sy_3_match = syria_matched[(syria_matched.progres_level==3)].progres_locationlevel.unique().tolist()

In [96]:
# good level 3
syria_matched[(syria_matched.progres_level==3) & (syria_matched.admin3Name_en== syria_matched.progres_locationlevel)].shape

(2, 32)

In [97]:
# bad level 3
syria_matched[(syria_matched.progres_level==3) & (syria_matched.admin3Name_en!= syria_matched.progres_locationlevel)].shape

(354, 32)

In [98]:
# ahmed
# bad level 3 details
# v4 data seems to list locations such as "Abdita" as level 3, when they are actually at a deeper level, i.e. level 4 (and the corresponding pcode in OCHA has a different name)
syria_matched[(syria_matched.progres_level==3) & (syria_matched.admin3Name_en!= syria_matched.progres_locationlevel)][["ocha_pcode", 
                                                                                                                       "admin2Name_en",
                                                                                                                      "admin3Name_en",
                                                                                                                      "progres_locationlevel"]].head(7)

Unnamed: 0,ocha_pcode,admin2Name_en,admin3Name_en,progres_locationlevel
16,SY020305,Afrin,Sheikh El-Hadid,Upper Marwana
17,SY020305,Afrin,Sheikh El-Hadid,Khadraa
18,SY020305,Afrin,Sheikh El-Hadid,Ada
20,SY020306,Afrin,Ma'btali,Upper Maskeh
21,SY020306,Afrin,Ma'btali,Khaldiyet Afrin
22,SY020306,Afrin,Ma'btali,Afrin
79,SY020401,A'zaz,Aghtrin,Abla


In [99]:
# Case in point ^^^ 
admin[admin.progres_locationlevel=="Abdita"]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha
90881,21SYR007005001,67C3DB59-6D90-E111-B138-005056AC02A7,Syrian Arab Republic,A_Abdita,F_Abdita,S_Abdita,21SYR007005001,3.0,Abdita,DB03B87E-628F-EC11-811F-00155D5CC4A0,14,14,SY070501,8


In [100]:
# same as above ^^^
syria_ocha[syria_ocha.admin3Pcode=="SY070501"].head()

Unnamed: 0,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,progres_level,ocha_pcode,admin2RefName,admin1RefName
1,Syrian Arab Republic,الجمهورية العربية السورية,SY,Idleb,إدلب,SY07,Ariha,أريحا,SY0705,Ehsem,احسم,SY070501,Ehsem,2016-09-05,2016-09-05,,3,SY070501,,


In [101]:
# good level 2
# this is fine because location level 2 is not provided in english in the ocha list
syria_matched[(syria_matched.progres_level==2) & (syria_matched.admin2Name_en== syria_matched.progres_locationlevel)].shape

(62, 32)

In [102]:
# good level 2
# this is fine because location level 2 is not provided in english in the ocha list
syria_matched[(syria_matched.progres_level==2) & (syria_matched.admin2Name_en== syria_matched.progres_locationlevel)][["ocha_pcode", 
                                                                                                                       "admin2Name_en",
                                                                                                                      # "admin3Name_en",
                                                                                                                      "progres_locationlevel"]].head(7)

Unnamed: 0,ocha_pcode,admin2Name_en,progres_locationlevel
3855,SY0100,Damascus,Damascus
3856,SY0200,Jebel Saman,Jebel Saman
3857,SY0202,Al Bab,Al Bab
3858,SY0203,Afrin,Afrin
3859,SY0204,A'zaz,A'zaz
3860,SY0205,Menbij,Menbij
3861,SY0206,Ain Al Arab,Ain Al Arab


In [103]:
# bad level 2
# this is fine because location level 2 is not provided in english in the ocha list
syria_matched[(syria_matched.progres_level==2) & (syria_matched.admin2Name_en!= syria_matched.progres_locationlevel)].shape

(0, 32)

In [104]:
# bad level 2
syria_matched[(syria_matched.progres_level==2) & (syria_matched.admin2Name_en!= syria_matched.progres_locationlevel)][["ocha_pcode", "admin2Name_en", "progres_locationlevel"]]

Unnamed: 0,ocha_pcode,admin2Name_en,progres_locationlevel


In [105]:
# bad level 1
syria_matched[(syria_matched.progres_level==1) & (syria_matched.admin1Name_en!= syria_matched.progres_locationlevel)]

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,admin0Name_en,admin0Name_ar,admin0Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin2Name_en,admin2Name_ar,admin2Pcode,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName_en,LastUpdateDate,validOn,validTo,admin2RefName,admin1RefName


In [106]:
# bad level 1
syria_matched[(syria_matched.progres_level==1) & (syria_matched.admin1Name_en== syria_matched.progres_locationlevel)].shape

(14, 32)

In [107]:
syria.columns

Index(['progres_code', 'progres_countryid', 'progres_countryidname',
       'progres_description_1025', 'progres_description_1036',
       'progres_description_3082', 'progres_filtercode', 'progres_level',
       'progres_locationlevel', 'progres_locationlevelid', 'len_pcode_v1',
       'len_pcode_v2', 'ocha_pcode', 'len_pcode_ocha', 'admin0Name_en',
       'admin0Name_ar', 'admin0Pcode', 'admin1Name_en', 'admin1Name_ar',
       'admin1Pcode', 'admin2Name_en', 'admin2Name_ar', 'admin2Pcode',
       'admin3Name_en', 'admin3Name_ar', 'admin3Pcode', 'admin3RefName_en',
       'LastUpdateDate', 'validOn', 'validTo', 'admin2RefName',
       'admin1RefName'],
      dtype='object')

## Save egypt and syria data

In [108]:
SERVER = ip_of_server
DATABASE = 'QualityCheck_Egypt'
DRIVER = 'SQL Server'
USERNAME = uap.split(',')[0]
PASSWORD = uap.split(',')[1]
DATABASE_CONNECTION = f'mssql://{USERNAME}:{PASSWORD}@{SERVER}/{DATABASE}?driver={DRIVER}'

In [109]:
engine = sqlalchemy.create_engine(DATABASE_CONNECTION)
connection = engine.connect()

In [110]:
egypt= egypt.drop(columns=["validTo", "ValidTo"])
# syria= syria.drop(columns=["validTo", "ValidTo"])


In [111]:
egypt.head()

Unnamed: 0,progres_code,progres_countryid,progres_countryidname,progres_description_1025,progres_description_1036,progres_description_3082,progres_filtercode,progres_level,progres_locationlevel,progres_locationlevelid,len_pcode_v1,len_pcode_v2,ocha_pcode,len_pcode_ocha,OBJECTID *,Shape *,admin3Name_en,admin3Name_ar,admin3Pcode,admin3RefName,admin3AltName1_en,admin3AltName2_en,admin3AltName1_ar,admin3AltName2_ar,admin2Name_en,admin2Name_ar,admin2Pcode,admin1Name_en,admin1Name_ar,admin1Pcode,admin0Name_en,admin0Name_ar,admin0Pcode,date,validOn,Shape_Length,Shape_Area,admin2RefName,admin2AltName1_en,admin2AltName2_en,admin2AltName1_ar,admin2AltName2_ar,admin1RefName,admin1AltName1_en,admin1AltName2_en,admin1AltName1_ar,admin1AltName2_ar
0,20EGY011a01,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_El Sroo,F_El Sroo,S_El Sroo,20EGY011a01,2.0,El Sroo,9566FBCA-B8E7-EB11-810A-00155D5CC459,11.0,11.0,EG1101,6.0,132.0,Polygon,,,,,,,,,Dumyat 1,قسم أول دمياط,EG1101,Damietta,دمياط,EG11,Egypt,مِصر,EG,2006-01-01,2017-04-21,0.127185,0.000707,Dumyat 1,<Null>,<Null>,<Null>,<Null>,,,,,
1,20EGY011001,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Dumyat 1,F_Dumyat 1,S_Dumyat 1,20EGY011001,2.0,Dumyat 1,C0F1AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG1101,6.0,132.0,Polygon,,,,,,,,,Dumyat 1,قسم أول دمياط,EG1101,Damietta,دمياط,EG11,Egypt,مِصر,EG,2006-01-01,2017-04-21,0.127185,0.000707,Dumyat 1,<Null>,<Null>,<Null>,<Null>,,,,,
2,20EGY011a02,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Izbet Elbarj,F_Izbet Elbarj,S_Izbet Elbarj,20EGY011a02,2.0,Izbet Elbarj,9766FBCA-B8E7-EB11-810A-00155D5CC459,11.0,11.0,EG1102,6.0,131.0,Polygon,,,,,,,,,Dumyat,مركز دمياط,EG1102,Damietta,دمياط,EG11,Egypt,مِصر,EG,2006-01-01,2017-04-21,2.218709,0.018665,Dumyat,<Null>,<Null>,<Null>,<Null>,,,,,
3,20EGY011002,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Dumyat,F_Dumyat,S_Dumyat,20EGY011002,2.0,Dumyat,BEF1AC72-E114-E911-80DE-001DD8B71FE4,11.0,11.0,EG1102,6.0,131.0,Polygon,,,,,,,,,Dumyat,مركز دمياط,EG1102,Damietta,دمياط,EG11,Egypt,مِصر,EG,2006-01-01,2017-04-21,2.218709,0.018665,Dumyat,<Null>,<Null>,<Null>,<Null>,,,,,
4,20EGY011a03,CFC1DB59-6D90-E111-B138-005056AC02A7,Egypt,A_Kafr Batikh,F_Kafr Batikh,S_Kafr Batikh,20EGY011a03,2.0,Kafr Batikh,9866FBCA-B8E7-EB11-810A-00155D5CC459,11.0,11.0,EG1103,6.0,150.0,Polygon,,,,,,,,,Fariskur,مركز فارسكور,EG1103,Damietta,دمياط,EG11,Egypt,مِصر,EG,2006-01-01,2017-04-21,0.627702,0.01076,Fariskur,<Null>,<Null>,<Null>,<Null>,,,,,


In [112]:
rename_columns = {  "ocha_pcode" : "ocha_v4_pcode",
                    'admin3Name_en' : 'admin3_name_ocha', 
                    'admin3Name_ar' : 'admin3_name_ar_ocha',
                    'admin3Pcode' : 'admin3_pcode_ocha', 
                    'admin2Name_en' : 'admin2_name_ocha', 
                    'admin2Name_ar' : 'admin2_name_ar_ocha',
                    'admin2Pcode' : 'admin2_pcode_ocha',
                    'admin1Name_en' : 'admin1_name_ocha',
                    'admin1Name_ar' : 'admin1_name_ar_ocha',
                    'admin1Pcode' : 'admin1_pcode_ocha', 
                    'admin0Name_en' : 'admin0_name_ocha',
                    'admin0Name_ar' : 'admin0_name_ar_ocha',
                    'admin0Pcode' : 'admi0_pcode_ocha'
                  }
ocha_columns = list(rename_columns.values())
columns_to_keep = [ "progres_countryidname", "progres_locationlevel", "progres_level", "progres_code"] + ocha_columns

In [113]:
egypt_renamed = egypt.rename(columns = rename_columns)
egypt_renamed = egypt_renamed[columns_to_keep]

syria_renamed = syria.rename(columns = rename_columns)
syria_renamed = syria_renamed[columns_to_keep]

In [114]:
egypt_renamed.head()

Unnamed: 0,progres_countryidname,progres_locationlevel,progres_level,progres_code,ocha_v4_pcode,admin3_name_ocha,admin3_name_ar_ocha,admin3_pcode_ocha,admin2_name_ocha,admin2_name_ar_ocha,admin2_pcode_ocha,admin1_name_ocha,admin1_name_ar_ocha,admin1_pcode_ocha,admin0_name_ocha,admin0_name_ar_ocha,admi0_pcode_ocha
0,Egypt,El Sroo,2.0,20EGY011a01,EG1101,,,,Dumyat 1,قسم أول دمياط,EG1101,Damietta,دمياط,EG11,Egypt,مِصر,EG
1,Egypt,Dumyat 1,2.0,20EGY011001,EG1101,,,,Dumyat 1,قسم أول دمياط,EG1101,Damietta,دمياط,EG11,Egypt,مِصر,EG
2,Egypt,Izbet Elbarj,2.0,20EGY011a02,EG1102,,,,Dumyat,مركز دمياط,EG1102,Damietta,دمياط,EG11,Egypt,مِصر,EG
3,Egypt,Dumyat,2.0,20EGY011002,EG1102,,,,Dumyat,مركز دمياط,EG1102,Damietta,دمياط,EG11,Egypt,مِصر,EG
4,Egypt,Kafr Batikh,2.0,20EGY011a03,EG1103,,,,Fariskur,مركز فارسكور,EG1103,Damietta,دمياط,EG11,Egypt,مِصر,EG


In [115]:
syria_renamed.head()

Unnamed: 0,progres_countryidname,progres_locationlevel,progres_level,progres_code,ocha_v4_pcode,admin3_name_ocha,admin3_name_ar_ocha,admin3_pcode_ocha,admin2_name_ocha,admin2_name_ar_ocha,admin2_pcode_ocha,admin1_name_ocha,admin1_name_ar_ocha,admin1_pcode_ocha,admin0_name_ocha,admin0_name_ar_ocha,admi0_pcode_ocha
0,Syrian Arab Republic,Damascus,1.0,21SYR001,SY01,,,,,,,Damascus,دمشق,SY01,Syrian Arab Republic,الجمهورية العربية السورية,SY
1,Syrian Arab Republic,Aleppo,1.0,21SYR002,SY02,,,,,,,Aleppo,حلب,SY02,Syrian Arab Republic,الجمهورية العربية السورية,SY
2,Syrian Arab Republic,Rural Damascus,1.0,21SYR003,SY03,,,,,,,Rural Damascus,ريف دمشق,SY03,Syrian Arab Republic,الجمهورية العربية السورية,SY
3,Syrian Arab Republic,Homs,1.0,21SYR004,SY04,,,,,,,Homs,حمص,SY04,Syrian Arab Republic,الجمهورية العربية السورية,SY
4,Syrian Arab Republic,Hama,1.0,21SYR005,SY05,,,,,,,Hama,حماة,SY05,Syrian Arab Republic,الجمهورية العربية السورية,SY


In [116]:
# # write the DataFrame to a table in the sql database
# egypt_renamed.to_sql("Egypt_locations", 
#                 engine, 
#                 # schema="dbo",
#                 if_exists='replace', # or append
#                 index=False
#                )

In [117]:
# # write the DataFrame to a table in the sql database
# syria_renamed.to_sql("Syria_locations", 
#                 engine, 
#                 # schema="dbo",
#                 if_exists='replace', # or append
#                 index=False
#                )

In [157]:
# syria_renamed.to_csv("Syria_matched_locations.csv", encoding = 'utf-8')
# egypt_renamed.to_csv("Egypt_matched_locations.csv", encoding = 'utf-8')