# proGres v4 Data Quality Script - Egypt (Active and Hold - All Years)

# Section 1: Setup / Import data

## 1.1 Notebook setup

#### Import libraries needed

In [None]:
import numpy as np
import pandas as pd
import pickle
import datetime as datetime
import pyodbc
import urllib
import sqlalchemy
import requests
import time
# Notebook display options: no cap on displayed columns/rows, column width set to 0
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 0

#### Set Server ip, Date Extracted and ISO code for Country of Interest

In [None]:
# --- Run parameters: review before every run ---
ip_of_server = "**.***.**.**"

# Date on which the data was extracted from proGres v4.
# If the extraction happened today, use this instead:
# date_extracted = datetime.datetime.now()
# Otherwise state the extraction date explicitly:
date_extracted = pd.Timestamp("2022-09-14")

# Script version = timestamp of this run, stored as text
script_version = str(pd.Timestamp.now())

# ISO codes of the country of asylum / country of origin under review
coa_country_iso_code = "EGY"
coo_country_iso_code = "SYR"

## 1.2 Import All Modules / Tables

In [None]:
## Use commented lines to save username and password in text1 and text2 respectively if needed
# text1 = ''
# text2 = ''
# The context manager closes the file handle even if reading fails.
# Mode 'r+' is kept so the optional write below still works when un-commented.
with open('pwd.txt', 'r+') as saveFile:
    # saveFile.write(text1 + ',' + text2)
    # saveFile.seek(0)
    # Drop a trailing line break so it does not become part of the password
    uap = saveFile.read().rstrip('\r\n')

In [None]:
# Connect to server
driver = 'SQL Server'
ip = ip_of_server
# uap holds "username,password"; split only on the FIRST comma so that a
# password which itself contains a comma is not truncated.
_db_user, _db_password = uap.split(',', 1)
db_connection = pyodbc.connect(
                                driver=driver,
                                Server=ip,
                                Port="1433",
                                Database="QualityCheck_Egypt",
                                UID=_db_user,
                                PWD=_db_password
                                )

# check
# pyodbc.drivers()

# check
# type(db_connection)

#### Individual

In [None]:
# Load the individual table into dfi and show its (rows, columns)
dfi = pd.read_sql_query(
    sql="SELECT * FROM dbo.Filteredprogres_individual",
    con=db_connection,
)
dfi.shape

#### Registration Groups

In [None]:
# Load the registration group table into dfr and show its (rows, columns)
dfr = pd.read_sql_query(
    sql="SELECT * FROM dbo.Filteredprogres_registrationgroup",
    con=db_connection,
)
dfr.shape

#### Focal Points (for each Registration Group) 

In [None]:
# dfi_fp       : the rows of dfi whose relationship is "Focal Point"
# dfi_fp_small : dfi_fp reduced to the demographic columns of interest
# dfr_fp       : the registration group table left-joined with those focal
#                point characteristics (empty where no focal point is listed)

dfi_fp = dfi.loc[dfi["progres_relationshiptofpname"] == "Focal Point"].copy()

dfi_fp_small = dfi_fp.loc[:, [
    "progres_relationshiptofpname",
    "progres_registrationgroupid",
    "progres_individualid",
    # country of asylum and its location levels
    'progres_countryofasylumidname',
    'progres_coalocationlevel1name',
    'progres_coalocationlevel2name',
    'progres_coalocationlevel3name',
    # country of origin and its location levels
    'progres_countryoforiginidname',
    'progres_coolocationlevel1name',
    'progres_coolocationlevel2name',
    'progres_coolocationlevel3name',
    'progres_maritalstatusname',
    'progres_sex',
    'progres_primaryphonenumber',
    'progres_hasphoto',
]]

dfr_fp = dfr.merge(dfi_fp_small, how="left", on="progres_registrationgroupid")

#### Documents

In [None]:
# Load the documents table into dfd and show its (rows, columns)
dfd = pd.read_sql_query(
    sql="SELECT * FROM dbo.Filteredprogres_document",
    con=db_connection,
)
dfd.shape

#### Address

In [None]:
# Load the address table into dfa and show its (rows, columns)
dfa = pd.read_sql_query(
    sql="SELECT * FROM dbo.Filteredprogres_address",
    con=db_connection,
)
dfa.shape

#### Specific Needs

In [None]:
# Load the specific needs table into dfs and show its (rows, columns)
dfs = pd.read_sql_query(
    sql="SELECT * FROM dbo.Filteredprogres_specificneed",
    con=db_connection,
)
dfs.shape

#### Combined Location-Pcode Tables

In [None]:
# Combined location / pcode table for the country of asylum
coa_addresses = pd.read_sql_query(
    sql="SELECT * FROM dbo.Egypt_locations",
    con=db_connection,
)

In [None]:
# Combined location / pcode table for the country of origin
coo_addresses = pd.read_sql_query(
    sql="SELECT * FROM dbo.Syria_locations",
    con=db_connection,
)

In [None]:
# Matched addresses are the rows whose admin1_pcode_ocha is populated
# (a null admin1_pcode_ocha means the pcode exists in v4 but not in OCHA).
coa_matched = coa_addresses.loc[coa_addresses["admin1_pcode_ocha"].notna()]
# Optional second condition, currently disabled (relevant when the levels are checked too):
#   & coa_addresses["progres_locationlevel"].notna()   # null => pcode exists in OCHA but not in v4

coo_matched = coo_addresses.loc[coo_addresses["admin1_pcode_ocha"].notna()]
# Optional second condition, currently disabled (relevant when the levels are checked too):
#   & coo_addresses["len_pcode_v1"].notna()   # null => pcode exists in v4 but not in OCHA

In [None]:
# Lists of UNHCR location names and OCHA location names at each admin level
def _unique_names(matched, level, column):
    """Return the distinct values of `column` among rows with progres_level == level."""
    return matched.loc[matched["progres_level"] == level, column].unique().tolist()


# Country of asylum
coa_names_1 = _unique_names(coa_matched, 1, "progres_locationlevel")
coa_names_1_ocha = _unique_names(coa_matched, 1, "admin1_name_ocha")
coa_names_2 = _unique_names(coa_matched, 2, "progres_locationlevel")
coa_names_2_ocha = _unique_names(coa_matched, 2, "admin2_name_ocha")
coa_names_3 = _unique_names(coa_matched, 3, "progres_locationlevel")
coa_names_3_ocha = _unique_names(coa_matched, 3, "admin3_name_ocha")

# Country of origin
coo_names_1 = _unique_names(coo_matched, 1, "progres_locationlevel")
coo_names_1_ocha = _unique_names(coo_matched, 1, "admin1_name_ocha")
coo_names_2 = _unique_names(coo_matched, 2, "progres_locationlevel")
coo_names_2_ocha = _unique_names(coo_matched, 2, "admin2_name_ocha")
coo_names_3 = _unique_names(coo_matched, 3, "progres_locationlevel")
coo_names_3_ocha = _unique_names(coo_matched, 3, "admin3_name_ocha")

## 1.3 Clean up tables as needed (statuscode, business unit, name of progres_id field etc.)

#### Individual

In [None]:
# MENA business units to keep
list_mena_bu = [
    'Mauritania - Bassikounou',
    'Mauritania - Urban',
    'Tunisia - CO',
    'Egypt - CO',
    'Lebanon - North',
    'Lebanon - BML',
    'Lebanon - South',
    'Lebanon - Bekaa',
    'Algeria - CO',
    'Iraq - CO',
    'Israel - CO',
    'Jordan - Camps',
    'Jordan - Urban',
    'Kuwait - CO',
    'Morocco - CO',
    'RO Riyadh',
    'Syria - CO',
    'UAE - CO',
]

# Keep individuals that are
#   - in statuscode 1 or 125080000 (statuscodename Active / Hold respectively), and
#   - registered under one of the MENA business units
dfi = dfi.loc[
    dfi["statuscode"].isin([1, 125080000])
    & dfi["progres_businessunitname"].isin(list_mena_bu)
]

#### Registration Group

In [None]:
# Only keep registration groups that belong to a MENA business unit
dfr = dfr.loc[dfr["progres_businessunitname"].isin(list_mena_bu)]

#### Address

In [None]:
# Address records: retain statuscode 1 only
dfa = dfa.loc[dfa["statuscode"].eq(1)]

#### Specific Needs

In [None]:
# Align the key column name with the other tables: progres_individual -> progres_individualid
dfs = dfs.rename({"progres_individual": "progres_individualid"}, axis="columns")

In [None]:
# Specific needs: retain rows with status name 'Valid' and statuscode 1
dfs = dfs.loc[
    dfs["progres_specificneedstatusname"].eq('Valid')
    & dfs["statuscode"].eq(1)
]

## 1.4 Examine / Drop any duplicated entries

In [None]:
# dfs[~dfs.duplicated(keep="first") == True].shape

In [None]:
## Remove fully duplicated rows from the specific needs table (first occurrence kept)
## The same step for the other tables is currently disabled:

# dfi = dfi.drop_duplicates(keep="first")
# dfr = dfr.drop_duplicates(keep="first")
# dfd = dfd.drop_duplicates(keep="first")
# dfa = dfa.drop_duplicates(keep="first")
dfs = dfs.drop_duplicates(keep="first")

## 1.5 Keep only those document, address, and specific needs records that have a link to the individual table

In [None]:
# Take a wall-clock reading before the record-linking step
start = time.time()

In [None]:
# IDs of the individuals that survived the filters above
dfi_ids = dfi["progres_individualid"].tolist()

# Keep only document / address / specific-need rows that point at one of them
dfd = dfd.loc[dfd["progres_individualid"].isin(dfi_ids)]
dfa = dfa.loc[dfa["progres_individualid"].isin(dfi_ids)]
dfs = dfs.loc[dfs["progres_individualid"].isin(dfi_ids)]

In [None]:
# Print the seconds elapsed since `start` was taken
end = time.time()
print(end - start)

# Section 2: Record Inconsistencies

## 2.0 Specify Columns to include in the table of irregularities

In [None]:
# Key column carried by every irregularity found in the individual table
qc_cols = ["progres_individualid"]

# Key column carried by every irregularity found in the registration table
r_qc_cols = ["progres_registrationgroupid"]

# Individual-table fields that feed the PowerBi dashboard.
# They are appended once all inconsistencies have been extracted.
qc_cols_append = [
    # dates
    "progres_registrationdate",
    "progres_arrivaldate",
    "createdon",
    "modifiedon",
    # identifiers
    "progres_individualid",
    "progres_id",
    # office / countries / status
    "progres_businessunitname",
    "progres_countryoforiginidname",
    "progres_countryofasylumidname",
    "progres_refugeestatusname",
    # demographics
    "progres_dateofbirth",
    "progres_sexname",
    "progres_age",
    "progres_agecohortname",
    # audit
    "createdbyname",
    "modifiedbyname",
]

# Registration-group fields that feed the PowerBi dashboard.
# They are appended once all inconsistencies have been extracted.
r_qc_cols_append = [
    # dates
    'createdon',
    'modifiedon',
    'progres_registrationdate',
    # identifiers
    'progres_registrationgroupid',
    'progres_registrationgroupbusinessid',
    # office / countries
    'progres_businessunitname',
    "progres_countryofasylumidname",
    'progres_countryoforiginidname',
    # audit
    'createdbyname',
    'modifiedbyname',
]

## 2.1. Individual Inconsistencies

### 2.1.1 Fields with missing values in the individual table

In [None]:
# Individual-table fields screened for missing values.
# Order matters: it must line up with the descriptions in i_missing_txt.
# No longer screened: progres_refugeestatus, progres_hasphoto, progres_biometricstatus
i_missing = [
    "progres_relationshiptofp",
    "progres_age",
    "progres_familyname",
    "progres_givenname",
    "progres_maritalstatusname",
    "progres_registrationgroupid",
    "progres_sex",
    "progres_arrivaldate",
    "progres_registrationdate",
    "progres_registrationreason",
    "progres_fathersname",
    "progres_dateofbirth",
    "progres_countryoforiginidname",
    "progres_countryofasylumidname",
    "progres_nationalitylookupname",
    "progres_refugeestatusname",
    "progres_religionidname",
    "progres_ethnicityidname",
    "progres_placeofbirthidname",
    "progres_placeofbirthcity",
    "progres_fleddate",
]

In [None]:
# Human-readable description of each screened field, in the same order as i_missing.
# No longer screened: refugee status, the has-photo field, biometric status
i_missing_txt = [
    "Individual's Relationship to Focal Point",
    "Individual's Age",
    "Family Name of Individual",
    "Given Name of Individual",
    "Marital Status of Individual",
    "Registration Group ID of Individual",
    "Sex of Individual",
    "Arrival Date of Individual",
    "Registration Date of Individual",
    "Registration Reason of Individual",
    "Father's Name",
    "Date of Birth of Individual",
    "Country of Origin for Individual",
    "Country of Asylum for Individual",
    "Nationality of Individual",
    "Refugee Status of Individual",
    "Religion of Individual",
    "Ethnicity of Individual",
    "Place of Birth of Individual",
    "Place of Birth (City) of Individual",
    "Individual Fled Date",
]

In [None]:
# Zip the two parallel lists into a {field name: description} dictionary.
# zip() silently stops at the shorter list, so fail loudly if the lists have
# drifted apart instead of dropping fields or mislabelling them.
if len(i_missing) != len(i_missing_txt):
    raise ValueError(
        "i_missing has {} entries but i_missing_txt has {}".format(
            len(i_missing), len(i_missing_txt)
        )
    )
i_missing_dict = dict(zip(i_missing,
                          i_missing_txt))

In [None]:
# Display the field -> description mapping for a visual check
i_missing_dict

In [None]:
# For each field,
# 1) identify the individuals for whom values are missing AND
# 2) make note of the field in the "indicators_checked" table along with
#    other info about this indicator
#
# Per-field results are collected in lists and concatenated once at the end,
# which avoids re-copying the growing tables on every iteration.
_flagged_parts = []
_indicator_parts = []

for field in i_missing:
    irregularity = "{} is missing or '-'".format(i_missing_dict[field])

    # 1) individuals whose value is null or the '-' placeholder
    is_missing = dfi[field].isna() | (dfi[field] == '-')
    # select all of qc_cols directly so this keeps working if more key columns are added
    dftemp = dfi.loc[is_missing, qc_cols].assign(Irregularity=irregularity)
    _flagged_parts.append(dftemp)

    # 2) one-row description of the indicator (row label 1).
    #    NOTE: temp_indicators stays a module-level name on purpose; the check
    #    functions defined further down write into it.
    temp_indicators = pd.DataFrame(
        {
            "Irregularity": irregularity,
            "proGres_table": "individual",
            "Criteria": "{} is null or '-'".format(field),
            "Grouping": "missing information",
        },
        index=[1],
    )
    _indicator_parts.append(temp_indicators)

# Table of irregularities (one row per individual and irregularity)
df = pd.concat(_flagged_parts) if _flagged_parts else pd.DataFrame()
# Table describing every indicator that was screened
indicators_checked = pd.concat(_indicator_parts) if _indicator_parts else pd.DataFrame()

In [None]:
# Number of flagged records per irregularity
df.Irregularity.value_counts()

### 2.1.2. Other individual inconsistencies

### Individual Functions

In [None]:
'''
    Each of the functions below takes no arguments. It reads the relevant proGres
    table (a module-level pandas DataFrame such as dfi) needed to check for the given inconsistency
    and
    outputs:
    (1) dftemp (pandas DataFrame) that identifies individual ids (progres_individualid) associated with records that have the relevant irregularity (Irregularity)
    (2) temp_indicators, which lists the indicator name along with additional information about the indicator, such as:
    - proGres table: string which describes the proGres table(s) from which records were searched for the relevant irregularity
    - Criteria: string which describes the criteria applied to filter records for this irregularity
    - Grouping: The overall grouping/category name that would characterize the type of irregularity

    Also Note: [qc_cols] specified in section 2.0 lists columns from the relevant proGres table needed to construct the summary tables needed for the PowerBi dashboard
'''

def ind_wo_reg():
    """Flag individuals that are not linked to any registration group.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)`` - the flagged records (``qc_cols``
        plus an ``Irregularity`` column) and the indicator description.
    """
    name_of_irr = 'This individualID does not belong to any registration group'

    group_id = dfi["progres_registrationgroupid"]
    unassigned = group_id.isna() | (group_id == '-')
    dftemp = dfi.loc[unassigned, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_registrationgroupid is null or '-'",
        "Grouping": "missing information",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def missing_biometric():
    """Flag individuals aged 5 or older whose biometric status is null.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = 'Individual (age>=5) with missing biometricstatus'

    # Age threshold changed back to 5 after consultation with Reg&IM team.
    # A restriction to focal points only is currently disabled.
    no_biometric = dfi["progres_biometricstatus"].isna()
    old_enough = dfi["progres_age"] >= 5
    dftemp = dfi.loc[no_biometric & old_enough, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_biometricstatus is null for individual 5 years or older",
        "Grouping": "missing information",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators
    
def reg_date_future():
    """Flag individuals whose registration date is later than ``date_extracted``.

    Reads the module-level ``dfi``, ``qc_cols`` and ``date_extracted`` and writes
    this check's description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = 'The registration date of this individual is in the future'

    registered_later = dfi["progres_registrationdate"] > date_extracted
    dftemp = dfi.loc[registered_later, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_registrationdate is later than date extracted",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def arr_date_reg_date():
    """Flag individuals whose arrival date is later than their registration date.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = 'The date of arrival is later than the registration date'

    arrived_after_registration = dfi["progres_arrivaldate"] > dfi["progres_registrationdate"]
    dftemp = dfi.loc[arrived_after_registration, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_arrival date is later than progres_registrationdate",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators


def coo_coa_not_noc():
    """Flag individuals whose country of origin equals their country of asylum
    while their refugee status is neither "Not of concern" nor "Other of concern".

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Individual has COO = COA but refugee status is Refugee or Asylum Seeker or Stateless"

    # "Stateless (non-refugee)" is deliberately not part of the exempt statuses
    exempt_status = dfi["progres_refugeestatusname"].isin(["Not of concern", "Other of concern"])
    origin = dfi["progres_countryoforiginid"]
    same_country = origin == dfi["progres_countryofasylumid"]
    dftemp = dfi.loc[~exempt_status & same_country & origin.notna(), qc_cols]
    dftemp = dftemp.assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_countryoforiginid == progres_countryofasylumid but progres_refugeestatus is not NOC or OOC",
        "Grouping": "status issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def deceased_active_hold():
    """Flag individuals marked as deceased whose statuscode is 1 or 125080000.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = 'The individual is marked as deceased but is still active/on hold'

    is_deceased = dfi["progres_isdeceased"].eq(True)
    active_or_hold = dfi["statuscode"].isin([1, 125080000])
    dftemp = dfi.loc[is_deceased & active_or_hold, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_isdeceased is True but statuscode is Active or On Hold",
        "Grouping": "status issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators
    
def ind_in_multiple_rg():
    """Flag individual ids that appear under more than one registration group.

    The number of DISTINCT registration group ids per individual id is used,
    so a record that is merely duplicated within a single group is not
    reported as belonging to several groups. Null group ids are not counted.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)`` - every ``dfi`` row of the
        affected individuals (``qc_cols`` plus ``Irregularity``) and the
        indicator description.
    """
    name_of_irr = 'This individual id number belongs to more than one registration group'

    groups_per_individual = (
        dfi.groupby("progres_individualid")["progres_registrationgroupid"].nunique()
    )
    multi_group_ids = groups_per_individual.index[groups_per_individual > 1]

    dftemp = dfi.loc[dfi["progres_individualid"].isin(multi_group_ids), qc_cols]
    dftemp = dftemp.assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "groupby on progres_individualid lists multiple progres_registrationgroupid",
        "Grouping": "status issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def rel_fp_sex():
    """Flag individuals whose relationship to the focal point contradicts their sex.

    A record is flagged when ``progres_sexname`` is "Female" with a male
    relationship (e.g. Son) or "Male" with a female relationship (e.g. Wife).
    "Focal Point" is in neither list, so focal points are never flagged.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    female_rel = ['Aunt', 'Common Law Wife', 'Cousin (female)', 'Daughter',
                  'Ex-wife', 'Foster daughter', 'Granddaughter',
                  'Grandmother', 'Half-sister', 'In Law - Sister', 'In-Law (female)',
                  'In-Law - Daughter', 'In-Law - Mother', 'Mother', 'Niece',
                  'No blood relation (female)', 'Not specified/unknown (female)',
                  'Other blood relation (female)', 'Partner (Female)', 'Sister',
                  'Step-daughter', 'Step-mother', 'Step-sister', 'Wife']
    male_rel = ['Son', 'Brother', 'Common Law Husband', 'Cousin (male)', 'Father',
                'Foster father', 'Foster son', 'Grandfather', 'Grandson',
                'Half-brother', 'Husband', 'In Law - Brother', 'In-Law (male)',
                'In-Law - Father', 'In-Law - Son', 'Nephew',
                'No blood relation (male)', 'Other blood relation (male)',
                'Partner (Male)', 'Step-brother', 'Step-father', 'Step-son',
                'Uncle']
    name_of_irr = 'The Relationship to Focal Point does not match individual sex'

    relationship = dfi["progres_relationshiptofpname"]
    female_with_male_rel = (dfi["progres_sexname"] == "Female") & relationship.isin(male_rel)
    male_with_female_rel = (dfi["progres_sexname"] == "Male") & relationship.isin(female_rel)
    dftemp = dfi.loc[female_with_male_rel | male_with_female_rel, qc_cols]
    dftemp = dftemp.assign(Irregularity=name_of_irr)

    # Adjacent literals are concatenated by the parser, so no source
    # indentation leaks into the text shown to users.
    criteria = (
        "(progres_sexname == 'Female' and progres_relationshiptofpname is a male relative) OR "
        "(progres_sexname == 'Male' and progres_relationshiptofpname is a female relative)"
    )
    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": criteria,
        "Grouping": "relationship issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def fp_younger_children():
    """Flag focal points that are younger than a son/daughter in their group.

    Focal points are paired with the individuals of the same registration
    group whose relationship is "Son" or "Daughter". Records without a usable
    registration group id (null or '-') are left out of the pairing: pandas
    matches null join keys with each other, which would otherwise pair
    unrelated group-less records.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)`` - the flagged focal points
        (``qc_cols`` plus ``Irregularity``) and the indicator description.
    """
    name_of_irr = 'Focal point is younger than his/her son/daughter'

    group_id = dfi["progres_registrationgroupid"]
    grouped = dfi[group_id.notna() & (group_id != '-')]
    relationship = grouped["progres_relationshiptofpname"]

    fp_age = grouped.loc[relationship == "Focal Point",
                         ["progres_individualid", "progres_registrationgroupid", "progres_age"]]
    child_age = grouped.loc[relationship.isin(["Son", "Daughter"]),
                            ["progres_registrationgroupid", "progres_age"]]
    pairs = pd.merge(fp_age, child_age, how="inner",
                     on="progres_registrationgroupid", suffixes=("_fp", "_child"))
    fp_age_issue = pairs.loc[pairs["progres_age_fp"] < pairs["progres_age_child"],
                             "progres_individualid"].unique()

    dftemp = dfi.loc[dfi["progres_individualid"].isin(fp_age_issue), qc_cols]
    dftemp = dftemp.assign(Irregularity=name_of_irr)

    criteria = (
        "link individuals with relationship listed as Son or Daughter to their "
        "respective focal points and compare their ages. "
        "Flag as Irregularity if age of focal point < age of son or daughter"
    )
    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": criteria,
        "Grouping": "relationship issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def fp_older_parents():
    """Flag focal points that are older than a mother/father in their group.

    Focal points are paired with the individuals of the same registration
    group whose relationship is "Mother" or "Father". Records without a usable
    registration group id (null or '-') are left out of the pairing: pandas
    matches null join keys with each other, which would otherwise pair
    unrelated group-less records.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)`` - the flagged focal points
        (``qc_cols`` plus ``Irregularity``) and the indicator description.
    """
    name_of_irr = 'Focal point is older than his/her mother/father'

    group_id = dfi["progres_registrationgroupid"]
    grouped = dfi[group_id.notna() & (group_id != '-')]
    relationship = grouped["progres_relationshiptofpname"]

    fp_age = grouped.loc[relationship == "Focal Point",
                         ["progres_individualid", "progres_registrationgroupid", "progres_age"]]
    parent_age = grouped.loc[relationship.isin(["Mother", "Father"]),
                             ["progres_registrationgroupid", "progres_age"]]
    pairs = pd.merge(fp_age, parent_age, how="inner",
                     on="progres_registrationgroupid", suffixes=("_fp", "_parent"))
    fp_age_issue = pairs.loc[pairs["progres_age_fp"] > pairs["progres_age_parent"],
                             "progres_individualid"].unique()

    dftemp = dfi.loc[dfi["progres_individualid"].isin(fp_age_issue), qc_cols]
    dftemp = dftemp.assign(Irregularity=name_of_irr)

    criteria = (
        "link individuals with relationship listed as Mother or Father to their "
        "respective focal points and compare their ages. "
        "Flag as Irregularity if age of focal point > age of mother or father"
    )
    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": criteria,
        "Grouping": "relationship issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def fp_hw_not_married():
    """Flag husbands/wives of the focal point whose marital status is not
    "Married" or "Common Law Married" (a null marital status is flagged too).

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Husband or Wife of Focal Point with Marital Status other than Married or Common Law Married"

    is_spouse = dfi["progres_relationshiptofpname"].isin(["Husband", "Wife"])
    is_married = dfi["progres_maritalstatusname"].isin(["Married", "Common Law Married"])
    dftemp = dfi.loc[is_spouse & ~is_married, qc_cols].assign(Irregularity=name_of_irr)

    # Adjacent literals are concatenated by the parser, so no source
    # indentation leaks into the text shown to users.
    criteria = (
        "records where progres_relationshiptofpname is Husband or Wife, "
        "but individual record has a marital status "
        "other than Married or Common Law Married"
    )
    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": criteria,
        "Grouping": "relationship issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def active_noc():
    """Flag individuals with statuscodename "Active" and refugee status "Not of concern".

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Active Individual with Legal Status 'Not of Concern'"

    is_active = dfi["statuscodename"] == "Active"
    not_of_concern = dfi["progres_refugeestatusname"] == "Not of concern"
    dftemp = dfi.loc[is_active & not_of_concern, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "records where statuscode is Active but refugee status is Not of Concern ",
        "Grouping": "status issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def dates_new_birth():
    """Flag "New Birth" registrations whose birth, arrival and fled dates are not all equal.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Mismatched date of birth, arrival date, and fled date for Individuals with Registration Reason = 'New Birth'"

    dfi_nb = dfi[dfi["progres_registrationreasonname"] == "New Birth"]
    born = dfi_nb["progres_dateofbirth"]
    arrived = dfi_nb["progres_arrivaldate"]
    fled = dfi_nb["progres_fleddate"]
    # a record is flagged as soon as any one pair of the three dates differs
    any_pair_differs = born.ne(arrived) | arrived.ne(fled) | born.ne(fled)
    dftemp = dfi_nb.loc[any_pair_differs, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "For individuals with progres_registrationreasonname=='New Birth', date of birth, arrival date and fled date do not match",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def fled_arrival_date():
    """Flag individuals whose fled date is later than their arrival date.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Individual with Fled Date after Arrival Date"

    fled_after_arrival = dfi["progres_fleddate"] > dfi["progres_arrivaldate"]
    dftemp = dfi.loc[fled_after_arrival, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_fleddate is later than progres_arrivaldate",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def fled_reg_date():
    """Flag individuals whose fled date is later than their registration date.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Individual with Fled Date after Registration Date"

    fled_after_registration = dfi["progres_fleddate"] > dfi["progres_registrationdate"]
    dftemp = dfi.loc[fled_after_registration, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_fleddate is later than progres_registrationdate",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators
   

def dob_reg_date():
    """Flag individuals whose date of birth is later than their registration date.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = "Individual with Date of Birth after Registration Date"

    born_after_registration = dfi["progres_dateofbirth"] > dfi["progres_registrationdate"]
    dftemp = dfi.loc[born_after_registration, qc_cols].assign(Irregularity=name_of_irr)

    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": "progres_dateofbirth is later than progres_registrationdate",
        "Grouping": "date issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators
    

# # removed after consultations with Reg&IM
# def arrival_recent_reg():
#     dftemp = dfi.copy()
#     # year of arrival
#     dftemp["yoa"] = dftemp["progres_arrivaldate"].apply(lambda x: x.year) 
#     # year of birth
#     dftemp["yob"] = dftemp["progres_dateofbirth"].apply(lambda x: x.year)
#     # year of registration
#     dftemp["yor"] = dftemp["progres_registrationdate"].apply(lambda x: x.year)
#     dftemp = dftemp[(dftemp.yoa<2000)& # arrival before the year 2000
#                     (dftemp.yor>=2010)& # registration on or after the year 2010
#                     (dftemp.yoa != dftemp.yob)& # year of arrival not equal to year of birth
#                     (dftemp.progres_nationalitylookupname != dftemp.progres_countryofasylumidname) # nationality not same as COA
#                    ][qc_cols]
#     name_of_irr = "Individual arrival date is before 2000 but recently registered after 2010" 
#     dftemp["Irregularity"] = name_of_irr

#    temp_indicators.loc[1,"Irregularity"] = name_of_irr
#    temp_indicators.loc[1,"proGres_table"] = "individual"
#    temp_indicators.loc[1,"Criteria"] = "Year of Arrival before 2000 but registered after 2010 and year of arrival not equal to year of birth and nationality not same as COA"
#    temp_indicators.loc[1,"Grouping"] = "status issue"
    
#    return dftemp, temp_indicators


def coa_bu_mismatch():
    """Flag individuals whose country of asylum differs from the country of their business unit.

    The country is the part of ``progres_businessunitname`` before the first
    '-', with surrounding whitespace removed (e.g. "Egypt - CO" -> "Egypt").
    Business units "RO Riyadh", "Syria - ..." and "UAE - ..." are excluded
    from the comparison, as are records without a business unit name.

    Reads the module-level ``dfi`` and ``qc_cols`` and writes this check's
    description into row 1 of the module-level ``temp_indicators``.

    Returns:
        tuple: ``(dftemp, temp_indicators)``.
    """
    name_of_irr = 'Country of Asylum does not match country of business unit'

    # Country prefixes that are never compared with the country of asylum name
    excluded_units = ["RO Riyadh", "Syria", "UAE"]

    bu_country = dfi["progres_businessunitname"].str.split('-', n=1).str[0].str.strip()
    mismatch = (
        bu_country.notna()
        & (bu_country != dfi["progres_countryofasylumidname"])
        & ~bu_country.isin(excluded_units)
    )
    dftemp = dfi.loc[mismatch, qc_cols].assign(Irregularity=name_of_irr)

    criteria = (
        "country part of progres_businessunitname does not match "
        "progres_countryofasylumidname "
        "(business units RO Riyadh, Syria and UAE are excluded from the comparison)"
    )
    indicator_info = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual",
        "Criteria": criteria,
        "Grouping": "status issue",
    }
    for column, text in indicator_info.items():
        temp_indicators.loc[1, column] = text

    return dftemp, temp_indicators

def ind_age():
    '''
    Flag individuals whose progres_age disagrees with the age implied by progres_dateofbirth.

    The age is computed in whole calendar years as of date_extracted.
    Individuals whose birthday falls exactly on the extraction date are
    skipped. Rows with a missing age or date of birth are not flagged.

    Reads the module-level dfi, date_extracted, qc_cols and temp_indicators.

    Returns:
        dftemp: rows of dfi (columns qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    # 1 (convert string to timestamps)
    dob = pd.to_datetime(dfi['progres_dateofbirth'], format='%m%d%y')
    # 2 (correct for format %m%d%y so year=52 will be read as 1952 instead of 2052).
    # DateOffset moves exactly 100 calendar years; a 100-'Y' timedelta is an
    # average-length approximation that can shift the day of birth
    in_future = dob > date_extracted
    dob = dob.where(~in_future, dob - pd.DateOffset(years=100))
    # 3 age in completed years: year difference, minus one if this year's
    #   birthday has not been reached yet
    had_birthday = (dob.dt.month < date_extracted.month) | (
        (dob.dt.month == date_extracted.month) & (dob.dt.day <= date_extracted.day)
    )
    calculated_age = date_extracted.year - dob.dt.year - (~had_birthday).astype(int)

    # skip only people whose birthday is the extraction date itself
    # (month AND day both match), not everyone born in that month or on that day number
    birthday_today = (dob.dt.month == date_extracted.month) & (dob.dt.day == date_extracted.day)
    # a NaN gap (missing age or date of birth) compares False and is not flagged
    age_gap = (dfi['progres_age'] - calculated_age).abs()

    dftemp = dfi[(age_gap > 0) & ~birthday_today][qc_cols].copy()
    name_of_irr = "Individual with erroneous age"
    dftemp["Irregularity"] = name_of_irr
        
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual"
    temp_indicators.loc[1,"Criteria"] = "progres_age does not match age calculated based on progres_dateofbirth and date_extracted"
    temp_indicators.loc[1,"Grouping"] = "date issue"
    
    return dftemp, temp_indicators
    

def ind_photo_missing():
    '''
    Flag individuals whose progres_hasphoto is 0 or missing.

    Reads the module-level dfi, qc_cols and temp_indicators.

    Returns:
        dftemp: rows of dfi (columns qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    no_photo = dfi["progres_hasphoto"].isna() | (dfi["progres_hasphoto"] == 0)
    dftemp = dfi[no_photo][qc_cols].copy()
    name_of_irr = 'Individual does not have a photo' 
    dftemp["Irregularity"] = name_of_irr
        
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual"
    # the check covers every individual in dfi, not only focal points
    temp_indicators.loc[1,"Criteria"] = "progres_hasphoto is null or 0"
    temp_indicators.loc[1,"Grouping"] = "missing information"
    
    return dftemp, temp_indicators

In [None]:
# Individual-level irregularity checks, in the order they are run.
# NOTE(review): reg_date_future is defined again further down for registration
# groups; this list presumably relies on an individual-level definition made
# earlier in the notebook - confirm cells are always run top to bottom.
indicators_functions = [
    ind_wo_reg,
    missing_biometric,
    reg_date_future,
    arr_date_reg_date,
    coo_coa_not_noc,
    deceased_active_hold,
    ind_in_multiple_rg,
    rel_fp_sex,
    fp_younger_children,
    fp_older_parents,
    fp_hw_not_married,
    active_noc,
    dates_new_birth,
    fled_arrival_date,
    fled_reg_date,
    dob_reg_date,
    # arrival_recent_reg,
    coa_bu_mismatch,
    ind_age,
    ind_photo_missing,
]

# Run each check: flagged records are appended to df, the check's description to indicators_checked
for funcs in indicators_functions:
    func_returns = funcs()
    dftemp, temp_indicators = func_returns[0], func_returns[1]

    df = pd.concat([df, dftemp])
    indicators_checked = pd.concat([indicators_checked, temp_indicators])

### To find cases with this Irregularity

In [None]:
# Look up the full individual records flagged with one irregularity
irregularity_name = "Individual has COO = COA but refugee status is Refugee or Asylum Seeker"
irregularity_table = df
search_table = dfi
id_needed = "progres_individualid"

issues = search_table[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table.Irregularity == irregularity_name, id_needed].tolist()
    )
]
issues.head(1)

In [None]:
# Look up age and date of birth for the individuals flagged with an erroneous age
irregularity_name = "Individual with erroneous age"
irregularity_table = df
search_table = dfi
id_needed = "progres_individualid"

issues = search_table[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table.Irregularity == irregularity_name, id_needed].tolist()
    )
]
issues.loc[:, ["progres_age", "progres_dateofbirth"]].head(1)

In [None]:
# Look up registration/birth dates and status for individuals flagged with
# a date of birth later than their registration date
irregularity_name = "Individual with Date of Birth after Registration Date"
irregularity_table = df
search_table = dfi
id_needed = "progres_individualid"

issues = search_table[search_table[id_needed].isin(irregularity_table[irregularity_table.Irregularity == irregularity_name][id_needed].tolist())]
# column name corrected from the misspelled "progres_indvidualid", which raised a KeyError
issues[["progres_individualid", "progres_registrationdate", "progres_dateofbirth", "progres_refugeestatusname", "statuscodename"]].head(2)

## 2.2. Registration Inconsistencies

#### Create input tables to be used for registration group quality checks

In [None]:
# r_i_join_all is built BEFORE dfr is restricted to active / hold groups, so it
# still contains every registration group. It is meant for:
# closed_reg_w_active_ind AND
# active_rg_wo_active_ind

# registration group id + status, with the individual table's fields appended
# (the two statuscode columns become statuscode_x / statuscode_y)
r_i_join_all = dfr[["progres_registrationgroupid", "statuscode"]].merge(
    dfi,
    how="left",
    on=["progres_registrationgroupid"],
)

# from here on dfr only holds active / hold registration groups (used by all other dq checks)
dfr = dfr.loc[dfr["statuscode"].isin([1, 125080000])]

# same join as above, but only for active/hold registration groups
r_i_join = dfr[["progres_registrationgroupid", "statuscode"]].merge(
    dfi,
    how="left",
    on=["progres_registrationgroupid"],
)

### Registration functions

In [None]:
'''
    Each of the functions below takes as 
    input: pandas DataFrame (relevant table from proGres needed to check for given inconsistency)
    and 
    outputs: 
    (1) r_dftemp (pandas DataFrame) that identifies registration ids (progres_registrationgroupid) associated with records that have the relevant irregularity (Irregularity)
    (2) temp_indicators, which lists the indicator name along with additional information about the indicator, such as:
    - proGres table: string which describes the proGres table(s) from which records were searched for the relevant irregularity 
    - Criteria: string which describes the criteria applied to filter records for this irregularity
    - Grouping: The overall grouping/category name that would characterize the type of irregularity
    
    Also Note: [r_qc_cols] specified in section 1.1. lists columns from the relevant proGres table needed to construct the summary tables needed for the PowerBi dashboard
'''


def closed_reg_w_active_ind():
    '''
    Flag closed / inactive / erroneous registration groups that still have an
    active or on-hold individual.

    The flagged rows are taken from r_i_join_all rather than dfr: dfr is
    restricted to active / hold groups before the checks run, so selecting
    closed groups from it can never return anything. One row is kept per
    registration group; the r_qc_cols values therefore come from the first
    matching individual row of the join.

    Reads the module-level r_i_join_all, r_qc_cols and temp_indicators.

    Returns:
        r_dftemp: flagged groups (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    # closed, inactive, erroneous
    group_not_open = r_i_join_all.statuscode_x.isin([125080001, 125080002, 2])
    individual_active_or_hold = r_i_join_all.statuscode_y.isin([1, 125080000])

    r_dftemp = (
        r_i_join_all[group_not_open & individual_active_or_hold][r_qc_cols]
        .drop_duplicates(subset="progres_registrationgroupid")
        .copy()
    )
    name_of_irr = "Closed or Inactivated registration with at least one individual active or on hold"
    r_dftemp["Irregularity"] = name_of_irr
    
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "registration table merged with individual table on progres_registrationgroupid"
    temp_indicators.loc[1,"Criteria"] = "join registration table with individual table on registrationgroupid; \
                          flag as irregularity if statuscode_x is closed, inactive, or erroeneous but statuscode_y is active"
    temp_indicators.loc[1,"Grouping"] = "status issue"
    
    return r_dftemp, temp_indicators
    

def active_rg_wo_active_ind():
    '''
    Flag active / hold registration groups for which the join with the
    individual table found no individual (progres_individualid is null).

    Reads the module-level r_i_join, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    no_individual = r_i_join["progres_individualid"].isna()
    group_active_or_hold = r_i_join["statuscode_x"].isin([1, 125080000])  # recently added
    r_dftemp = r_i_join[no_individual & group_active_or_hold][r_qc_cols]
    name_of_irr = "Active Registration Group with No Active Individual"
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged with individual table on progres_registrationgroupid",
        "Criteria": "join registration table with individual table on registrationgroupid; \
                            flag as irregularity if statuscode_x is active but progres_individualid is null",
        "Grouping": "status issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def rg_size_mismatch():
    '''
    Flag registration groups whose progres_size differs from the number of
    individual ids counted for the group in dfi.

    Groups with no individual in dfi drop out of the inner join and are not
    evaluated here.

    Reads the module-level dfi, dfr, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    counts = dfi.groupby(["progres_registrationgroupid"])["progres_individualid"].count()
    reg_size = counts.reset_index().sort_values(by="progres_individualid", ascending=False)
    reg_size = reg_size.rename(columns={"progres_individualid": "rg_size"})

    dfr_size = dfr.merge(reg_size, how="inner", on="progres_registrationgroupid")
    size_differs = dfr_size["progres_size"] != dfr_size["rg_size"]
    r_dftemp = dfr_size[size_differs][r_qc_cols]
    name_of_irr = "Size of registration group not equal to no. of active individuals in the group"
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged with individual table on progres_registrationgroupid",
        "Criteria": "progres_size on registration table not equal to number of active individuals linked to the registration group",
        "Grouping": "status issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def reg_date_future():
    '''
    Flag registration groups whose progres_registrationdate is later than date_extracted.

    Reads the module-level dfr, date_extracted, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    registered_in_future = dfr["progres_registrationdate"] > date_extracted
    r_dftemp = dfr[registered_in_future][r_qc_cols]
    name_of_irr = "RegistrationGroup with registration date in the future"
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration",
        "Criteria": "progres_registrationdate later than date extracted",
        "Grouping": "date issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators
   

def rg_wo_active_fp():
    '''
    Flag rows of dfr_fp that have no focal point individual attached
    (progres_individualid is null).

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    focal_point_absent = dfr_fp["progres_individualid"].isna()
    r_dftemp = dfr_fp[focal_point_absent][r_qc_cols]
    name_of_irr = "Active Registration Group without active focal point"
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged on progres_registrationgroupid with individual records of focal points only",
        "Criteria": "progres_individualid for individual listed as focal point did not have a match in the registration table",
        "Grouping": "status issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators
    
def rg_w_more_fp():
    '''
    Flag registration groups that have more than one focal point row in dfr_fp.

    Focal point ids are counted per registration group; groups with a count
    above 1 are then looked up in dfr.

    Reads the module-level dfr_fp, dfr, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    fp_counts = dfr_fp.groupby(["progres_registrationgroupid"])["progres_individualid"].count().reset_index()
    fp_counts = fp_counts.sort_values(by="progres_individualid", ascending=False)
    several_fp = fp_counts[fp_counts["progres_individualid"] > 1]
    rgid_with_multiple_fp = several_fp["progres_registrationgroupid"].tolist()

    r_dftemp = dfr[dfr["progres_registrationgroupid"].isin(rgid_with_multiple_fp)][r_qc_cols]
    name_of_irr = 'Registration group has more than one focal point'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "count number of individual records with relationshiptofpname ==  'Focal Point' linked to a registration group\
                            flag as irregularity if count is more than 1",
        "Grouping": "status issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_missing_phonenumber():
    '''
    Flag focal points whose progres_primaryphonenumber is null or '-'.

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    has_focal_point = dfr_fp["progres_individualid"].notna()
    phone = dfr_fp["progres_primaryphonenumber"]
    phone_missing = phone.isna() | (phone == '-')

    r_dftemp = dfr_fp[has_focal_point & phone_missing][r_qc_cols]
    name_of_irr = 'Focal Point has no phone number'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "progres_primaryphonenumber for focal point is null or '-'",
        "Grouping": "missing information",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_coo_1_missing():
    '''
    Flag registration groups whose focal point has a null or '-' country of
    origin level 1 name.

    Reads the module-level dfr_fp, dfr, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    has_focal_point = dfr_fp["progres_individualid"].notna()
    level1 = dfr_fp["progres_coolocationlevel1name"]
    level1_missing = level1.isna() | (level1 == "-")
    rids = dfr_fp[has_focal_point & level1_missing]["progres_registrationgroupid"].tolist()

    r_dftemp = dfr[dfr["progres_registrationgroupid"].isin(rids)][r_qc_cols]
    name_of_irr = 'The Country of Origin Level 1 is missing for focal point'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "progres_coolocationlevel1name is null or '-'",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_coa_1_missing():
    '''
    Flag registration groups whose focal point has a null or '-' country of
    asylum level 1 name.

    Reads the module-level dfr_fp, dfr, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    has_focal_point = dfr_fp["progres_individualid"].notna()
    level1 = dfr_fp["progres_coalocationlevel1name"]
    level1_missing = level1.isna() | (level1 == "-")
    rids = dfr_fp[has_focal_point & level1_missing]["progres_registrationgroupid"].tolist()

    r_dftemp = dfr[dfr["progres_registrationgroupid"].isin(rids)][r_qc_cols]
    name_of_irr = 'The Country of Asylum Level 1 is missing for focal point'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "progres_coalocationlevel1name is null or '-'",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_coo_not_official():
    '''
    Flag focal points from the Syrian Arab Republic whose COO location names
    are not all found in the official lists.

    A name is accepted if it appears in either the proGres v4 location list
    (coo_names_N) or the OCHA list (coo_names_N_ocha) for its level. Rows with
    a null or '-' level 1 name are not evaluated. A row is flagged when any of
    levels 1, 2 or 3 is not in its accepted list.

    Reads the module-level dfr_fp, the coo_names_* lists, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    official_1 = list(set(coo_names_1 + coo_names_1_ocha))
    official_2 = list(set(coo_names_2 + coo_names_2_ocha))
    official_3 = list(set(coo_names_3 + coo_names_3_ocha))

    has_focal_point = dfr_fp["progres_individualid"].notna()
    from_syria = dfr_fp["progres_countryoforiginidname"] == "Syrian Arab Republic"
    level1_given = dfr_fp["progres_coolocationlevel1name"].notna() & (dfr_fp["progres_coolocationlevel1name"] != '-')
    all_levels_official = (
        dfr_fp["progres_coolocationlevel1name"].isin(official_1)
        & dfr_fp["progres_coolocationlevel2name"].isin(official_2)
        & dfr_fp["progres_coolocationlevel3name"].isin(official_3)
    )

    r_dftemp = dfr_fp[has_focal_point & from_syria & level1_given & ~all_levels_official][r_qc_cols]
    name_of_irr = 'Focal Point COO address does not match official list (COO=Syria only)'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "progres_coolocationlevelname does not match official admin names at levels 1 2 or 3",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_coa_not_official():
    '''
    Flag focal points whose COA location names are not all found in the
    official lists.

    A name is accepted if it appears in either the proGres v4 location list
    (coa_names_N) or the OCHA list (coa_names_N_ocha) for its level. Rows with
    a null or '-' level 1 name are not evaluated. A row is flagged when any of
    levels 1, 2 or 3 is not in its accepted list.

    Reads the module-level dfr_fp, the coa_names_* lists, r_qc_cols and temp_indicators.
    Returns (r_dftemp, temp_indicators).
    '''
    official_1 = list(set(coa_names_1 + coa_names_1_ocha))
    official_2 = list(set(coa_names_2 + coa_names_2_ocha))
    official_3 = list(set(coa_names_3 + coa_names_3_ocha))

    has_focal_point = dfr_fp["progres_individualid"].notna()
    level1_given = dfr_fp["progres_coalocationlevel1name"].notna() & (dfr_fp["progres_coalocationlevel1name"] != '-')
    all_levels_official = (
        dfr_fp["progres_coalocationlevel1name"].isin(official_1)
        & dfr_fp["progres_coalocationlevel2name"].isin(official_2)
        & dfr_fp["progres_coalocationlevel3name"].isin(official_3)
    )

    r_dftemp = dfr_fp[has_focal_point & level1_given & ~all_levels_official][r_qc_cols]
    name_of_irr = 'Focal Point COA address does not match official list'
    r_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "registration table merged, on progres_registrationgroupid, with individual records of focal points only",
        "Criteria": "progres_coalocationlevelname does not match official admin names at levels 1 2 or 3",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return r_dftemp, temp_indicators

def fp_coo_2_3():
    '''
    Flag focal points whose COO level 2 name is missing while a level 3 name exists.

    Null and the '-' placeholder both count as missing, at both levels: a
    level 3 value of '-' is not treated as an existing name.

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.

    Returns:
        r_dftemp: flagged rows of dfr_fp (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    has_focal_point = dfr_fp.progres_individualid.notna()
    level2_missing = dfr_fp.progres_coolocationlevel2name.isna() | (dfr_fp.progres_coolocationlevel2name == '-')
    level3_present = dfr_fp.progres_coolocationlevel3name.notna() & (dfr_fp.progres_coolocationlevel3name != '-')

    r_dftemp = dfr_fp[has_focal_point & level2_missing & level3_present][r_qc_cols].copy()
    name_of_irr = 'COO Level 2 missing while Level 3 exists for focal point'
    r_dftemp["Irregularity"] = name_of_irr
    
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "registration table merged, on progres_registrationgroupid, with individual records of focal points only"
    temp_indicators.loc[1,"Criteria"] = "progres_coolocationlevel2name is missing but progres_coolocationlevel3name exists"
    temp_indicators.loc[1,"Grouping"] = "address issue"
    
    return r_dftemp, temp_indicators

def fp_coo_1_2():
    '''
    Flag focal points whose COO level 1 name is missing while a level 2 name exists.

    Null and the '-' placeholder both count as missing, at both levels: a
    level 2 value of '-' is not treated as an existing name.

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.

    Returns:
        r_dftemp: flagged rows of dfr_fp (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    has_focal_point = dfr_fp.progres_individualid.notna()
    level1_missing = dfr_fp.progres_coolocationlevel1name.isna() | (dfr_fp.progres_coolocationlevel1name == '-')
    level2_present = dfr_fp.progres_coolocationlevel2name.notna() & (dfr_fp.progres_coolocationlevel2name != '-')

    r_dftemp = dfr_fp[has_focal_point & level1_missing & level2_present][r_qc_cols].copy()
    name_of_irr = 'COO Level 1 missing while Level 2 exists for focal point'
    r_dftemp["Irregularity"] = name_of_irr
    
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "registration table merged, on progres_registrationgroupid, with individual records of focal points only"
    temp_indicators.loc[1,"Criteria"] = "progres_coolocationlevel1name is missing but progres_coolocationlevel2name exists"
    temp_indicators.loc[1,"Grouping"] = "address issue"
    
    return r_dftemp, temp_indicators

def fp_coa_2_3():
    '''
    Flag focal points whose COA level 2 name is missing while a level 3 name exists.

    Null and the '-' placeholder both count as missing, at both levels: a
    level 3 value of '-' is not treated as an existing name.

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.

    Returns:
        r_dftemp: flagged rows of dfr_fp (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    has_focal_point = dfr_fp.progres_individualid.notna()
    level2_missing = dfr_fp.progres_coalocationlevel2name.isna() | (dfr_fp.progres_coalocationlevel2name == '-')
    level3_present = dfr_fp.progres_coalocationlevel3name.notna() & (dfr_fp.progres_coalocationlevel3name != '-')

    r_dftemp = dfr_fp[has_focal_point & level2_missing & level3_present][r_qc_cols].copy()
    name_of_irr = 'COA Level 2 missing while Level 3 exists for focal point'
    r_dftemp["Irregularity"] = name_of_irr
    
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "registration table merged, on progres_registrationgroupid, with individual records of focal points only"
    temp_indicators.loc[1,"Criteria"] = "progres_coalocationlevel2name is missing but progres_coalocationlevel3name exists"
    temp_indicators.loc[1,"Grouping"] = "address issue"
    
    return r_dftemp, temp_indicators

def fp_coa_1_2():
    '''
    Flag focal points whose COA level 1 name is missing while a level 2 name exists.

    Null and the '-' placeholder both count as missing, at both levels: a
    level 2 value of '-' is not treated as an existing name.

    Reads the module-level dfr_fp, r_qc_cols and temp_indicators.

    Returns:
        r_dftemp: flagged rows of dfr_fp (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    has_focal_point = dfr_fp.progres_individualid.notna()
    level1_missing = dfr_fp.progres_coalocationlevel1name.isna() | (dfr_fp.progres_coalocationlevel1name == '-')
    level2_present = dfr_fp.progres_coalocationlevel2name.notna() & (dfr_fp.progres_coalocationlevel2name != '-')

    r_dftemp = dfr_fp[has_focal_point & level1_missing & level2_present][r_qc_cols].copy()
    name_of_irr = 'COA Level 1 missing while Level 2 exists for focal point'
    r_dftemp["Irregularity"] = name_of_irr

    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "registration table merged, on progres_registrationgroupid, with individual records of focal points only"
    temp_indicators.loc[1,"Criteria"] = "progres_coalocationlevel1name is missing but progres_coalocationlevel2name exists"
    temp_indicators.loc[1,"Grouping"] = "address issue"
    
    return r_dftemp, temp_indicators

In [None]:
# Registration-group irregularity checks, in the order they are run:

reg_indicators_functions = [
    closed_reg_w_active_ind,
    active_rg_wo_active_ind,
    rg_size_mismatch,
    reg_date_future,
    rg_wo_active_fp,
    rg_w_more_fp,
    fp_missing_phonenumber,
    fp_coo_1_missing,
    fp_coa_1_missing,
    fp_coo_not_official,
    fp_coa_not_official,
    fp_coo_2_3,
    fp_coo_1_2,
    fp_coa_2_3,
    fp_coa_1_2,
]
# Run each check: flagged records are appended to rdf, the check's description to indicators_checked
rdf = pd.DataFrame()
for funcs in reg_indicators_functions:
    func_returns = funcs()
    r_dftemp, temp_indicators = func_returns[0], func_returns[1]

    rdf = pd.concat([rdf, r_dftemp])
    indicators_checked = pd.concat([indicators_checked, temp_indicators])

In [None]:
# Number of flagged registration records per irregularity
rdf.Irregularity.value_counts()

### To find cases with this Irregularity

In [None]:
# Look up the registration groups flagged as having no active individual
irregularity_name = "Active Registration Group with No Active Individual"
irregularity_table = rdf # df
search_table = dfr # dfi
id_needed = "progres_registrationgroupid" # "progres_individualid"

issues = search_table[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table.Irregularity == irregularity_name, id_needed].tolist()
    )
]
issues.head(1)

In [None]:
# Look up the registration groups flagged as having more than one focal point
irregularity_name = "Registration group has more than one focal point"
irregularity_table = rdf # df
search_table = dfr # dfi
id_needed = "progres_registrationgroupid" # "progres_individualid"

issues = search_table[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table.Irregularity == irregularity_name, id_needed].tolist()
    )
]
issues.head(1)

In [None]:
# Shape of rdf once repeated rows are removed (first occurrence of each row is kept)
rdf[~rdf.duplicated(keep="first") == True].shape

In [None]:
# Shape of rdf including repeated rows, for comparison with the de-duplicated shape
rdf.shape

In [None]:
# Shape of the rows of rdf that occur more than once (every occurrence is counted)
rdf[rdf.duplicated(keep=False) == True].shape

## 2.3. Address Inconsistencies

In [None]:
# Focal point records with their address records attached (focal points without an address are kept)
fp_address = dfi_fp.merge(dfa, how="left", on="progres_individualid")

In [None]:
# Shape of the focal point table before the address join
dfi_fp.shape

In [None]:
# Shape after the left join with the address table
fp_address.shape

In [None]:
'''
    Each of the functions below takes as 
    input: pandas DataFrame (relevant table from proGres needed to check for given inconsistency)
    and 
    outputs: 
    (1) a_dftemp (pandas DataFrame) that identifies registration ids (progres_registrationgroupid) associated with records that have the relevant irregularity (Irregularity)
    (2) temp_indicators, which lists the indicator name along with additional information about the indicator, such as:
    - proGres table: string which describes the proGres table(s) from which records were searched for the relevant irregularity 
    - Criteria: string which describes the criteria applied to filter records for this irregularity
    - Grouping: The overall grouping/category name that would characterize the type of irregularity
    
    Also Note: [r_qc_cols] specified in section 1.1. lists columns from the relevant proGres table needed to construct the summary tables needed for the PowerBi dashboard
'''


def fp_coa_match_w_current():
    '''
    Flag focal points whose "Country or territory of asylum - current" address
    record has a country different from progres_countryofasylumidname.

    Reads the module-level fp_address, r_qc_cols and temp_indicators.
    Returns (a_dftemp, temp_indicators).
    '''
    is_current_coa_address = fp_address["progres_addresstypename"] == "Country or territory of asylum - current"
    country_differs = fp_address["progres_countryidname"] != fp_address["progres_countryofasylumidname"]

    a_dftemp = fp_address[is_current_coa_address & country_differs][r_qc_cols]
    name_of_irr =  "COA for focal point does not match current COA in address record"
    a_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual records of focal points merged with address table on progres_individualid",
        "Criteria": "COA name mismatch between individual table and address table",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return a_dftemp, temp_indicators

def fp_coa_1_match_current():
    '''
    Flag focal points whose "Country or territory of asylum - current" address
    record has a level 1 location different from progres_coalocationlevel1name.

    Address records whose level 1 location is null or '-' are not evaluated.

    Reads the module-level fp_address, r_qc_cols and temp_indicators.
    Returns (a_dftemp, temp_indicators).
    '''
    address_level1 = fp_address["progres_locationlevel1idname"]
    is_current_coa_address = fp_address["progres_addresstypename"] == "Country or territory of asylum - current"
    level1_differs = fp_address["progres_coalocationlevel1name"] != address_level1
    address_level1_given = address_level1.notna() & (address_level1 != '-')

    a_dftemp = fp_address[is_current_coa_address & level1_differs & address_level1_given][r_qc_cols]
    name_of_irr =  "COA Level 1 for focal point does not match current COA Level 1 in address table"
    a_dftemp["Irregularity"] = name_of_irr

    indicator_row = {
        "Irregularity": name_of_irr,
        "proGres_table": "individual records of focal points merged with address table on progres_individualid",
        "Criteria": "COA progres_locationlevel1idname mismatch between individual table and address table",
        "Grouping": "address issue",
    }
    for column, value in indicator_row.items():
        temp_indicators.loc[1, column] = value

    return a_dftemp, temp_indicators

def fp_current_cor():
    '''
    Flag focal points that have an address record of type "Country of Residence - Current".

    Reads the module-level fp_address, r_qc_cols and temp_indicators.

    Returns:
        a_dftemp: flagged rows of fp_address (columns r_qc_cols) plus an "Irregularity" column
        temp_indicators: the shared indicator table, row 1 describing this check
    '''
    is_current_cor = fp_address.progres_addresstypename == "Country of Residence - Current"
    a_dftemp = fp_address[is_current_cor][r_qc_cols].copy()
    name_of_irr =  "Focal Point with address type Country of Residence - Current"
    a_dftemp["Irregularity"] = name_of_irr
    
    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual records of focal points merged with address table on progres_individualid"
    # describes the filter actually applied (the text used to be a copy of the COA level 1 check's criteria)
    temp_indicators.loc[1,"Criteria"] = "progres_addresstypename is 'Country of Residence - Current'"
    temp_indicators.loc[1,"Grouping"] = "address issue"
    
    return a_dftemp, temp_indicators

In [None]:
# Address irregularity checks, in the order they are run (commented entries are disabled):
add_indicators_functions = [
    fp_coa_match_w_current,
    # fp_coo_match_w_recent,
    # fp_coa_1_official,
    fp_coa_1_match_current,
    # fp_coo_1_match_recent,
    fp_current_cor,
]
# Run each check: flagged records are appended to adf, the check's description to indicators_checked
adf = pd.DataFrame()
for funcs in add_indicators_functions:
    func_returns = funcs()
    a_dftemp, temp_indicators = func_returns[0], func_returns[1]

    adf = pd.concat([adf, a_dftemp])
    indicators_checked = pd.concat([indicators_checked, temp_indicators])

In [None]:
# Number of flagged address records per irregularity
adf.Irregularity.value_counts()

In [None]:
# Shape of adf once repeated rows are removed (first occurrence of each row is kept)
adf[~adf.duplicated(keep="first") == True].shape

In [None]:
# Shape of adf including repeated rows, for comparison with the de-duplicated shape
adf.shape

In [None]:
# Shape of the rows of adf that occur more than once (every occurrence is counted)
adf[adf.duplicated(keep=False) == True].shape

### To find cases with this Irregularity

In [None]:
# Look up the registration groups flagged for a COA mismatch with the address record.
# The name must match what fp_coa_match_w_current writes; the COO variant of this
# check is disabled in add_indicators_functions, so searching for the COO wording
# always returned an empty table.
irregularity_name = "COA for focal point does not match current COA in address record"
irregularity_table = adf # df
search_table = dfr # dfi
id_needed = "progres_registrationgroupid" # "progres_individualid"

issues = search_table[search_table[id_needed].isin(irregularity_table[irregularity_table.Irregularity == irregularity_name][id_needed].tolist())]
issues.head()

In [None]:
# Look up the registration groups whose focal point has a "Country of Residence - Current" address
irregularity_name = "Focal Point with address type Country of Residence - Current"
irregularity_table = adf # df
search_table = dfr # dfi
id_needed = "progres_registrationgroupid" # "progres_individualid"

issues = search_table[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table.Irregularity == irregularity_name, id_needed].tolist()
    )
]
issues.head()

In [None]:
# Spot check: individual record(s) for one hard-coded individual id
dfi[dfi.progres_individualid == "AC7E814A-BB9A-EC11-811F-00155D5CC4A0"]

In [None]:
# Spot check: address record(s) for the same hard-coded individual id
dfa[dfa.progres_individualid == "AC7E814A-BB9A-EC11-811F-00155D5CC4A0"]

## 2.4. Document Inconsistencies

In [None]:
# Document types accepted as an ID / birth document.
# Some operations use vaccination card / book or birth book as form of ID, but these were not found in the list of documents
list_of_id_docs = [
    'Birth notification (no event)',
    'Birth certificate (no event)',
    'National passport (no event)',
    'National identity card (no event)',
    'Family book',
    # Currently not counted as ID documents:
    # 'Identity document for asylum-seeker (issued by UNHCR)',
    # 'Refugee identity document (no event)',
    # 'Identity document for refugee (issued by UNHCR)',
    # 'Other identity document (no event)',
    # 'ID Card',
    # 'Refugee certificate (no event)',
    # 'Identity document for asylum-seeker (issued by government)',
    # 'UNHCR ID card (no event)',
    # 'Identity document for refugee (issued by government)',
    # 'Camp identity card (no event)',
    # 'Identity document for stateless person (issued by government)'
]

In [None]:
# Document types that individuals are checked for
documents_of_interest = [
    "Asylum seeker certificate (no event)",
    "Proof of Registration",
    "Refugee certificate (no event)",
    # "Refugee certificate issuance event",
    "Identity document for asylum-seeker (issued by UNHCR)",
    "Identity document for refugee (issued by UNHCR)",
    "UNHCR ID card (no event)",
]

In [None]:
# Keep only the documents of interest whose status is "Valid"
dfd_interest = dfd.loc[
    dfd["progres_documenttypeidname"].isin(documents_of_interest)
    & (dfd["progres_documentstatusname"] == "Valid")
]

# Number of kept documents per document type
dfd_interest["progres_documenttypeidname"].value_counts(dropna=False)

In [None]:
# Preview the first kept document record
dfd_interest.head(1)

In [None]:
# Number of valid documents per individual and document type, largest counts first
ind_with_multiple_unique_documents = (
    dfd_interest
    .groupby(["progres_individualid", "progres_documenttypeidname"])["progres_documentstatus"]
    .count()
    .reset_index()
    .sort_values(by="progres_documentstatus", ascending=False)
)
ind_with_multiple_unique_documents.head()

In [None]:
# Spot check: kept document records for one hard-coded individual id
dfd_interest[dfd_interest.progres_individualid == "4BC58859-F6E8-EB11-812E-001DD8B71FE3"]

In [None]:
'''
    Each of the functions below takes as 
    input: pandas DataFrame (relevant table from proGres needed to check for given inconsistency)
    and 
    outputs: 
    (1) d_dftemp (pandas DataFrame) that identifies individual ids (progres_individualid) associated with records that have the relevant irregularity (Irregularity)
    (2) temp_indicators, which lists the indicator name along with additional information about the indicator, such as:
    - proGres table: string which describes the proGres table(s) from which records were searched for the relevant irregularity 
    - Criteria: string which describes the criteria applied to filter records for this irregularity
    - Grouping: The overall grouping/category name that would characterize the type of irregularity
    
    Also Note: [qc_cols] specified in section 1.1. lists columns from the relevant proGres table needed to construct the summary tables needed for the PowerBi dashboard
'''

def child_wo_id_birth():
    """Flag children aged 5 or under who have no identity or birth document.

    Reads the module-level tables ``dfi`` (individuals) and ``dfd``
    (documents), the list ``list_of_id_docs`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Child(<=5 years old) without ID/birth document"

    children = dfi[dfi.progres_age <= 5]
    id_documents = dfd[dfd.progres_documenttypeidname.isin(list_of_id_docs)]
    # Left join: a child with no matching ID document gets a null document type.
    id_doc = pd.merge(children, id_documents, how="left", on="progres_individualid")
    infants_without_id = id_doc.loc[id_doc.progres_documenttypeidname.isna(),
                                    "progres_individualid"].unique()

    # .copy() so the new column is written to an independent frame, not to a
    # chained slice of dfi.
    d_dftemp = dfi[dfi.progres_individualid.isin(infants_without_id)][qc_cols].copy()
    d_dftemp["Irregularity"] = name_of_irr

    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual table filtered for children merged with document table on progres_individualid"
    # The criteria text states the same inclusive bound (<=5) as the filter above.
    temp_indicators.loc[1,"Criteria"] = "Child (progres_age<=5) missing progres_documenttypeidname of types: birth notification, birth certificate, passport, identity card or family book"
    temp_indicators.loc[1,"Grouping"] = "document issue"

    return d_dftemp, temp_indicators

def ind_missing_docs():
    """Flag individuals that have no row in the documents-of-interest table.

    Reads the module-level tables ``dfi`` and ``dfd_interest`` and the column
    list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Individual without UNHCR-issued documents"

    # Left join: individuals without a match come out with a null document type.
    joined = dfi.merge(dfd_interest, how="left", on="progres_individualid")
    unmatched_ids = joined.loc[joined["progres_documenttypeidname"].isna(),
                               "progres_individualid"].tolist()
    d_dftemp = dfi.loc[dfi["progres_individualid"].isin(unmatched_ids), qc_cols]\
                  .assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"),
        ("Criteria", "individuals missing progres_documenttypeidname issued by UNHCR (see documents_of_interest in script for full list)"),
        ("Grouping", "document issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return d_dftemp, temp_indicators
    
def ind_2_docs():
    """Flag individuals holding two or more documents in ``dfd_interest``.

    Documents are counted per individual regardless of document type. To
    flag only repeated documents of the *same* type instead, count rows per
    (``progres_individualid``, ``progres_documenttypeidname``) pair and keep
    the ids whose count is greater than 1.

    Reads the module-level tables ``dfi`` and ``dfd_interest`` and the column
    list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    # Work on the value_counts Series directly (index = individual id,
    # value = number of documents) rather than on reset_index() output, whose
    # column names differ between pandas versions.
    docs_per_individual = dfd_interest["progres_individualid"].value_counts()
    ind_with_multiple_documents = docs_per_individual[docs_per_individual > 1].index.tolist()

    # The result is built from the list computed above only. It must not be
    # re-filtered with the notebook-level DataFrame
    # ``ind_with_multiple_unique_documents``: ``isin`` on a DataFrame tests ids
    # against its column names and therefore matches nothing.
    d_dftemp = dfi[dfi["progres_individualid"].isin(ind_with_multiple_documents)][qc_cols].copy()
    name_of_irr =  "Individual has two or more active and valid UNHCR-issued documents"
    d_dftemp["Irregularity"] = name_of_irr

    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"
    temp_indicators.loc[1,"Criteria"] = "individuals with multiple valid documents issued by UNHCR (see documents_of_interest in script for full list)"
    temp_indicators.loc[1,"Grouping"] = "document issue"

    return d_dftemp, temp_indicators

def doc_issue_date():
    """Flag individuals owning a document of interest with a null issue date.

    Reads the module-level tables ``dfi`` and ``dfd_interest`` and the column
    list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Individual has UNHCR document with no issue date"

    no_issue_date = dfd_interest["progres_dateofissue"].isna()
    flagged_ids = dfd_interest.loc[no_issue_date, "progres_individualid"].unique()
    d_dftemp = dfi.loc[dfi["progres_individualid"].isin(flagged_ids), qc_cols]\
                  .assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"),
        ("Criteria", "individuals with UNHCR document where progres_dateofissue is null"),
        ("Grouping", "document issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return d_dftemp, temp_indicators

def doc_expiry_date():
    """Flag individuals owning a document of interest with a null expiry date.

    Reads the module-level tables ``dfi`` and ``dfd_interest`` and the column
    list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Individual has UNHCR document with no expiry date"

    no_expiry_date = dfd_interest["progres_dateofexpiry"].isna()
    flagged_ids = dfd_interest.loc[no_expiry_date, "progres_individualid"].unique()
    d_dftemp = dfi.loc[dfi["progres_individualid"].isin(flagged_ids), qc_cols]\
                  .assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"),
        ("Criteria", "individuals with UNHCR document where progres_dateofexpiry is null"),
        ("Grouping", "document issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return d_dftemp, temp_indicators

def doc_issued_future():
    """Flag individuals owning a document issued after ``date_extracted``.

    Reads the module-level tables ``dfi`` and ``dfd_interest``, the column
    list ``qc_cols`` and the extraction date ``date_extracted``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Individual has UNHCR document with issue date in the future"

    issued_after_extract = dfd_interest["progres_dateofissue"].gt(date_extracted)
    flagged_ids = dfd_interest.loc[issued_after_extract, "progres_individualid"].unique()
    d_dftemp = dfi.loc[dfi["progres_individualid"].isin(flagged_ids), qc_cols]\
                  .assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"),
        ("Criteria", "individuals with UNHCR document where progres_dateofissue is later than date extracted"),
        ("Grouping", "document issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return d_dftemp, temp_indicators

def doc_exp_issue_date():
    """Flag individuals owning a document whose issue date is after its expiry date.

    Reads the module-level tables ``dfi`` and ``dfd_interest`` and the column
    list ``qc_cols``.

    Returns:
        tuple: ``(d_dftemp, temp_indicators)``. ``d_dftemp`` holds the
        ``qc_cols`` of every flagged individual plus an ``Irregularity``
        column; ``temp_indicators`` is the module-level indicator table with
        row ``1`` overwritten to describe this check.
    """
    name_of_irr = "Individual has UNHCR document with expiry date prior to issue date"

    issued_after_expiry = dfd_interest["progres_dateofissue"].gt(dfd_interest["progres_dateofexpiry"])
    flagged_ids = dfd_interest.loc[issued_after_expiry, "progres_individualid"].unique()
    d_dftemp = dfi.loc[dfi["progres_individualid"].isin(flagged_ids), qc_cols]\
                  .assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with document table (filtered for UNHCR documents) on progres_individualid"),
        ("Criteria", "individuals with UNHCR document where progres_dateofexpiry is earlier than progres_dateofissue"),
        ("Grouping", "document issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return d_dftemp, temp_indicators

In [None]:
# List of function names corresponding to irregularities we want to check for:
doc_indicators_functions = [child_wo_id_birth,
                            ind_missing_docs,
                            ind_2_docs,
                            doc_issue_date,
                            doc_expiry_date,
                            doc_issued_future,
                            doc_exp_issue_date
                           ]    
# Loop through list of functions and save outputs in df table and indicators_checked table: 
# Each function returns a pair: (frame of flagged individuals, temp_indicators).
# NOTE: every check writes its description into row 1 of the same module-level
# temp_indicators object and returns that object, so it has to be appended to
# indicators_checked inside the loop, before the next check overwrites it.
# NOTE: indicators_checked is appended to, so re-running this cell adds the same
# indicator rows again.
ddf = pd.DataFrame()
for funcs in doc_indicators_functions :
    func_returns = funcs()
    
    # first element: individuals flagged by this check
    d_dftemp = func_returns[0]
    ddf = pd.concat([ddf, d_dftemp])
    
    # second element: the one-row description of this check
    temp_indicators = func_returns[1]
    indicators_checked = pd.concat([indicators_checked, temp_indicators])

In [None]:
# Number of flagged rows per document irregularity.
ddf.Irregularity.value_counts()

In [None]:
# Shape of ddf once exact duplicate rows are removed (first occurrence kept).
ddf[~ddf.duplicated(keep="first") == True].shape

In [None]:
# Shape of ddf including any duplicate rows.
ddf.shape

In [None]:
# Shape of the subset of rows that have at least one exact duplicate
# (keep=False marks every occurrence).
ddf[ddf.duplicated(keep=False) == True].shape

### To find cases with this Irregularity

In [None]:
# Lookup inputs: the irregularity to inspect, the table listing flagged ids,
# the table to pull full records from, and the id column shared by both.
irregularity_name = "Individual without UNHCR-issued documents"
irregularity_table = ddf # df
search_table = dfi
id_needed = "progres_individualid"

# Full records of every id flagged with the chosen irregularity.
issues = search_table.loc[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table["Irregularity"].eq(irregularity_name),
                               id_needed].tolist()
    )
]
issues.head()

In [None]:
# Lookup inputs: the irregularity to inspect, the table listing flagged ids,
# the table to pull full records from, and the id column shared by both.
irregularity_name = "Child(<=5 years old) without ID/birth document"
irregularity_table = ddf # df
search_table = dfi
id_needed = "progres_individualid"

# Full records of every id flagged with the chosen irregularity.
issues = search_table.loc[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table["Irregularity"].eq(irregularity_name),
                               id_needed].tolist()
    )
]
# issues[issues.progres_arrivaldate<issues.progres_dateofbirth][["progres_arrivaldate", "progres_dateofbirth"]]
issues.head()

In [None]:
# Spot-check: every document record of one individual (hard-coded id).
dfd[dfd.progres_individualid == "AD49E28E-E1E9-EB11-812E-001DD8B71FE3"]

## 2.5. Specific Needs Inconsistencies

### All Specific Needs: Input table needed for Specific Needs Crosstab in PowerBi:

In [None]:
# Inner join: specific-needs rows without a matching individual record are dropped.
all_sn = dfs.merge(dfi, on="progres_individualid", how="inner")
# One row per individual and specific-need category (first occurrence kept).
all_sn = all_sn.drop_duplicates(subset=["progres_individualid", "progres_spncategory2name"],
                                keep="first")
all_sn["Irregularity"] = all_sn["progres_spncategory2name"]
all_sn["SpecificNeeds"] = True

# Row count per specific need x age cohort x sex; missing values form their own group.
spn_input_for_crosstab = (
    all_sn
    .groupby(["progres_spncategory2name", "progres_agecohortname", "progres_sexname"],
             dropna=False)["progres_individualid"]
    .count()
    .reset_index()
)

# Keep the quality-check columns plus the three crosstab dimensions.
all_sn = all_sn[qc_cols + ["progres_spncategory2name",
                           "progres_agecohortname",
                           "progres_sexname"]]

In [None]:
# Preview of the PowerBi visual: specific need (rows) by age cohort and sex
# (columns), as raw counts with a "Total" margin.
pd.crosstab(index=all_sn["progres_spncategory2name"],
            columns=[all_sn["progres_agecohortname"], all_sn["progres_sexname"]],
            margins=True,
            margins_name="Total",
            normalize=False)

### Create input tables to be used for specific needs table quality checks

In [None]:
# Individuals joined to their specific needs so that specific needs can be
# cross-checked against demographic characteristics. The join is an outer
# join, so unmatched rows from either table are kept.
df_i_s = dfi.merge(dfs, on="progres_individualid", how="outer")

# Year and month of birth taken from the progres_dateofbirth field.
df_i_s["yob"] = df_i_s["progres_dateofbirth"].apply(lambda dob: dob.year)
df_i_s["mob"] = df_i_s["progres_dateofbirth"].apply(lambda dob: dob.month)

### SPN Functions

In [None]:
'''
    Each of the functions below takes as 
    input: pandas DataFrame (relevant table from proGres needed to check for given inconsistency)
    and 
    outputs: 
    (1) sp_dftemp (pandas DataFrame) that identifies individual ids (progres_individualid) associated with records that have the relevant irregularity (Irregularity)
    (2) temp_indicators, which lists the indicator name along with additional information about the indicator, such as:
    - proGres table: string which describes the proGres table(s) from which records were searched for the relevant irregularity 
    - Criteria: string which describes the criteria applied to filter records for this irregularity
    - Grouping: The overall grouping/category name that would characterize the type of irregularity
    
    Also Note: [qc_cols] specified in section 1.1. lists columns from the relevant proGres table needed to construct the summary tables needed for the PowerBi dashboard
'''

def child_marital_status():
    """Flag children under 10 recorded in a union but lacking the "Child at risk" tag.

    ``df_i_s`` holds one row per individual and specific need, so the
    "Child at risk" exclusion is applied per individual: a child who carries
    that tag is not flagged, even when other specific-need rows exist for the
    same child. (A row-level test would flag such a child through the other
    rows.)

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    # 'Divorced', 'Widowed' and 'Separated' are deliberately not part of the check.
    union_statuses = ['Married',
                      'Partnership',
                      'Common Law Married']

    # Ids of every individual that has the "Child at risk" tag on any row.
    tagged_child_at_risk = df_i_s.loc[df_i_s.progres_spncategory2name == "Child at risk",
                                      "progres_individualid"].unique()

    # .copy() so the new column is written to an independent frame.
    sp_dftemp = df_i_s[(df_i_s.progres_age < 10)&
                       (df_i_s.progres_maritalstatusname.isin(union_statuses))&
                       (~df_i_s.progres_individualid.isin(tagged_child_at_risk))][qc_cols].copy()
    name_of_irr = 'Child (<10 years old) with Marital Status Name and No Specific Needs'
    sp_dftemp["Irregularity"] = name_of_irr

    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual table merged with specific needs table on progres_individualid"
    temp_indicators.loc[1,"Criteria"] = "individual with age <10 and marital status (married, partnership, or common law married) without the Child at Risk tag"
    temp_indicators.loc[1,"Grouping"] = "specific needs issue"

    return sp_dftemp, temp_indicators

def older_person():
    """Flag rows tagged 'Older person at risk' where the individual is under 60.

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    name_of_irr = "Older person at risk who is younger than 60"

    has_tag = df_i_s["progres_spncategory2name"].eq('Older person at risk')
    under_60 = df_i_s["progres_age"].lt(60)
    sp_dftemp = df_i_s.loc[has_tag & under_60, qc_cols].assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with specific needs table on progres_individualid"),
        ("Criteria", "Older person at risk tag for individual with progres_age<60"),
        ("Grouping", "specific needs issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return sp_dftemp, temp_indicators

def male_woman_risk():
    """Flag rows tagged 'Woman at risk' where the recorded sex is "Male".

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    name_of_irr = "Male individual with Woman At Risk specific need"

    has_tag = df_i_s["progres_spncategory2name"].eq('Woman at risk')
    is_male = df_i_s["progres_sexname"].eq("Male")
    sp_dftemp = df_i_s.loc[has_tag & is_male, qc_cols].assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with specific needs table on progres_individualid"),
        ("Criteria", "Woman at risk tag for individual with progres_sex=Male"),
        ("Grouping", "specific needs issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return sp_dftemp, temp_indicators

def child_single_parent():
    """Flag rows tagged 'Single parent' where the individual is aged 11 or under.

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    name_of_irr = "Child (age<=11) tagged as Single Parent"

    has_tag = df_i_s["progres_spncategory2name"].eq('Single parent')
    is_child = df_i_s["progres_age"].le(11)
    sp_dftemp = df_i_s.loc[has_tag & is_child, qc_cols].assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with specific needs table on progres_individualid"),
        ("Criteria", "Single parent tag for child of age <=11"),
        ("Grouping", "specific needs issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return sp_dftemp, temp_indicators

def child_risk_18():
    """Flag rows tagged "Child at risk" where the individual is aged 18 or over.

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    name_of_irr = "Child at risk older than or equal to age 18"

    is_adult = df_i_s["progres_age"].ge(18)
    has_tag = df_i_s["progres_spncategory2name"].eq("Child at risk")
    sp_dftemp = df_i_s.loc[is_adult & has_tag, qc_cols].assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with specific needs table on progres_individualid"),
        ("Criteria", "Child at risk tag for individual with progres_age>=18"),
        ("Grouping", "specific needs issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return sp_dftemp, temp_indicators

def usc_18_age():
    """Flag rows tagged "Unaccompanied or separated child" where age is 18 or over.

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    name_of_irr = "Unaccompanied or separated child tag for individual age 18 or above"

    has_tag = df_i_s["progres_spncategory2name"].eq("Unaccompanied or separated child")
    is_adult = df_i_s["progres_age"].ge(18)
    sp_dftemp = df_i_s.loc[has_tag & is_adult, qc_cols].assign(Irregularity=name_of_irr)

    indicator_fields = (
        ("Irregularity", name_of_irr),
        ("proGres_table", "individual table merged with specific needs table on progres_individualid"),
        ("Criteria", "Unaccompanied or separated child tag for individual with progres_age>=18"),
        ("Grouping", "specific needs issue"),
    )
    for field, text in indicator_fields:
        temp_indicators.loc[1, field] = text

    return sp_dftemp, temp_indicators

def sgbv_child():
    """Flag rows tagged "SGBV" where the individual is aged 4 or under.

    Reads the module-level table ``df_i_s`` and the column list ``qc_cols``.

    Returns:
        tuple: ``(sp_dftemp, temp_indicators)``. ``sp_dftemp`` holds the
        ``qc_cols`` of every flagged row plus an ``Irregularity`` column;
        ``temp_indicators`` is the module-level indicator table with row ``1``
        overwritten to describe this check.
    """
    # .copy() so the new column is written to an independent frame.
    sp_dftemp = df_i_s[(df_i_s.progres_spncategory2name == "SGBV")&
                       (df_i_s.progres_age<=4)][qc_cols].copy()
    # NOTE(review): the label says "<4" while the filter is age <= 4. The label
    # is left unchanged because other tables or reports may match on it.
    name_of_irr =  "SGBV case for child age <4"
    sp_dftemp["Irregularity"] = name_of_irr

    temp_indicators.loc[1,"Irregularity"] = name_of_irr
    temp_indicators.loc[1,"proGres_table"] = "individual table merged with specific needs table on progres_individualid"
    # Acronym spelled "SGBV", matching the tag value used in the filter above.
    temp_indicators.loc[1,"Criteria"] = "SGBV tag for individual with progres_age<=4"
    temp_indicators.loc[1,"Grouping"] = "specific needs issue"

    return sp_dftemp, temp_indicators

In [None]:
# List of function names corresponding to irregularities we want to check for:
spn_indicators_functions = [child_marital_status,
                            older_person,
                            male_woman_risk,
                            child_single_parent,
                            child_risk_18,
                            usc_18_age,
                            sgbv_child
                            ]

# Loop through list of functions and save outputs in df table and indicators_checked table: 
# Each function returns a pair: (frame of flagged rows, temp_indicators).
# NOTE: every check writes its description into row 1 of the same module-level
# temp_indicators object and returns that object, so it has to be appended to
# indicators_checked inside the loop, before the next check overwrites it.
# NOTE: indicators_checked is appended to, so re-running this cell adds the same
# indicator rows again.
sdf = pd.DataFrame()
for funcs in spn_indicators_functions :
    func_returns = funcs()
    
    # first element: rows flagged by this check
    sp_dftemp = func_returns[0] 
    sdf = pd.concat([sdf, sp_dftemp])
    
    # second element: the one-row description of this check
    temp_indicators = func_returns[1]
    indicators_checked = pd.concat([indicators_checked, temp_indicators])

In [None]:
# Number of flagged rows per specific-needs irregularity.
sdf.Irregularity.value_counts()

In [None]:
# Shape of sdf before duplicates are removed.
sdf.shape

### To find cases with this Irregularity

In [None]:
# Lookup inputs: the irregularity to inspect, the table listing flagged ids,
# the table to pull full records from, and the id column shared by both.
irregularity_name = "Child (age<=11) tagged as Single Parent"
irregularity_table = sdf
search_table = dfi
id_needed = "progres_individualid"

# First records of the individuals flagged with the chosen irregularity.
search_table.loc[
    search_table[id_needed].isin(
        irregularity_table.loc[irregularity_table["Irregularity"].eq(irregularity_name),
                               id_needed].tolist()
    )
].head()

### Remove Duplicates

In [None]:
# sdf[~sdf.duplicated(keep="first")].shape

# sdf[sdf.duplicated(keep="first")].shape

# sdf[~sdf.duplicated(keep=False)].shape

# sdf[sdf.duplicated(keep=False)].shape

In [None]:
# Rows per irregularity after removing exact duplicates (first occurrence kept).
sdf[~sdf.duplicated(keep="first")].Irregularity.value_counts()

In [None]:
# Rows per irregularity that would be removed as repeats of an earlier row.
sdf[sdf.duplicated(keep="first")].Irregularity.value_counts()

In [None]:
# Duplicated "Older person at risk who is younger than 60" rows, counted per
# individual (keep=False marks every occurrence of a duplicated row).
sdf[(sdf.duplicated(keep=False))&
   (sdf.Irregularity == "Older person at risk who is younger than 60")].groupby("progres_individualid").Irregularity.value_counts()

In [None]:
# Drop exact duplicate rows from sdf, keeping the first occurrence.
# (The same de-duplication could instead be left to the idf step that follows.)
sdf = sdf.drop_duplicates(keep="first")

In [None]:
# Shape of sdf after duplicates were removed.
sdf.shape

In [None]:
# Rows per specific-needs irregularity after de-duplication.
sdf.Irregularity.value_counts()

# Section 3.: Calculate shares and combine ALL Inconsistencies 

## 3.1 Combine all Individual-level inconsistencies

In [None]:
# Stack the irregularity tables found at the individual level: individual
# checks (df), document checks (ddf) and specific-needs checks (sdf).
# Registration group and address irregularities (rdf and adf) are combined
# later, since they do not have fields relating to the individual.
idf = pd.concat([df, ddf, sdf])

# Remove exact duplicate rows, keeping the first occurrence.
idf = idf.drop_duplicates(keep="first")

# Append the demographic columns of the individual needed for the summary table.
idf = idf.merge(dfi[qc_cols_append], how="left", on="progres_individualid")

####  Re-format data in fields needed for the summary table
idf["QC_Group"] = "Individual"
idf["GUID"] = idf["progres_individualid"]
idf["ID_Number"] = idf["progres_id"]

# Records created by one of the two accounts below are labelled "Migration",
# every other record "User". (modifiedbyname is deliberately not considered.)
idf["Source_DQ"] = np.where(
    idf["createdbyname"].isin(["proGres v4 Migration Account 01",
                               "sa-pro4-PROD-Azure Service Account"]),
    "Migration",
    "User",
)

idf["COO"] = idf["progres_countryoforiginidname"]
idf["YearCreated"] = idf["createdon"].apply(lambda stamp: stamp.year)
idf["YearModified"] = idf["modifiedon"].apply(lambda stamp: stamp.year)
idf["YearArrival"] = idf["progres_arrivaldate"].apply(lambda stamp: stamp.year)
idf["YearRegistered"] = idf["progres_registrationdate"].apply(lambda stamp: stamp.year)
# COA is the text before the first '-' of the business unit name, minus its last character.
idf["COA"] = idf["progres_businessunitname"].apply(lambda unit: unit.split('-')[0][:-1])
idf["COA"] = idf["COA"].apply(lambda coa: "GCC except UAE" if coa == "RO Riyad" else coa)
idf["YearOfBirth"] = idf["progres_dateofbirth"].apply(lambda stamp: stamp.year)
idf["SpecificNeeds"] = None

# Renumber the rows 0..n-1.
idf = idf.reset_index(drop=True)

#### Individual level irregularities: Create summary tables that feed into Power BI

##### 1) Declare demographic / biographical characteristics that are of interest

In [None]:
# Grouping keys for the individual-level summary tables.
# Order matters: other cells slice this list by position
# ([:-1] leaves out "SpecificNeeds"; [:-3] also leaves out "QC_Group" and "Source_DQ").
cols_to_groupby = [
                  "COA",
                  "YearCreated",
                  "YearModified",
                  "YearArrival",
                  "YearRegistered",
                  "progres_refugeestatusname",
                  "progres_sexname",
                  "progres_agecohortname",
                  "COO",
                  "Source_DQ", 
                  "QC_Group", 
                  "SpecificNeeds"
                    ]

##### 2) Get count of unique individuals with irregularity per demographic characteristics

In [None]:
# Unique flagged individuals per demographic group, over the rows whose
# SpecificNeeds value is null. "SpecificNeeds" is used as a grouping key
# (missing keys kept) and the column is dropped again afterwards.
isummary_unique = (
    idf.loc[idf["SpecificNeeds"].isna()]
    .groupby(cols_to_groupby, dropna=False)["progres_individualid"]
    .nunique()
    .reset_index()
    .rename(columns={"progres_individualid" : "UniqueCasesByCountry"})
    .drop(columns=["SpecificNeeds"])
)


In [None]:
# Check: the per-group unique counts add up to the overall number of unique flagged individuals.
isummary_unique.UniqueCasesByCountry.sum() == idf[idf.SpecificNeeds.isna()].progres_individualid.nunique()

##### 3) Get number of irregularities found per demographic/biographical characteristic

In [None]:
# Number of irregularity rows per demographic group and irregularity type.
isummary = (
    idf.groupby(cols_to_groupby + ["Irregularity"], dropna=False)["progres_individualid"]
    .count()
    .reset_index()
    .rename(columns={"progres_individualid": "NumberOfCases"})
)
# Attach the unique-individual counts, joining on every grouping column
# except the last one ("SpecificNeeds").
isummary = isummary.merge(isummary_unique, how="inner", on=cols_to_groupby[:-1])
isummary = isummary.rename(columns={"progres_sexname" : "Sex",
                                    "progres_agecohortname" : "AgeCohort",
                                    "progres_refugeestatusname" : "RefugeeStatus"})
# Initialised to 0 in this table.
isummary["TotalCasesByCountry"] = 0

In [None]:
# Preview the first row of the individual-level summary.
isummary.head(1)

##### 4) Get total individuals screened per demographic/biographical characteristic

In [None]:
# Work on a copy of the full individual table and derive the same grouping
# columns that were derived for idf.
ishare = dfi.copy()
ishare["YearCreated"] = ishare["createdon"].apply(lambda stamp: stamp.year)
ishare["YearModified"] = ishare["modifiedon"].apply(lambda stamp: stamp.year)
ishare["YearArrival"] = ishare["progres_arrivaldate"].apply(lambda stamp: stamp.year)
ishare["COA"] = ishare["progres_businessunitname"].apply(lambda unit: unit.split('-')[0][:-1])
ishare["COA"] = ishare["COA"].apply(lambda coa: "GCC except UAE" if coa == "RO Riyad" else coa)
ishare["COO"] = ishare["progres_countryoforiginidname"]
ishare["YearRegistered"] = ishare["progres_registrationdate"].apply(lambda stamp: stamp.year)

# Unique individuals per demographic group; the last three grouping columns
# ("Source_DQ", "QC_Group", "SpecificNeeds") are left out of the keys.
ishare_totals = (
    ishare.groupby(cols_to_groupby[:-3], dropna=False)["progres_individualid"]
    .nunique()
    .reset_index()
    .rename(columns={"progres_individualid": "TotalCasesByCountry"})
)
ishare_totals["QC_Group"] = "Individual"
ishare_totals.head()

In [None]:
# Sum of the per-group unique-individual totals.
ishare_totals.TotalCasesByCountry.sum()

In [None]:
# Shape of the individual table, for comparison with the sum of the totals.
dfi.shape

In [None]:
# NumberOfCases per irregularity over the rows whose SpecificNeeds value is null.
isummary[isummary.SpecificNeeds.isna()].groupby("Irregularity").NumberOfCases.sum()

In [None]:
# NumberOfCases per irregularity over the rows whose SpecificNeeds value is not null.
isummary[~isummary.SpecificNeeds.isna()].groupby("Irregularity").NumberOfCases.sum()

##### 5) Get detailed table of individuals along with their inconsistencies and append inconsistencies count + list (previously named ianon)

In [None]:
# Detailed per-irregularity table: idf without the raw date, location, age
# and audit columns listed below (drop returns a new frame; idf is unchanged).
i_detailed = idf.drop(columns=[
    'progres_registrationdate',
    'progres_arrivaldate',
    'createdon',
    'modifiedon',
    #'progres_spncategory2name',
    'progres_businessunitname',
    'progres_countryoforiginidname',
    'progres_countryofasylumidname',
    'progres_dateofbirth',
    'progres_age',
    'createdbyname',
    'modifiedbyname',
])
# Identifier columns; a hash could be mapped over them here: .map(hash)
i_detailed['GUID'] = i_detailed["progres_individualid"]
i_detailed['ID_Number'] = i_detailed["progres_id"]
i_detailed.head(1)

In [None]:
# Number of irregularities recorded for each (GUID, ID_Number) pair.
Inconsistencies_Count = (
    i_detailed.groupby(["GUID", "ID_Number"])["Irregularity"]
    .count()
    .reset_index()
    .rename(columns={"Irregularity" : "Irregularity_Count"})
)
Inconsistencies_Count.head()

In [None]:
# ' ; '-separated list of the irregularities recorded for each (GUID, ID_Number) pair.
Inconsistencies_List = (
    i_detailed.groupby(["GUID", "ID_Number"])["Irregularity"]
    .apply(lambda names: ' ; '.join(str(name) for name in names))
    .reset_index()
)
Inconsistencies_List = Inconsistencies_List[["GUID", "ID_Number", "Irregularity"]]\
                           .drop_duplicates()\
                           .rename(columns={"Irregularity" : "Irregularity_List"})
Inconsistencies_List.head()

In [None]:
# Attach the irregularity count and the irregularity list to every row of the
# detailed individual table, matching on the (GUID, ID_Number) pair.
i_detailed = i_detailed.merge(Inconsistencies_Count, how="left", on=["GUID", "ID_Number"])
i_detailed = i_detailed.merge(Inconsistencies_List, how="left", on=["GUID", "ID_Number"])
i_detailed.shape

In [None]:
# Preview the first row of the detailed individual table.
i_detailed.head(1)

## 3.2 Combine all Registration Group inconsistencies

In [None]:
# Check: True when rdf contains no exact duplicate rows
# (its shape is unchanged after dropping duplicates).
rdf[~rdf.duplicated(keep="first")].shape == rdf.shape

In [None]:
# Combine all case-level irregularities- i.e. those related to registration (rdf) and addresses (adf)
# NOTE: rdf is overwritten with the combined table, so re-running this cell
# appends adf again.
rdf = pd.concat([rdf, adf])

In [None]:
# Append the additional registration-group columns needed for the summary table.
# dfr_fp can hold more than one row per registration group (e.g. two focal
# points). Joining it as-is repeats every irregularity of such a group once per
# focal point, which inflates NumberOfCases and Irregularity_Count. Only the
# first row per registration group is therefore used, the same rule that is
# applied to dfr_fp when the screened totals are computed.
rdf = pd.merge(rdf,
               dfr_fp[r_qc_cols_append].drop_duplicates(subset=["progres_registrationgroupid"],
                                                        keep="first"),
               how="left",
               on="progres_registrationgroupid")

####  Re-format data in fields needed for the summary table
rdf = rdf.rename(columns={"progres_countryoforiginidname" : "COO"})
# COA is the text before the first '-' of the business unit name, minus its last character.
rdf["COA"] = rdf["progres_businessunitname"].apply(lambda x: x.split('-')[0][:-1])
rdf["COA"] = rdf["COA"].apply(lambda x: "GCC except UAE" if x == "RO Riyad" else x)
# Records created by one of the two accounts below are labelled "Migration",
# every other record "User".
rdf["Source_DQ"] = np.where(rdf["createdbyname"].isin(["proGres v4 Migration Account 01",
                                                       "sa-pro4-PROD-Azure Service Account"]),
                            "Migration", "User")
rdf["YearCreated"] = rdf["createdon"].apply(lambda x: x.year)
rdf["YearModified"] = rdf["modifiedon"].apply(lambda x: x.year)
rdf["YearRegistered"] = rdf["progres_registrationdate"].apply(lambda x: x.year)
rdf["QC_Group"] = "RegistrationGroup"
rdf["GUID"] = rdf["progres_registrationgroupid"]
rdf["ID_Number"] = rdf["progres_registrationgroupbusinessid"]
# Renumber the rows 0..n-1.
rdf = rdf.reset_index(drop=True)
# remove duplicates
rdf = rdf.drop_duplicates(keep="first")
rdf.columns

#### Case-level irregularities: Create summary tables to feed into Power BI

##### 1) Declare demographic / biographical characteristics that are of interest

In [None]:
# Grouping keys for the registration-group summary tables.
# Order matters: the screened-totals cell slices this list by position
# ([:-2] leaves out "Source_DQ" and "QC_Group").
r_cols_to_groupby = [
                  "COA",
                  "COO",
                  "YearCreated",
                  "YearModified",
                  "YearRegistered",
                  "Source_DQ", 
                  "QC_Group"
                    ]

##### 2) Get count of unique cases with irregularity per demographic characteristics

In [None]:
# Unique flagged registration groups per combination of the grouping keys
# (rows with missing keys are kept as their own groups).
rsummary_unique = (
    rdf.groupby(r_cols_to_groupby, dropna=False)["progres_registrationgroupid"]
    .nunique()
    .reset_index()
    .rename(columns={"progres_registrationgroupid" : "UniqueCasesByCountry"})
)
rsummary_unique.head()

##### 3) Get number of irregularities found per demographic/biographical characteristic

In [None]:
# Number of irregularity rows per grouping-key combination and irregularity
# type, with the unique-group counts attached.
rsummary = (
    rdf.groupby(r_cols_to_groupby + ["Irregularity"], dropna=False)["progres_registrationgroupid"]
    .count()
    .reset_index()
    .rename(columns={"progres_registrationgroupid": "NumberOfCases"})
    .merge(rsummary_unique, how="left", on=r_cols_to_groupby)
)
# Initialised to 0 in this table.
rsummary["TotalCasesByCountry"] = 0
rsummary.shape

##### 4) Get total cases screened per demographic/biographical characteristic

In [None]:
# Shape of dfr_fp before de-duplication.
dfr_fp.shape

In [None]:
# Shape when only the first row per registration group id is kept.
dfr_fp[~dfr_fp.duplicated(["progres_registrationgroupid"],keep='first')].shape

In [None]:
# Shape of the rows that repeat an already-seen registration group id.
dfr_fp[dfr_fp.duplicated(["progres_registrationgroupid"],keep='first')].shape

In [None]:
# Keep a single row per registration group. Where a group has two focal
# points, the biographical information of the first one is the one retained.
dfr_fp = dfr_fp.drop_duplicates(subset=["progres_registrationgroupid"], keep='first')

In [None]:
# Independent copy of dfr_fp, so the derived columns added next do not alter dfr_fp.
rshare = dfr_fp.copy()

In [None]:
# Derive the grouping columns on the screened registration groups, in the
# same way they were derived for rdf.
rshare["YearRegistered"] = rshare["progres_registrationdate"].apply(lambda stamp: stamp.year)
rshare["COA"] = rshare["progres_businessunitname"].apply(lambda unit: unit.split('-')[0][:-1])
rshare["COO"] = rshare["progres_countryoforiginidname"]
rshare["COA"] = rshare["COA"].apply(lambda coa: "GCC except UAE" if coa == "RO Riyad" else coa)
rshare["YearCreated"] = rshare["createdon"].apply(lambda stamp: stamp.year)
rshare["YearModified"] = rshare["modifiedon"].apply(lambda stamp: stamp.year)

In [None]:
# Display the registration-group grouping keys.
r_cols_to_groupby

In [None]:
# Unique screened registration groups per grouping-key combination; the last
# two keys ("Source_DQ", "QC_Group") are left out.
rshare_totals = (
    rshare.groupby(r_cols_to_groupby[:-2], dropna=False)["progres_registrationgroupid"]
    .nunique()
    .reset_index()
    .rename(columns={"progres_registrationgroupid": "TotalCasesByCountry"})
)
rshare_totals.head()

In [None]:
# Label these totals as registration-group totals.
rshare_totals["QC_Group"] = "RegistrationGroup"

In [None]:
# Sum of the per-group unique registration-group totals.
rshare_totals.TotalCasesByCountry.sum()

##### 5) Get detailed table of cases along with their inconsistencies and append inconsistencies count + list (previously named ianon)

In [None]:
# Preview the first row of the case-level irregularity table.
rdf.head(1)

In [None]:
# Column names currently present in rdf.
rdf.columns

In [None]:
# Detailed case-level table: a new frame built from rdf with the identifier
# columns set (a hash could be mapped over them here: .map(hash)) and the raw
# id, date, location and audit columns removed.
r_detailed = rdf.assign(
    GUID=rdf["progres_registrationgroupid"],
    ID_Number=rdf["progres_registrationgroupbusinessid"],
).drop(columns=[
    "progres_registrationgroupid",
    "createdon",
    "modifiedon",
    'progres_registrationdate',
    'progres_registrationgroupbusinessid',
    'progres_businessunitname',
    'progres_countryofasylumidname',
    'createdbyname',
    'modifiedbyname',
])

In [None]:
# Count the non-null irregularities recorded for each registration group
# (identified by GUID + ID_Number).
Inconsistencies_Count = (
    r_detailed
    .groupby(["GUID", "ID_Number"])["Irregularity"]
    .count()
    .reset_index(name="Irregularity_Count")
)
Inconsistencies_Count.head()

In [None]:
# Concatenate, per registration group, all of its irregularity labels into a
# single ' ; '-separated string.
Inconsistencies_List = (
    r_detailed
    .groupby(["GUID", "ID_Number"])["Irregularity"]
    .apply(lambda labels: ' ; '.join(labels))
    .reset_index()
)
Inconsistencies_List = (
    Inconsistencies_List
    .loc[:, ["GUID", "ID_Number", "Irregularity"]]
    .drop_duplicates()
    .rename(columns={"Irregularity": "Irregularity_List"})
)
Inconsistencies_List.head()

In [None]:
# Attach the irregularity count and the irregularity list to every
# registration-group row (left joins keep all rows of r_detailed).
r_detailed = (
    r_detailed
    .merge(Inconsistencies_Count, how="left", on=["GUID", "ID_Number"])
    .merge(Inconsistencies_List, how="left", on=["GUID", "ID_Number"])
)
r_detailed.shape

In [None]:
r_detailed.head()

## 3.3 Combine individual and case-level tables

#### 1) Combine table outputs from section 3.1.4 and 3.2.4

In [None]:
# Stack the individual-level and registration-group-level totals, then show
# five random rows as a spot check.
total = pd.concat((ishare_totals, rshare_totals))
total.sample(n=5)

#### 2) Combine table outputs from 3.1.5 and 3.2.5

In [None]:
# Stack the individual-level and registration-group-level detail tables.
df_detailed = pd.concat([i_detailed, r_detailed])

# Keep only the rows that carry no SpecificNeeds value.
df_detailed = df_detailed[df_detailed["SpecificNeeds"].isna()]

# Only the identifiers, the QC group and the irregularity count/list are
# exported. Every other column is discarded by this projection, so the
# earlier blanking of those columns (setting them to None / 0) had no effect
# on the result and has been removed.
df_detailed = df_detailed[["GUID", "ID_Number", "QC_Group", "Irregularity_Count", "Irregularity_List"]]

# One row per distinct record (first occurrence wins).
df_detailed = df_detailed.drop_duplicates(keep="first")

#### 3) Combine table outputs from 3.1.3 and 3.2.3

In [None]:
# Summary table to feed into dashboard: individual and registration-group
# summaries stacked under a fresh 0..n-1 index.
final_df = pd.concat([isummary, rsummary]).reset_index(drop=True)
final_df["ScriptExecutionDate"] = datetime.datetime.now().date()
final_df["ExtractionDate"] = date_extracted.date()

# Append grouping information to summary so it also appears on the dashboard.
# The lookup is de-duplicated first: indicators_checked can hold several rows
# per irregularity (e.g. after the "Indicators Checked" cell has been run),
# and a left merge against repeated keys would multiply the summary rows.
grouping_lookup = indicators_checked[["Irregularity", "Grouping"]].drop_duplicates()
final_df = pd.merge(final_df, grouping_lookup, how="left", on="Irregularity")

##  3.4 Get List of Indicators Checked

In [None]:
# Total number of cases per irregularity and QC group, counting only summary
# rows that have no SpecificNeeds value.
cases_per_indicator = (
    final_df[final_df.SpecificNeeds.isna()]
    .groupby(["Irregularity", "QC_Group"])
    .NumberOfCases.sum()
    .reset_index()
)

# NOTE: indicators_checked is overwritten with the merge result, so running
# this cell a second time merges the case counts in again.
indicators_checked = indicators_checked.merge(
    cases_per_indicator,
    on="Irregularity",
    how="outer",
)
indicators_checked["DateExtracted"] = str(date_extracted)
indicators_checked["ScriptVersion"] = script_version
indicators_checked.shape

In [None]:
indicators_checked.head(2)

# Section 4: Save to db in server

In [None]:
import urllib.parse

SERVER = ip_of_server
DATABASE = 'QualityCheck_Egypt'
DRIVER = 'SQL Server'

# Credentials are stored as "username,password". Split on the first comma
# only so a password containing commas is not truncated, and strip
# surrounding whitespace (e.g. a trailing newline added by an editor).
USERNAME, PASSWORD = (part.strip() for part in uap.split(',', 1))

# Percent-encode every user-supplied URL component: characters such as
# '@', ':', '/' or '%' in the credentials would otherwise be read as URL
# delimiters by SQLAlchemy and corrupt the connection string.
DATABASE_CONNECTION = (
    f'mssql://{urllib.parse.quote(USERNAME, safe="")}:{urllib.parse.quote(PASSWORD, safe="")}'
    f'@{SERVER}/{DATABASE}?driver={urllib.parse.quote(DRIVER, safe="")}'
)

engine = sqlalchemy.create_engine(DATABASE_CONNECTION)
connection = engine.connect()

In [None]:
# Persist the summary DataFrame as a SQL table, replacing any previous
# version of it (use if_exists='append' to add rows instead).
final_df.to_sql(
    name="Egypt_DQ_summary_ho",
    con=engine,
    # schema="dbo",
    if_exists='replace',
    index=False,
)

In [None]:
# Persist the combined totals table, replacing any previous version of it
# (use if_exists='append' to add rows instead).
total.to_sql(
    name="Egypt_Total_ho",
    con=engine,
    # schema="dbo",
    if_exists='replace',
    index=False,
)

In [None]:
# Persist the detailed per-record inconsistency table, replacing any previous
# version of it (use if_exists='append' to add rows instead).
df_detailed.to_sql(
    name="Egypt_Ind_w_inconsistencies_ho",
    con=engine,
    # schema="dbo",
    if_exists='replace',
    index=False,
)

In [None]:
# Persist the list of indicators checked, replacing any previous version of
# it (use if_exists='append' to add rows instead).
indicators_checked.to_sql(
    name="Egypt_Indicators_Checked_ho",
    con=engine,
    # schema="dbo",
    if_exists='replace',
    index=False,
)

In [None]:
# Persist the specific-needs crosstab input, replacing any previous version
# of the table.
spn_input_for_crosstab.to_sql(
    name="SPN_input_for_crosstab_in_pwrbi_ho",
    con=engine,
    # schema="dbo",
    if_exists='replace',
    index=False,
)

# Section 5: To find cases with a particular Irregularity

#### Individuals with QC issues

In [None]:
# Load the detailed inconsistency table produced by this script.
# Section 4 saves it as "Egypt_Ind_w_inconsistencies_ho"; the query previously
# read the unsuffixed "Egypt_Ind_w_inconsistencies" table, which this script
# never writes, so the GUIDs could come from a different extraction than the
# dfi / dfr_fp tables searched below.
# NOTE(review): assumes the table was written to the dbo schema (to_sql is
# called without an explicit schema) - confirm against the server.
deets = pd.read_sql_query("SELECT * FROM dbo.Egypt_Ind_w_inconsistencies_ho",
                          db_connection)

In [None]:
# Settings for looking up individuals flagged with one particular irregularity.
irregularity_name = "Individual with Date of Birth after Registration Date"
# Columns shown for cross-checking. "progres_individualid" was previously
# misspelled "progres_indvidualid", which does not match the id column used
# below and would raise a KeyError when selecting the columns.
cols_of_interest_for_crosschecking = ["progres_id", "progres_individualid", "progres_dateofbirth", "progres_registrationdate"]
search_table = dfi                   # table whose records are displayed
id_needed = "progres_individualid"   # column of search_table matched against GUID
irregularity_table = deets           # table holding GUID + Irregularity_List

def get_record(irregularity_name, cols_of_interest_for_crosschecking, irregularity_table,
               lookup_table=None, id_column=None):
    """Return the first records affected by a given irregularity.

    Parameters
    ----------
    irregularity_name : str
        Text searched for (substring match) in the ``Irregularity_List``
        column of ``irregularity_table``.
    cols_of_interest_for_crosschecking : list of str
        Columns of the lookup table to return.
    irregularity_table : pandas.DataFrame
        Table with a ``GUID`` column and an ``Irregularity_List`` column.
    lookup_table : pandas.DataFrame, optional
        Table whose records are returned. Defaults to the module-level
        ``search_table``.
    id_column : str, optional
        Column of the lookup table matched against ``GUID``. Defaults to the
        module-level ``id_needed``.

    Returns
    -------
    pandas.DataFrame
        At most the first five matching rows, restricted to the requested
        columns.
    """
    table = search_table if lookup_table is None else lookup_table
    key = id_needed if id_column is None else id_column

    # Null / non-text entries cannot contain the irregularity; treating them
    # as "no match" avoids the TypeError a plain `in` test raises on NaN/None.
    has_issue = [
        isinstance(listed, str) and irregularity_name in listed
        for listed in irregularity_table["Irregularity_List"]
    ]
    flagged_ids = irregularity_table.loc[has_issue, "GUID"]

    issues = table[table[key].isin(flagged_ids)]
    return issues[cols_of_interest_for_crosschecking].head()

In [None]:
# Settings for looking up registration groups flagged with one particular
# irregularity.
irregularity_table = deets                    # table holding GUID + Irregularity_List
search_table = dfr_fp                         # table whose records are displayed
id_needed = "progres_registrationgroupid"     # column of search_table matched against GUID
irregularity_name = "Active Registration Group without active focal point"
cols_of_interest_for_crosschecking = [
    "progres_registrationgroupbusinessid",
    "progres_individualid",
]

def get_record(irregularity_name, cols_of_interest_for_crosschecking, irregularity_table,
               lookup_table=None, id_column=None):
    """Return the first records affected by a given irregularity.

    Parameters
    ----------
    irregularity_name : str
        Text searched for (substring match) in the ``Irregularity_List``
        column of ``irregularity_table``.
    cols_of_interest_for_crosschecking : list of str
        Columns of the lookup table to return.
    irregularity_table : pandas.DataFrame
        Table with a ``GUID`` column and an ``Irregularity_List`` column.
    lookup_table : pandas.DataFrame, optional
        Table whose records are returned. Defaults to the module-level
        ``search_table``.
    id_column : str, optional
        Column of the lookup table matched against ``GUID``. Defaults to the
        module-level ``id_needed``.

    Returns
    -------
    pandas.DataFrame
        At most the first five matching rows, restricted to the requested
        columns.
    """
    table = search_table if lookup_table is None else lookup_table
    key = id_needed if id_column is None else id_column

    # Null / non-text entries cannot contain the irregularity; treating them
    # as "no match" avoids the TypeError a plain `in` test raises on NaN/None.
    has_issue = [
        isinstance(listed, str) and irregularity_name in listed
        for listed in irregularity_table["Irregularity_List"]
    ]
    flagged_ids = irregularity_table.loc[has_issue, "GUID"]

    issues = table[table[key].isin(flagged_ids)]
    return issues[cols_of_interest_for_crosschecking].head()

In [None]:
get_record(irregularity_name, cols_of_interest_for_crosschecking, irregularity_table)

# Section 6: APPENDIX

### Overall figures

In [None]:
ishare.shape

In [None]:
rshare.shape

In [None]:
idf[idf.SpecificNeeds.isna()].shape

In [None]:
rdf.shape

In [None]:
df_detailed[(df_detailed.QC_Group == "Individual")].shape

In [None]:
df_detailed[df_detailed.QC_Group == "RegistrationGroup"].shape

In [None]:
final_df[final_df.duplicated(keep='first')].shape