### Data and tools prep

In [None]:
import pandas as pd
import numpy as np

In [None]:
#Load the raw facility export; the path is relative to the notebook's working directory
facility = pd.read_csv("./data/Health_Facilities_2019-07-03.csv")

In [None]:
#Preview the first rows instead of rendering the whole frame
facility.head()

In [None]:
#List every column name; the service, infrastructure and info column groups below are picked from this list
facility.columns

In [None]:
#'Sterilazation_and_Infection_Control' shows up four times - some copies come from infrastructure, others
#from service - and pd.concat may have mixed up where the data ended up.
#The original col and .2 (from infra), and .1 and .3 (from service), look like repeats of each other.
sterilization_cols = ['Sterilazation_and_Infection_Control', 'Sterilazation_and_Infection_Control.1',
                      'Sterilazation_and_Infection_Control.2', 'Sterilazation_and_Infection_Control.3']

#Only the last expression of a cell is displayed, so print each check explicitly instead of
#leaving it as a bare expression. NaN != NaN, hence the fillna('NA') before comparing.
#If both print True they are repeating columns: keep the original and .1
for kept, duplicate in [(sterilization_cols[0], sterilization_cols[2]),
                        (sterilization_cols[1], sterilization_cols[3])]:
    is_duplicate = (facility[kept].fillna('NA') == facility[duplicate].fillna('NA')).all()
    print(kept, '==', duplicate, ':', is_duplicate)

#Side-by-side preview of the four copies
facility[['Facility_ID'] + sterilization_cols].head()

### The Task: We want a *facility_services table* and *facility_infrastructure table*

In [None]:

#One column per service category; of the sterilization copies only '.1' is taken here
service_columns = [
    'General_Clinical_Services',
    'Malaria_Diagnosis_and_Treatment',
    'TB_Diagnosis,Care_and_Treatment',
    'Cardiouvascula_Care_and_Treatment',
    'HIV/AIDS_Prevention',
    'HIV/AIDS_Care_and_Treatment',
    'Therapeutics',
    'Prosthetics_and_Medical_Devices',
    'Health_Promotion_and_Disease_Prevention',
    'Diagnosis_Serices',
    'Reproductive_&_Child_Health_Care_Serices',
    'Growth_Monitoring/Nutrition_&_Surveliance',
    'Oral_Health_Services(Dental_Serices)',
    'ENT_Services',
    'Sterilazation_and_Infection_Control.1',
    'Support_Services',
    'Emergency_Preparedness',
    'Other_Services',
]
#Facility_ID stays first so it can serve as the id column when melting
fac_serv_wide = facility[['Facility_ID'] + service_columns]

#Keep this wide for now: one row per facility with its identifying and descriptive columns.
#Note that the selection contains both space-separated names ('Facility Type', 'Operating Status',
#'Ownership') and underscore names ('Facility_Type', 'Facility_Operating_Status', 'Facility_Ownership');
#they are separate columns in the source file and all of them are kept.
facility_info = facility[['Facility_ID', 'Facility Name', 'Ward', 'Village/Street',
       'Facility Type', 'Operating Status', 'Ownership', 'Latitude',
       'Longitude', 'Facility_Zone', 'Facility_Region', 'Facility_District',
       'Facility_Council', 'Facility_Location', 'Common_Facility_Name',
       'Registration_ID', 'CTC_ID', 'MTUHA_Code', 'MSD_ID', 'Facility_Type',
       'Facility_Ownership', 'Facility_Operating_Status',
       'Facility_Registration_Status', 'Location_Description',
       'Waypoint_Number', 'Altitude(Meters)', 'Service_Areas(Villages)',
       'Catchment_Population', 'Date_Opened/Inaugurated/Upgraded', 'Year',
       'Postal_Address', 'Postal_Code', 'Official_Phone_#', 'Official_Fax_#',
       'Official_Email', 'Facility_In-Charge_Name',
       'Facility_In-Charge_Cadre', 'Website']]

In [None]:
#Long format: one row per facility and service category, with the category name in "Type of Service".
#To reverse the melt, df.pivot (or just use the original DFs)
facility_services = fac_serv_wide.melt(
    id_vars='Facility_ID',
    var_name="Type of Service",
    value_name="Services",
)

### Section 1: Creating a manual collection of services and adding delimiters to existing data

In [None]:
#The melted table still isn't a true long table: it should hold one service per row,
#but a single cell can contain 4 or more services run together with no clear delimiter.
#So we add delimiters in an iterative process below and wrangle the data to 1 service per row.
#Start by listing every distinct raw value to see what has to be split.
list(facility_services['Services'].unique())

In [None]:
#I can manually start a collection of all unique services.
#Then, I can wrap a delimiter around the service, this time I'm choosing a caret - thank you CALPADS.
#As I add new services to the list, I use dwindle to surface the next service that needs to be added to the list.
#If we do this right, the unique list of services should == the manual collection of services.
#ISSUE 1: If we have "Lab: X-Ray" and "X-Ray", the list can't contain "X-Ray",
#otherwise it'll leave "Lab: ". We should do the longer of the two, then come back to the others.
#Known others: Radiology Services, Laboratory, Lab: Pathology, Family Planning, Surgical Intervetion, X-Ray
def wrap_with_caret(x, services):
    """Wrap every known service found in ``x`` with caret delimiters.

    Parameters
    ----------
    x : str or missing value
        Raw cell text that may contain several services run together.
    services : iterable of str
        Known service names. They are applied in the order given, so when two
        names overlap (e.g. "Lab: X-Ray" and "X-Ray") list only the longer one.

    Returns
    -------
    str or the original object
        ``x`` with each occurrence of a known service replaced by
        ``^service^``. Anything that is not a string (NaN, None, numbers)
        is returned unchanged.
    """
    # isinstance rather than ``type(x) == float`` so numpy floats and None
    # also pass through instead of raising on ``.replace``.
    if not isinstance(x, str):
        return x
    for service in services:
        # A blank row in the list file arrives as NaN and cannot be searched for;
        # an empty name would insert carets between every character.
        if not isinstance(service, str) or not service:
            continue
        x = x.replace(service, '^' + service + '^')
    return x

def split_caret(x):
    """Break caret-delimited text into its pieces.

    A float (how a missing cell shows up here) is handed back untouched.
    Anything else is split on '^' and the resulting list is returned as-is,
    including any empty or whitespace-only pieces.
    """
    return x if type(x) == float else x.split('^')

def keep_just_services(x):
    """Drop the empty and whitespace-only pieces left over from splitting on carets.

    A float (missing cell) is wrapped in a one-element list so that every
    cell ends up holding a list. Otherwise ``x`` is iterated and only the
    non-empty, non-blank strings are kept, in their original order.
    """
    if type(x) == float:
        return [x]
    kept = []
    for piece in x:
        if not piece or piece.isspace():
            continue
        kept.append(piece)
    return kept

def dwindle(x, seen):
    """Blank out the services already recorded so only the not-yet-listed text remains.

    Each entry of ``seen`` that occurs in ``x`` is replaced with an empty
    string. If only whitespace is left the result is ''. Floats (missing
    cells) are returned unchanged.
    """
    if type(x) == float:
        return x
    remaining = x
    for known in seen:
        if known in remaining:
            remaining = remaining.replace(known, '')
    return '' if remaining.isspace() else remaining
    
def search_services(x, keywords):
    """Tell whether ``keywords`` occurs in ``x``; used as a row filter to look up unclear values.

    Floats (missing cells) never match and give False.
    """
    return False if type(x) == float else keywords in x
        
def create_long_df_rows(new_rows_list, df):
    """Unpack a frame whose third column holds lists into one row per list item.

    Parameters
    ----------
    new_rows_list : list
        Rows are appended to this list in place, as ``[col0, col1, item]``.
    df : pandas.DataFrame
        The first two columns are copied onto every new row; the third
        column holds the list of items to unpack. A cell with an empty
        list contributes no rows.

    Returns
    -------
    list
        ``new_rows_list`` itself, so the call can also be used as an expression.
    """
    for row in df.values.tolist():
        items = row[2]
        # A cell that was never turned into a list (NaN or a plain string) counts
        # as a single item; iterating it would raise on NaN and would split a
        # string into characters.
        if not isinstance(items, (list, tuple)):
            items = [items]
        for service in items:
            new_rows_list.append([row[0], row[1], service])
    return new_rows_list
#test = facility_services.loc[[0,1,2,3]] #if need to test something

In [None]:
#Re-run this cell after every edit to services_list.csv: it reloads the list and shows
#what is still left over once the already-recorded services are blanked out
services_list = pd.read_csv('services_list.csv')
leftover_services = facility_services['Services'].apply(dwindle, args=(services_list['services_list'],))
list(leftover_services.unique())

In [None]:
#Look up the rows that contain a keyword when a value is unclear
keyword_rows = facility_services.Services.apply(search_services, args=('Serives',))
facility_services[keyword_rows]

In [None]:
#Wrap known services in carets, split on the carets, then drop the blank pieces.
#Each cell of Services ends up holding a list.
facility_services['Services'] = (
    facility_services['Services']
    .apply(wrap_with_caret, args=(services_list.services_list,))
    .apply(split_caret)
    .apply(keep_just_services)
)
facility_services

In [None]:
#create_long_df_rows appends to the list it is given, so start from a fresh empty list
#every time this cell runs
new_service_rows = []
create_long_df_rows(new_service_rows, facility_services)
#Build the long data frame with the same column names; Services now holds one value per row
longer_facility_services = pd.DataFrame(new_service_rows, columns=facility_services.columns)

In [None]:
#Preview the first rows instead of rendering the whole frame
longer_facility_services.head()

### Section 2: Pesky Overlaps for Services

In [None]:
#The first round went well. What is left are the overlapping names that could not go into the list:
#Radiology Services, Laboratory, Lab: Pathology, Family Planning, Surgical Intervetion, X-Ray
#They seem to be grouped by Type of Service, so list the distinct values to confirm.
longer_facility_services['Services'].unique()

In [None]:
#The service categories (the melted column names) that Section 2a filters on
longer_facility_services['Type of Service'].unique()

#### Section 2a: Confirming how they are grouped and which interventions might fix the issues

In [None]:
#For X-Ray: distinct services recorded under the TB category
is_tb = longer_facility_services['Type of Service'].eq("TB_Diagnosis,Care_and_Treatment")
tb_service = longer_facility_services[is_tb]
tb_service['Services'].unique()

In [None]:
#For Surgical Intervetion (a misspelling, but at least it's a global misspelling ;) ):
#distinct services recorded under the oral health category
is_oral = longer_facility_services['Type of Service'].eq("Oral_Health_Services(Dental_Serices)")
oral_service = longer_facility_services[is_oral]
oral_service['Services'].unique()

In [None]:
#For Family Planning: distinct services recorded under the reproductive & child health category
is_reproductive = longer_facility_services['Type of Service'].eq("Reproductive_&_Child_Health_Care_Serices")
fam_service = longer_facility_services[is_reproductive]
fam_service['Services'].unique()

In [None]:
#For Diagnosis Services - Radiology Services, Laboratory, Lab: Pathology
#A little harder because sometimes they need to be split up from each other.
#Easy win is to strip the space like the others first, then tackle the unique cases that need a split:
#Laboratory Radiology Services, Laboratory Lab: Pathology, Lab: Pathology Radiology Services
#We can do a unique version of the caret delimiter that wraps these cases in particular.
#Stored under its own name so it does not overwrite the Family Planning frame (fam_service).
diag_service = longer_facility_services[longer_facility_services['Type of Service'] == "Diagnosis_Serices"]
diag_service['Services'].unique()

#### Section 2b: Fixing the low-hanging fruit by using .strip() to remove the whitespace around these values. Re-run Section 2a to see the changes

In [None]:
def remove_space(row, service_type, keyword):
    """Strip surrounding whitespace from a row's value when it belongs to a given type and contains ``keyword``.

    Works on both long tables built in this notebook: the services table
    ('Type of Service' / 'Services') and the infrastructure table
    ('Type of Infrastructure' / 'Value').

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        One row of either long table.
    service_type : str
        Only rows whose type column equals this value are touched.
    keyword : str
        Only values containing this text are stripped.

    Returns
    -------
    The stripped value for matching rows; the original value otherwise
    (including missing values, which are returned unchanged).

    Raises
    ------
    KeyError
        If the row has the columns of neither table.
    """
    # Pick the schema up front instead of catching KeyError, which would also
    # swallow an unrelated missing key raised while handling a services row.
    if 'Type of Service' in row:
        type_col, value_col = 'Type of Service', 'Services'
    else:
        type_col, value_col = 'Type of Infrastructure', 'Value'

    value = row[value_col]
    # Missing cells are NaN floats, and ``keyword in nan`` raises TypeError,
    # so only strings are searched.
    if row[type_col] == service_type and isinstance(value, str) and keyword in value:
        return value.strip()
    return value

In [None]:
#Strip the stray whitespace around X-Ray values in the TB category
longer_facility_services['Services'] = longer_facility_services.apply(
    remove_space, axis=1, args=("TB_Diagnosis,Care_and_Treatment", "X-Ray"))

In [None]:
#Strip the stray whitespace around Surgical Intervetion values in the oral health category
longer_facility_services['Services'] = longer_facility_services.apply(
    remove_space, axis=1, args=("Oral_Health_Services(Dental_Serices)", "Surgical Intervetion"))

In [None]:
#Strip the stray whitespace around Family Planning values in the reproductive & child health category
longer_facility_services['Services'] = longer_facility_services.apply(
    remove_space, axis=1, args=("Reproductive_&_Child_Health_Care_Serices", "Family Planning"))

In [None]:
#Same clean-up for the three overlapping names in the diagnosis category, one pass per keyword
for diagnosis_keyword in ("Radiology Services", "Laboratory", "Lab: Pathology"):
    longer_facility_services['Services'] = longer_facility_services.apply(
        remove_space, axis=1, args=("Diagnosis_Serices", diagnosis_keyword))

In [None]:
#Shape before the second split; Section 2b only rewrote values, so the row count should be unchanged
longer_facility_services.shape

#### Section 2c: For Diagnosis Services, we need a step 2. Because edge cases are known, a specific edge case version of caret wrap should fix the issue. Then we can repeat the chaining steps that created longer_facility_services

In [None]:
def wrap_with_caret_v2(x):
    """Caret-wrap the three known Diagnosis Services values that still hold two services.

    Handles exactly these values (whole-string match):
    Laboratory Radiology Services,
    Laboratory Lab: Pathology,
    Lab: Pathology Radiology Services.
    Every other value, and any float (missing cell), is returned unchanged.
    """
    if type(x) == float:
        return x
    edge_cases = (
        ("Laboratory Radiology Services", "^Laboratory^ ^Radiology Services^"),
        ("Laboratory Lab: Pathology", "^Laboratory^ ^Lab: Pathology^"),
        ("Lab: Pathology Radiology Services", "^Lab: Pathology^ ^Radiology Services^"),
    )
    for combined, wrapped in edge_cases:
        if x == combined:
            return wrapped
    return x

In [None]:
#Second round: wrap the known two-service values, split on carets, drop the blank pieces.
#Each cell of Services holds a list again afterwards.
longer_facility_services['Services'] = (
    longer_facility_services['Services']
    .apply(wrap_with_caret_v2)
    .apply(split_caret)
    .apply(keep_just_services)
)

In [None]:
#Fresh list again: create_long_df_rows appends to whatever list it is given
new_service_rows = []
create_long_df_rows(new_service_rows, longer_facility_services)
#Final long services table, one service per row with the same three column names
normalized_facility_services = pd.DataFrame(new_service_rows, columns=longer_facility_services.columns)

In [None]:
#Check the distinct service values after the second split
normalized_facility_services['Services'].unique()

In [None]:
#Shape of the final long services table
normalized_facility_services.shape

### Section 3: Anything to clean up for infrastructure?

In [None]:
#Every infrastructure column; the un-suffixed sterilization column is the copy used here
infra_columns = [
    'Reception_Room(s)',
    'Consultation_Room(s)',
    'Dressing_Room(s)',
    'Ward_Room(s)',
    'Injection_Room(s)',
    'Observation_Room(s)',
    'Remarks',
    'Patient_Beds',
    'Deliery_Beds',
    'Baby_Cots',
    'Ambulances',
    'Cars',
    'Motorcycles',
    'Other_Transport',
    'Sterilazation_and_Infection_Control',
    'Means_of_Transport_to_Referral_Point',
    'Distance_to_Referral_Point',
    'Challanges/Remarks_to_Referral_Point',
    'Source_of_Energy',
    'Other_Source_of_Energy',
    'Mobile_Networks',
    'Other_Mobile_Networks',
    'Source_of_Water',
    'Other_Source_of_Water',
    'Toilet_Facility',
    'Toilet_Remarks',
    'Waste_Management',
    'Other_Waste_Management',
]
fac_infra_wide = facility[['Facility_ID'] + infra_columns]

#To reverse the melt, df.pivot (or just use the original DFs)
facility_infrastructure = fac_infra_wide.melt(
    id_vars='Facility_ID',
    var_name="Type of Infrastructure",
    value_name='Infrastructure',
)

In [None]:
#The infrastructure categories (the melted column names)
facility_infrastructure['Type of Infrastructure'].unique()

#### Section 3a: Review the data to see if long data design makes sense

In [None]:
#Print the distinct values of every infrastructure category to judge which ones suit a long layout
infra_types = facility_infrastructure['Type of Infrastructure']
for infra_type in infra_types.unique():
    distinct_values = facility_infrastructure[infra_types == infra_type]['Infrastructure'].unique()
    print(infra_type, distinct_values, '\n')

#Source_of_Energy, Mobile_Networks, Source_of_Water, Toilet_Facility, Waste_Management are the only good candidates
#for a long facility_infrastructure design

In [None]:
#Split infrastructure in two: facility_infrastructure_a (just the columns that can melt)
#and facility_infrastructure_b (everything else, kept wide)
fac_infra_a_wide = facility[['Facility_ID', 'Source_of_Energy', 'Mobile_Networks',
          'Source_of_Water', 'Toilet_Facility', 'Waste_Management']]

#Same columns as fac_infra_wide minus the five melt candidates above
facility_infrastructure_b = facility[['Facility_ID', 'Reception_Room(s)',
       'Consultation_Room(s)', 'Dressing_Room(s)', 'Ward_Room(s)',
       'Injection_Room(s)', 'Observation_Room(s)', 'Remarks', 'Patient_Beds',
       'Deliery_Beds', 'Baby_Cots', 'Ambulances', 'Cars', 'Motorcycles',
       'Other_Transport', 'Sterilazation_and_Infection_Control',
       'Means_of_Transport_to_Referral_Point', 'Distance_to_Referral_Point',
       'Challanges/Remarks_to_Referral_Point',
       'Other_Source_of_Energy', 'Other_Mobile_Networks', 'Other_Source_of_Water',
       'Toilet_Remarks', 'Other_Waste_Management']]

#Sanity check on the split: Facility_ID is in both halves, so the column counts
#should satisfy a + b - 1 == wide (6 + 24 - 1 == 29)
print(fac_infra_a_wide.shape)
print(facility_infrastructure_b.shape)
print(fac_infra_wide.shape)

#### Section 3b: Reformat facility_infrastructure_a

In [None]:
#Long format: one row per facility and infrastructure category
facility_infrastructure_a = fac_infra_a_wide.melt(
    id_vars='Facility_ID',
    var_name="Type of Infrastructure",
    value_name="Value",
)

In [None]:
#Distinct raw values, to see which ones hold several items run together
facility_infrastructure_a['Value'].unique()

In [None]:
#Look up the rows behind an unclear value if needed
flush_pour_rows = facility_infrastructure_a['Value'].apply(search_services, args=("Flush/Pour",))
facility_infrastructure_a[flush_pour_rows]

In [None]:
#Re-run after every edit to infrastructure_list.csv: it reloads the values that should be wrapped
#with carets and shows what is still left over once those are blanked out
infra_list = pd.read_csv("infrastructure_list.csv")
leftover_values = facility_infrastructure_a['Value'].apply(dwindle, args=(infra_list.infra_list,))
list(leftover_values.unique())

In [None]:
#Chain the reformatting
facility_infrastructure_a['Value'] = facility_infrastructure_a['Value']\
                                        .apply(lambda x: wrap_with_caret(x, infra_list.infra_list))\
                                        .apply(lambda x: split_caret(x))\
                                        .apply(lambda x: keep_just_services(x))

In [None]:
#Fresh list: create_long_df_rows appends to whatever list it is given
new_rows = []
create_long_df_rows(new_rows, facility_infrastructure_a)
#Create the new long DF with the same column names; Value now holds one item per row
long_fac_infra_a = pd.DataFrame(new_rows, columns = facility_infrastructure_a.columns)

In [None]:
#Preview the first rows instead of rendering the whole frame
long_fac_infra_a.head()

#### Section 3c: Two overlaps, Other and Pit Latrine; use remove_space to fix both of them.

In [None]:
#The infrastructure categories present in the long table
long_fac_infra_a['Type of Infrastructure'].unique()

In [None]:
#For Other
long_fac_infra_a[long_fac_infra_a['Type of Infrastructure'] == 'Source_of_Energy']['Value'].unique()

In [None]:
#For Pit Latrine
long_fac_infra_a[long_fac_infra_a['Type of Infrastructure'] == 'Toilet_Facility']['Value'].unique()

In [None]:
#Use remove_space to .strip() the whitespace. Re-run the two cells above to confirm all are now fixed.
for infra_type, overlap in (("Source_of_Energy", "Other"), ("Toilet_Facility", "Pit Latrine")):
    long_fac_infra_a['Value'] = long_fac_infra_a.apply(remove_space, axis=1, args=(infra_type, overlap))

In [None]:
#And done! Final shape of the long infrastructure table
long_fac_infra_a.shape

### Section 4: Done and save your work!

In [None]:
#Save your work and upload to Kaggle: each table is written to ./data without its index
outputs = (
    (long_fac_infra_a, './data/facility_infrastructure_a.csv'),
    (facility_infrastructure_b, './data/facility_infrastructure_b.csv'),
    (normalized_facility_services, './data/facility_services.csv'),
    (facility_info, './data/facility_info.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)