### Obligatory imports

In [None]:
import pandas as pd
import time

### Download the newest facility list from the HFR portal and save it in this folder, then read it with pandas.

In [None]:
# The facility list holds every facility ID; an ID appended to `url` gives that
# facility's detail page.
# To refresh the list, click "Excel Export" here:
# http://hfrportal.ehealth.go.tz/index.php?r=facilities/facilitiesList

# header=2: the column names are taken from the third row of the sheet.
db = pd.read_excel(io='./Facility_List.xlsx', header=2)
url = 'http://hfrportal.ehealth.go.tz/index.php?r=facilities/view&facility_id='


In [None]:
# Preview the first rows of the facility list to check that it loaded as expected.
db.head(n=5)

## Run the scraper (takes approx. 15-17 hours)

In [None]:
#Using the IDs from the download, scrape all the websites.
#
# For every facility the detail page is read with pd.read_html, its tables are
# stacked into one two-column frame (column 0 = field label, column 1 = value),
# a 'Facility_ID' row is appended and the frame is transposed so that one
# facility becomes one row. Only the first successfully scraped page keeps its
# label row; that row later becomes the CSV header.
#
# Results left in the namespace for the following cells:
#   transposed      - list of per-facility frames (the first one includes the label row)
#   failed          - URLs whose request or parsing raised, for the retry cell
#   fullFacility    - combined frame, one row per scraped facility
#   ordered_columns - column order with the ID column moved to the front

REQUEST_DELAY_SECONDS = 4      # pause between requests (previously 3 s before + 1 s after each one)
FAILURE_BACKOFF_SECONDS = 120  # longer pause after a failed request before moving on
PROGRESS_EVERY = 100           # print the running count every N scraped facilities

transposed = []
failed = []
print('Running through the facility numbers now...')
counter = 0
# Rows without a facility number have no page to fetch, so they are skipped.
for e in db['Facility Number'].dropna(): #e is ID, ~7950 ids total
    # Build the URL once, outside the try, so the except branch can record it
    # without re-evaluating an expression that may itself have been the failure.
    facility_url = url + str(e)
    try:
        if counter > 0:
            time.sleep(REQUEST_DELAY_SECONDS)
        tables = pd.read_html(facility_url) #(url+ID)
        page = pd.concat(tables, ignore_index=True)
        page.loc[-1] = 'Facility_ID', e #Every dataframe must carry its ID: add it as an extra row
        facility_row = page.transpose()
        if counter > 0:
            # Only the first scraped page contributes the label row; drop it from all later ones.
            facility_row = facility_row.drop([0])
        transposed.append(facility_row)
        counter += 1

        if counter % PROGRESS_EVERY == 0:
            print(counter)

    except Exception as error:
        # Best effort: log, remember the URL for the retry cell, back off, carry on.
        # This now also protects the very first request, which previously ran
        # outside the try and aborted the whole run on a single network error.
        print(error)
        failed.append(facility_url)
        time.sleep(FAILURE_BACKOFF_SECONDS)

if not transposed:
    raise RuntimeError('No facility page could be scraped; the attempted URLs are in `failed`.')

fullFacility = pd.concat(transposed, ignore_index=True)

# Row 0 holds the field labels; turn them into column names
# (spaces -> underscores, possessive "'s" removed).
facility_columns = [
    label.replace(" ", "_").replace("'s", "")
    for label in fullFacility.loc[0, :].tolist()
]

fullFacility.columns = facility_columns
# reset_index returns a new frame, so its result has to be assigned to take effect.
fullFacility = fullFacility.drop([0]).reset_index(drop=True)
# The ID row was appended last on every page, so its column is the last one; move it to the front.
ordered_columns = [facility_columns[-1]] + facility_columns[:-1]
fullFacility[ordered_columns].to_csv('./Health_Facilities_2018-09-28.csv', index=False)

## Testing concat behavior

In [None]:
#Testing concat behavior
# Scratch cell: scrape three known facility pages and look at how the transposed
# frames stack. It uses its own list (`test_transposed`) so that running it does
# not overwrite `transposed`, which holds the results of the long scraping run
# and is read again by the retry cell.
test_ids = ['107930-0', '111529-4', '107932-6']
test_transposed = []
for i in test_ids:
    table2 = pd.read_html(url+i)
    twoTable = pd.concat(table2, ignore_index=True)
    twoTable.loc[-1] = 'Facility_ID', 12 # placeholder ID; only the frame shape matters here
    test_transposed.append(twoTable.transpose())

# NOTE: a bare expression in the middle of a cell is not rendered; only the
# cell's last expression is displayed.
twoTable

# Every frame still carries its label row here; only the first one (index 0) is dropped.
test = pd.concat(test_transposed, ignore_index=True).drop([0])
test

### If any requests failed, run this cell to retry them

In [None]:
#If any fail, use this:
# Re-request every URL recorded in `failed`, then rebuild `fullFacility` from the
# frames of the main run (`transposed`) plus the recovered ones and write a new CSV.
# URLs that fail again are collected in `still_failed` instead of aborting the cell.
import re

RETRY_DELAY_SECONDS = 4  # pause between retry requests

transpose_fail = []
still_failed = []
for fail in failed:
    try:
        time.sleep(RETRY_DELAY_SECONDS)
        tables = pd.read_html(fail) #(url+ID) has multiple tables on page that we need to pull
        Table = pd.concat(tables, ignore_index=True)
        # Recover the ID from the query string. Raw string: "\-" in a normal
        # string literal is an invalid escape sequence and triggers a warning.
        e = re.search(r"facility_id=([0-9-]+)", fail).group(1)
        Table.loc[-1] = 'Facility_ID', e #Every dataframe must carry its ID: add it as an extra row
        transpose_fail.append(Table.transpose().drop([0])) #Add this to a list to be concat'd, doesn't have header
    except Exception as error:
        print(error)
        still_failed.append(fail)

if still_failed:
    print(len(still_failed), 'page(s) failed again; their URLs are in `still_failed`.')

fullFacility = pd.concat(transposed+transpose_fail, ignore_index=True)

# Row 0 holds the field labels; turn them into column names
# (spaces -> underscores, possessive "'s" removed).
facility_columns = [
    label.replace(" ", "_").replace("'s", "")
    for label in fullFacility.loc[0, :].tolist()
]

fullFacility.columns = facility_columns
# reset_index returns a new frame, so its result has to be assigned to take effect.
fullFacility = fullFacility.drop([0]).reset_index(drop=True)
# The ID row was appended last on every page, so its column is the last one; move it to the front.
ordered_columns = [facility_columns[-1]] + facility_columns[:-1]
fullFacility[ordered_columns].to_csv('./Health_Facilities_2018-09-29.csv', index=False)

### Merge the original and scraped dataframes for a complete dataset

In [None]:
#It's merging time!
# Columns carried over from the original export (per the original note: the
# ones that are not duplicated by the scraped data).
kept_columns = [
    'Facility Number', 'Facility Name', 'Common Name', 'Ward', 'Village/Street',
    'Facility Type', 'Operating Status', 'Ownership', 'Latitude', 'Longitude',
]

# Give the ID column the same name as in the scraped frame so the two can be joined on it.
merge_df = db[kept_columns].rename(columns={'Facility Number': 'Facility_ID'})
merge_columns = merge_df.columns.tolist()

# Inner join: only facilities present in both the export and the scraped data are kept.
combined_data = merge_df.merge(fullFacility[ordered_columns], how='inner', on='Facility_ID')
# Remove duplicate and wrongly formatted coordinate column
combined_data = combined_data.drop(columns=['Geo-coordinates(Latitude,Longitude)'])
combined_data.to_csv('./Health_Facilities_2018-09-30.csv', index=False)