In [1]:
# Analysis of Scraped council data 
import pandas as pd
# Load the full sacommunity export.
# low_memory=False makes pandas parse the file in one pass, so each column gets a
# single inferred dtype instead of a possibly mixed, chunk-by-chunk inference.
# NOTE(review): the saved output of this cell echoes the read_csv line, which looks
# like a mixed-dtype warning - confirm the warning is gone after this change.
df = pd.read_csv("./data/cu_export_all.csv", low_memory=False)
df.shape

  df = pd.read_csv("./data/cu_export_all.csv")


(14352, 79)

In [2]:
from helpers.file_helper import FileHelper
import pandas as pd

# Load the scraped council records from the JSON-lines export via the project's
# FileHelper (presumably a list of dict-like records - verify against the helper).
output_file_path = "./data/cu_export_all_scraped_new.jsonl"
file_helper = FileHelper()
output_records = file_helper.read_jsonlines_all(output_file_path)

# Optional narrowing to a single council; switched off by default.
council_name = "City of Burnside"
filter_by_council = False
if filter_by_council:
    output_records = list(
        filter(lambda rec: rec.get("council") == council_name, output_records)
    )
output_records_df = pd.DataFrame(output_records)

# Number of records available; reused by the summary cell.
total_records = len(output_records_df)
print("Total records ", total_records)

Total records  14352


In [3]:
# Show the column labels of the scraped-records DataFrame, i.e. the fields
# available for the checks in the following cells.
output_records_df.columns

Index(['org_id', 'address', 'council', 'electorate_state',
       'electorate_federal', 'error_message', 'has_error', 'council_scraped',
       'electorate_state_scraped', 'is_council_correct',
       'is_electorate_state_correct', 'scraped_text'],
      dtype='object')

In [4]:
# 1. Errors
# 1.1 Address is empty in the sacommunity data
# Solution: add the address (search the website or contact the organisation).
#
# A missing (None/NaN) or whitespace-only address is treated as empty as well, which
# matches the null-or-whitespace address check used when building the retry list.
address_col = output_records_df["address"]
address_is_blank = address_col.isna() | address_col.astype(str).str.strip().eq("")
empty_address_df = output_records_df[address_is_blank]
empty_address_count = len(empty_address_df)
print("Empty address count ", empty_address_count)

Empty address count  50


In [5]:
# Preview the first five records that have no address.
empty_address_df.head(n=5)

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
7,193943,,District Council of Ceduna,Flinders,Grey,address is null or empty,True,,,False,False,
9,193945,,Wattle Range Council,MacKillop,Barker,address is null or empty,True,,,False,False,
40,193982,,Roxby Downs,Giles,Grey,address is null or empty,True,,,False,False,
54,194002,,City of Adelaide,Adelaide,Adelaide,address is null or empty,True,,,False,False,
56,194004,,City of Adelaide,Adelaide,Adelaide,address is null or empty,True,,,False,False,


In [7]:
# Errors: exceptions
# Errors other than an empty address. These could be: scraping forbidden by the URL,
# too many requests, internal server error, etc.
# Solution: check the error message and retry scraping.
#
# Null or whitespace-only addresses are excluded here: they belong to the
# empty-address case, and a plain `!= ""` comparison would let None/NaN through.
address_col = output_records_df["address"]
has_address = address_col.notna() & address_col.astype(str).str.strip().ne("")
errors_df = output_records_df[output_records_df["has_error"].eq(True) & has_address]
errors_count = len(errors_df)
print("Exceptions count ", errors_count)

Exceptions count  0


In [8]:
# Preview the first five records that failed with an exception.
errors_df.head(n=5)

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text


In [9]:
# Address is not recognised as a valid address by the source of scraping.
# Solution: update the address to a valid one (search the website or contact the
# organisation).
#
# na=False makes rows whose scraped_text is missing (None/NaN) evaluate to False, so
# the mask stays strictly boolean; without it a single missing value makes the
# boolean indexing below raise.
no_results_mask = output_records_df["scraped_text"].str.startswith(
    "No results found", na=False
)
address_not_found_df = output_records_df[no_results_mask]
invalid_address_count = len(address_not_found_df)
print("Invalid address count ", invalid_address_count)

Invalid address count  349


In [10]:
# Preview the first five records whose address returned no results.
address_not_found_df.head(n=5)

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
0,193932,"RSL Hall, West Tce",Yorke Peninsula Council,Narungga,Grey,,False,,,False,False,No results found. Please note this tool only s...
10,193936,Egerton St,Mid Murray Council,Chaffey,Barker,,False,,,False,False,No results found. Please note this tool only s...
11,193940,Main St,Wakefield Regional Council,Frome,Grey,,False,,,False,False,No results found. Please note this tool only s...
12,193946,Third St,District of Franklin Harbour,Flinders,Grey,,False,,,False,False,No results found. Please note this tool only s...
15,193948,Cnr Midway Rd & Halsey Rd,City of Playford,Elizabeth,Spence,,False,,,False,False,No results found. Please note this tool only s...


In [11]:
# Council name mismatch
# The address has a value and the lookup did not answer "No results found", yet the
# record is flagged as having an incorrect council, i.e. the council is presumably
# recorded wrongly on the website.
# Solution: update the council name and the corresponding state and federal electorates.
#
# na=False keeps the "No results found" mask strictly boolean when scraped_text is
# missing (None/NaN); a missing text is then treated like an empty one. Without it,
# negating the mask with `~` fails as soon as one value is missing.
# NOTE(review): rows with has_error == True and a non-empty address would also pass
# this filter; the exceptions count is 0 in this run, so there is no overlap here.
council_flagged_wrong = output_records_df["is_council_correct"].eq(False)
address_present = output_records_df["address"].ne("")
no_results_mask = output_records_df["scraped_text"].str.startswith(
    "No results found", na=False
)
council_name_mismatch_df = output_records_df[
    council_flagged_wrong & address_present & ~no_results_mask
]
council_name_mismatch_count = len(council_name_mismatch_df)
print("council_name_mismatch_count ", council_name_mismatch_count)

council_name_mismatch_count  1558


In [12]:
# Preview the first five records whose recorded council differs from the scraped one.
council_name_mismatch_df.head(n=5)

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
2,193934,21 Sturt St,Berri Barmera Council,Chaffey,Barker,,False,City of Marion,WARRIPARINGA WARD,False,False,City of Marion\nCouncil Name City of Marion\nE...
8,193944,Edward St,District Council of Coober Pedy,Giles,Grey,,False,City of Marion,WOODLANDS WARD,False,False,City of Marion\nCouncil Name City of Marion\nE...
13,193949,2 Railway Pde,Regional Council of Goyder,Stuart,Grey,,False,Outback Communities,,False,False,Outback Communities\nCouncil Name Outback Comm...
14,193950,Hanson St,Light Regional Council,Schubert,Barker,,False,Regional Council of Goyder,BURRA WARD,False,False,Regional Council of Goyder\nCouncil Name Regio...
18,193955,7 Bruce Ave,Tatiara District Council,MacKillop,Barker,,False,District Council of Mount Remarkable,WILLOCHRA WARD,False,False,District Council of Mount Remarkable\nCouncil ...


In [None]:
# TODO: The electorate-state comparison appears to be wrong and needs investigation.
# https://www.lga.sa.gov.au/sa-councils/councils-listing#map returns the electoral ward,
# so the relationship between electoral state and electoral ward has to be clarified first.
# Electorate State mismatch
# The address has a value and the scraped data returned a council name. The council name is correct, but the electorate state is wrong.
# Solution: Update the council name, and the corresponding electoral state and federal state.
# (The check below stays commented out until the TODO above is resolved.)
# electorate_state_mismatch_df = output_records_df[(output_records_df["is_electorate_state_correct"] == False) 
#                                              & (output_records_df["address"] != "")
#                                              & (~output_records_df["scraped_text"].str.startswith("No results found"))]
# electorate_state_mismatch_df.shape

In [None]:
# electorate_state_mismatch_df.head(2)

In [21]:
# Summary: print every count computed by the earlier cells, one line per metric.
print("Summary of scraped council names on date November 6, 2024")
summary_rows = [
    ("Total records: ", total_records),
    ("Empty address count: ", empty_address_count),
    ("Exceptions count (Network or scraping exceptions): ", errors_count),
    ("Invalid address count: ", invalid_address_count),
    ("Mismatch Council names count: ", council_name_mismatch_count),
]
for summary_label, summary_value in summary_rows:
    # Two positional arguments keep print's default single-space separator.
    print(summary_label, summary_value)

Summary of scraped council names on date November 6, 2024
Total records:  14352
Empty address count:  50
Exceptions count (Network or scraping exceptions):  0
Invalid address count:  349
Mismatch Council names count:  1558


In [16]:
# Retry for
# No address found
# Exception
from helpers.string_helper import StringHelper
import pandas as pd

# Build the list of records to scrape again:
#   * records whose address lookup answered "No results found."
#   * records that hit an exception although they do have an address
string_helper = StringHelper()
output_records_to_retry = []
for record in output_records:
    # No address found. The isinstance guard covers a missing or non-string
    # scraped_text, on which .startswith would otherwise raise.
    scraped_text = record.get("scraped_text")
    address_not_found = isinstance(scraped_text, str) and scraped_text.startswith(
        "No results found."
    )

    # Exceptions: an error was recorded even though the address is not blank.
    had_exception = record.get("has_error", False) and not string_helper.is_null_or_whitespace(
        record.get("address")
    )

    # One combined test, so a record matching both conditions is queued only once.
    if address_not_found or had_exception:
        output_records_to_retry.append(record)

output_records_to_retry_df = pd.DataFrame(output_records_to_retry)
output_records_to_retry_df.shape

(349, 12)

In [17]:
# Preview the first five records queued for a retry.
output_records_to_retry_df.head(n=5)

Unnamed: 0,org_id,address,council,electorate_state,electorate_federal,error_message,has_error,council_scraped,electorate_state_scraped,is_council_correct,is_electorate_state_correct,scraped_text
0,193932,"RSL Hall, West Tce",Yorke Peninsula Council,Narungga,Grey,,False,,,False,False,No results found. Please note this tool only s...
1,193936,Egerton St,Mid Murray Council,Chaffey,Barker,,False,,,False,False,No results found. Please note this tool only s...
2,193940,Main St,Wakefield Regional Council,Frome,Grey,,False,,,False,False,No results found. Please note this tool only s...
3,193946,Third St,District of Franklin Harbour,Flinders,Grey,,False,,,False,False,No results found. Please note this tool only s...
4,193948,Cnr Midway Rd & Halsey Rd,City of Playford,Elizabeth,Spence,,False,,,False,False,No results found. Please note this tool only s...
