In [1]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
def tokenize_and_normalize(text):
    # Tokenize by splitting the text
    tokens = text.split()
    
    # Normalize by converting to lowercase and removing stop words
    normalized_tokens = [token.lower() for token in tokens if token.lower() not in ENGLISH_STOP_WORDS]
    
    return set(normalized_tokens)

In [4]:
# Load the datasets
df_livability = pd.read_csv("../Data/city-of-melbourne-liveability-and-social-indicators.csv")
df_suburb = pd.read_csv("../Data/Region summary_ Melbourne CBD - East SA2 206041503.csv")


In [5]:
# Extract the relevant columns
livability_indicators = df_livability["Indicator"].tolist()
suburb_attributes = df_suburb["Description"].tolist()

In [6]:
matches = {}

for attribute in suburb_attributes:
    attribute_tokens = tokenize_and_normalize(attribute)
    
    for indicator in livability_indicators:
        indicator_tokens = tokenize_and_normalize(indicator)
        
        # If there's a significant overlap, consider it a potential match
        overlap = attribute_tokens.intersection(indicator_tokens)
        
        if len(overlap) > 1:  # You can adjust this threshold as needed
            if attribute not in matches:
                matches[attribute] = []
            matches[attribute].append(indicator)

# Print the matches
for attribute, indicators in matches.items():
    print(f"{attribute} potentially matches with:")
    for indicator in indicators:
        print(f"  - {indicator}")
    print()

Population density (persons/km2) potentially matches with:
  - Population density per square kilometre (km2)
  - Population density per square kilometre (km2)
  - Population density per square kilometre (km2)
  - Population density per square kilometre (km2)

Working age population (aged 15-64 years) (no.) potentially matches with:
  - Percentage of population that are adults (25-64 years)
  - Percentage of population that are adults (25-64 years)
  - Population dependency ratio (people aged under 15 years or over 64 years)
  - Population dependency ratio (people aged under 15 years or over 64 years)
  - Population dependency ratio (people aged under 15 years or over 64 years)
  - Population dependency ratio (people aged under 15 years or over 64 years)
  - Percentage of population that are youth (15-24 years)
  - Percentage of population that are children (0-14 years)
  - Percentage of population that are children (0-14 years)
  - Number of registered voters as a percentage of the vot

In [7]:
from fuzzywuzzy import fuzz

In [10]:
# Define a threshold for considering a match
threshold = 60  # This means 70% similarity; you can adjust as needed

matches = {}

for attribute in suburb_attributes:
    for indicator in livability_indicators:
        similarity = fuzz.ratio(attribute, indicator)
        
        if similarity > threshold:
            if attribute not in matches:
                matches[attribute] = []
            matches[attribute].append((indicator, similarity))

# Print the matches
for attribute, indicators in matches.items():
    print(f"{attribute} potentially matches with:")
    for indicator, similarity in indicators:
        print(f"  - {indicator} (Similarity: {similarity}%)")
    print()



Population density (persons/km2) potentially matches with:
  - Population density per square kilometre (km2) (Similarity: 73%)
  - Population density per square kilometre (km2) (Similarity: 73%)
  - Population density per square kilometre (km2) (Similarity: 73%)
  - Population density per square kilometre (km2) (Similarity: 73%)

Working age population (aged 15-64 years) (no.) potentially matches with:
  - Percentage of population that are adults (25-64 years) (Similarity: 63%)
  - Percentage of population that are adults (25-64 years) (Similarity: 63%)
  - Percentage of population that are youth (15-24 years) (Similarity: 62%)
  - Percentage of population that are adults (25-64 years) (Similarity: 63%)
  - Percentage of population that are adults (25-64 years) (Similarity: 63%)
  - Percentage of population that are youth (15-24 years) (Similarity: 62%)
  - Percentage of population that are youth (15-24 years) (Similarity: 62%)
  - Percentage of population that are youth (15-24 years) 

## Features for livability (Via script)
- Population density (persons/km2) is shown in the indicators as a metric of livability
- Working age population (aged 15-64 years) is broken down into smaller age groups in the inficator data
- Number of employing businesses
- Total number of businesses
- Unemployment rate (%)
- Total households (no.)

## Features of livability (Possible)
- Persons living in appropriately sized dwellings (Important)
- Persons living in a dwelling requiring X additional bedrooms (Important)
- Retail trade (no.)
- Health care and social assistance (no.) (Important)
- Accommodation and food services (no.) (Super Important)
- Completed year 12 or equivalent (%)
- Bachelor degree (%)

## Livability indicators not in SA2 data
- Crimes against property (Important)
- Violent crime rate
- Percentage of country GDP (Change to percentage of State GDP)
- Annual population change (Important)
- Personal Wellbeing Index

## Possible indicators
- Weather
- City Budgets

## Addordability indicators
- Median Rent Price
- Rent-to-Income Ratio
- Rental Stress: It's the percentage of households spending more than 30% of their pre-tax income on rent.
- Vacancy Rates: The percentage of all available rental properties that are vacant or unoccupied. A lower rate can indicate high demand for rentals, which can drive up prices.
- Housing Supply: The number of new housing units approved or constructed in a given period. An increase in supply can potentially lead to decreased prices, improving affordability.