In [1]:
import pandas as pd
import os
import sklearn

In [2]:
current_path = os.getcwd()
data_sources = os.path.join(current_path, '..', 'Data Sources')

In [16]:
#datasets we're using
real_estate = pd.read_csv(os.path.join(data_sources, 'RealEstate_Sample_Wards.csv'))
real_estate.rename(columns={"Ward_Index": "Ward_ID"},inplace=True)
real_estate['Ward_ID'] = real_estate['Ward_ID'].fillna(-1) #to fill any missing values
real_estate['Ward_ID'] = real_estate['Ward_ID'].astype(int)

In [17]:
transportation = pd.read_csv(os.path.join(data_sources, 'Transportation_Sample_Wards.csv'))
transportation.rename(columns={"Ward_Index": "Ward_ID"},inplace=True)
transportation['Ward_ID'] = transportation['Ward_ID'].fillna(-1)
transportation['Ward_ID'] = transportation['Ward_ID'].astype(int)

In [18]:
amenities = pd.read_csv(os.path.join(data_sources, 'Amenities_Sample_Ward.csv'))
amenities.rename(columns={"Ward_Index": "Ward_ID"},inplace=True)
amenities['Ward_ID'] = amenities['Ward_ID'].fillna(-1)
amenities['Ward_ID'] = amenities['Ward_ID'].astype(int)

In [19]:
social_dev = pd.read_csv(os.path.join(data_sources, 'Social_Development_Sample_Wards.csv'))
social_dev.rename(columns={"Ward_Index": "Ward_ID"},inplace=True)
social_dev['Ward_ID'] = social_dev['Ward_ID'].fillna(-1)
social_dev['Ward_ID'] = social_dev['Ward_ID'].astype(int)

In [27]:
#data cleaning from Phase 2
education_data2016 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2016.csv"),
    skiprows=range(833),
    nrows=16,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
education_data2021 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2021.csv"),
    skiprows=range(978),
    nrows=17,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

# rename 'Education' column to 'Education_Level' before melting
education_data2016.rename(columns={"Education": "Education_Level"}, inplace=True)
education_data2021.rename(columns={"Education": "Education_Level"}, inplace=True)

# filter out rows where 'Education_Level' column is not empty
education_data2016 = education_data2016[education_data2016["Education_Level"].notna()]
education_data2021 = education_data2021[education_data2021["Education_Level"].notna()]

# remove all spaces in education_level column
education_data2016["Education_Level"] = education_data2016[
    "Education_Level"
].str.strip()
education_data2021["Education_Level"] = education_data2021[
    "Education_Level"
].str.strip()

# Melt the DataFrame to get 'Ward_ID', 'Education_Level', and 'Population' columns
education_data2016 = pd.melt(
    education_data2016,
    id_vars=["Education_Level"],
    var_name="Ward_ID",
    value_name="Population",
)
education_data2016["Year"] = 2016
education_data2021 = pd.melt(
    education_data2021,
    id_vars=["Education_Level"],
    var_name="Ward_ID",
    value_name="Population",
)
education_data2021["Year"] = 2021

# merge the datasets
Education = pd.concat(
    [education_data2016, education_data2021], ignore_index=True
)

# change data types
Education["Population"] = Education["Population"].astype(int)
Education["Education_Level"] = Education["Education_Level"].astype(
    str
)

In [26]:
# INDUSTRY DIMENSION
industry_data2016 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2016.csv"),
    skiprows=range(1176),
    nrows=22,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
industry_data2021 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2021.csv"),
    skiprows=range(1310),
    nrows=22,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

# Filter out rows where 'Industry' column is not empty
industry_data2016 = industry_data2016[industry_data2016["Industry"].notna()]
industry_data2021 = industry_data2021[industry_data2021["Industry"].notna()]

# Removing the numbers and spaces before each industry type
industry_data2016["Industry"] = industry_data2016["Industry"].str.strip()
industry_data2021["Industry"] = industry_data2021["Industry"].str.strip()
industry_data2016["Industry"] = industry_data2016["Industry"].str.replace(
    r"^\s*\d+(-\d+)?\s+", "", regex=True
)
industry_data2021["Industry"] = industry_data2021["Industry"].str.replace(
    r"^\s*\d+(-\d+)?\s+", "", regex=True
)

# Melt the DataFrame to get 'Ward_ID', 'Industry', and 'Population' columns
industry_data2016 = pd.melt(
    industry_data2016, id_vars=["Industry"], var_name="Ward_ID", value_name="Population"
)
industry_data2016["Year"] = 2016
industry_data2021 = pd.melt(
    industry_data2021, id_vars=["Industry"], var_name="Ward_ID", value_name="Population"
)
industry_data2021["Year"] = 2021

# merge the datasets
IndustryDimension = pd.concat([industry_data2016, industry_data2021], ignore_index=True)

# change data types
IndustryDimension["Population"] = IndustryDimension["Population"].astype(int)
IndustryDimension["Ward_ID"] = IndustryDimension["Ward_ID"].astype(str)
IndustryDimension["Industry"] = IndustryDimension["Industry"].astype(str)

In [25]:
# INCOME DIMENSION
income_data2016 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2016.csv"),
    skiprows=range(1252),
    nrows=17,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
income_data2021 = pd.read_csv(
    os.path.join(data_sources,"2021.csv"),
    skiprows=range(1389),
    nrows=17,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

# Filter out rows where 'Income' column is not NA
income_data2016 = income_data2016[income_data2016["Income"].notna()]
income_data2021 = income_data2021[income_data2021["Income"].notna()]

# change value of one of income ranges
income_data2016.loc[
    income_data2016["Income"].str.contains("Total - Total income groups"), "Income"
] = "Total Income Groups"
income_data2021.loc[
    income_data2021["Income"].str.contains("Total - Total Income groups"), "Income"
] = "Total Income Groups"

# Removing the spaces before each income
income_data2016["Income"] = income_data2016["Income"].str.strip()
income_data2021["Income"] = income_data2021["Income"].str.strip()

# Melt the DataFrame to get 'Ward_ID', 'Income', and 'Population' columns
income_data2016["Year"] = 2016
income_data2021["Year"] = 2021
income_data2016 = pd.melt(
    income_data2016,
    id_vars=["Income", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
income_data2021 = pd.melt(
    income_data2021,
    id_vars=["Income", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
columns_order = [col for col in income_data2016.columns if col != "Year"] + ["Year"]
income_data2016 = income_data2016[columns_order]
income_data2021 = income_data2021[columns_order]

# merge the datasets
IncomeDimension = pd.concat([income_data2016, income_data2021], ignore_index=True)

# change data types
IncomeDimension["Population"] = IncomeDimension["Population"].astype(int)
IncomeDimension["Ward_ID"] = IncomeDimension["Ward_ID"].astype(str)
IncomeDimension["Income"] = IncomeDimension["Income"].astype(str)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/serenaiyoha/Documents/CSI4142/CSI4142-Gentrification-Project/Phase4/../Data Sources/2021.csv'

In [24]:
# ETHNOCULTURAL DIMENSION
ethnicity_data2016 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2016.csv"),
    skiprows=range(851),
    nrows=280,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
ethnicity_data2021 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2021.csv"),
    skiprows=range(1013),
    nrows=252,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

# Filter out rows where 'Ethnocultural' column is not NA
ethnicity_data2016 = ethnicity_data2016[ethnicity_data2016["Ethnoculture"].notna()]
ethnicity_data2021 = ethnicity_data2021[ethnicity_data2021["Ethnoculture"].notna()]

# change value of one of ethnic ranges
ethnicity_data2016.loc[
    ethnicity_data2016["Ethnoculture"].str.contains("Total - Ethnic origin"),
    "Ethnoculture",
] = "Total Ethnic Origin"
ethnicity_data2021.loc[
    ethnicity_data2021["Ethnoculture"].str.contains("Total - Ethnic origin"),
    "Ethnoculture",
] = "Total Ethnic Origin"

# Removing the spaces before each ethnicity
ethnicity_data2016["Ethnoculture"] = ethnicity_data2016["Ethnoculture"].str.strip()
ethnicity_data2021["Ethnoculture"] = ethnicity_data2021["Ethnoculture"].str.strip()

# Melt the DataFrame to get 'Ward_ID', 'Ethnocultural', and 'Population' columns
ethnicity_data2016["Year"] = 2016
ethnicity_data2021["Year"] = 2021
ethnicity_data2016 = pd.melt(
    ethnicity_data2016,
    id_vars=["Ethnoculture", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
ethnicity_data2021 = pd.melt(
    ethnicity_data2021,
    id_vars=["Ethnoculture", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
columns_order = [col for col in ethnicity_data2016.columns if col != "Year"] + ["Year"]
ethnicity_data2016 = ethnicity_data2016[columns_order]
ethnicity_data2021 = ethnicity_data2021[columns_order]

# merge the datasets
EthnoculturalDimension = pd.concat(
    [ethnicity_data2016, ethnicity_data2021], ignore_index=True
)

# change data types
EthnoculturalDimension["Population"] = EthnoculturalDimension["Population"].astype(int)
EthnoculturalDimension["Ward_ID"] = EthnoculturalDimension["Ward_ID"].astype(str)
EthnoculturalDimension["Ethnoculture"] = EthnoculturalDimension["Ethnoculture"].astype(str)

In [21]:
# HOUSEHOLD DIMENSION
household_data2016 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2016.csv"),
    skiprows=range(98),
    nrows=9,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
household_data2021 = pd.read_csv(
    os.path.join(data_sources,"WardProfile2021.csv"),
    skiprows=range(108),
    nrows=9,
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)

# Filter out rows where 'Household' column is not NA
household_data2016 = household_data2016[household_data2016["Household"].notna()]
household_data2021 = household_data2021[household_data2021["Household"].notna()]

# change value of one of income ranges
household_data2016.loc[
    household_data2016["Household"].str.contains(
        "Total - Private households by household"
    ),
    "Household",
] = "Total Household"
household_data2021.loc[
    household_data2021["Household"].str.contains(
        "Total - Private households by household"
    ),
    "Household",
] = "Total Household"


# Removing the spaces before each household type
household_data2016["Household"] = household_data2016["Household"].str.strip()
household_data2021["Household"] = household_data2021["Household"].str.strip()

# Melt the DataFrame to get 'Ward_ID', 'Household', and 'Population' columns
household_data2016["Year"] = 2016
household_data2021["Year"] = 2021
household_data2016 = pd.melt(
    household_data2016,
    id_vars=["Household", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
household_data2021 = pd.melt(
    household_data2021,
    id_vars=["Household", "Year"],
    var_name="Ward_ID",
    value_name="Population",
)
columns_order = [col for col in household_data2016.columns if col != "Year"] + ["Year"]
household_data2016 = household_data2016[columns_order]
household_data2021 = household_data2021[columns_order]

# Merge the datasets
HouseholdDimension = pd.concat(
    [household_data2016, household_data2021], ignore_index=True
)


# change data types
HouseholdDimension["Ward_ID"] = HouseholdDimension["Ward_ID"].astype(str)
HouseholdDimension["Household"] = HouseholdDimension["Household"].astype(str)

# rename column
HouseholdDimension.rename(columns={"Household": "Household_Description"}, inplace=True)

In [23]:
# SHELTER DIMENSION
# 2021
# Load the CSV file
df = pd.read_csv(os.path.join(data_sources,"WardProfile2021.csv"), encoding="ISO-8859-1", low_memory=False)

# Define the indices
tenant_costs_index = 1373
owner_costs_index = 1377

tenant_percent_spending = 1374
owner_percent_spending = 1378

tenant_households = 1372
owner_households = 1376

# create the Ward_IDs list based on the column headers
ward_ids = df.columns[1:]

tenant_data = []
owner_data = []

for i, ward_id in enumerate(ward_ids):
    tenant_data.append(
        {
            "Household_Type": "Tenant",
            "Ward_ID": ward_id,
            "Total_Households": df.iloc[tenant_households, i + 1],
            "Average_Monthly_Shelter_Costs": df.iloc[tenant_costs_index, i + 1],
            "Percent_Spending_30_Percent_Or_More_On_Shelter": df.iloc[
                tenant_percent_spending, i + 1
            ],
        }
    )
    owner_data.append(
        {
            "Household_Type": "Owner",
            "Ward_ID": ward_id,
            "Total_Households": df.iloc[owner_households, i + 1],
            "Average_Monthly_Shelter_Costs": df.iloc[owner_costs_index, i + 1],
            "Percent_Spending_30_Percent_Or_More_On_Shelter": df.iloc[
                owner_percent_spending, i + 1
            ],
        }
    )

# Combine the tenant and owner data
combined_data = tenant_data + owner_data
# print(combined_data[:20])
# 2016
# Load the CSV file
df1 = pd.read_csv(os.path.join(data_sources,"WardProfile2021.csv"), encoding="ISO-8859-1", low_memory=False)

# Define the indices
tenant_costs_index = 1239
owner_costs_index = 1243

tenant_percent_spending = 1240
owner_percent_spending = 1244

tenant_households = 1238
owner_households = 1242

# Create the Ward_IDs list based on the column headers
ward_ids = df1.columns[1:]

tenant_data = []
owner_data = []

for i, ward_id in enumerate(ward_ids):
    tenant_data.append(
        {
            "Household_Type": "Tenant",
            "Ward_ID": ward_id,
            "Total_Households": df1.iloc[tenant_households, i + 1],
            "Average_Monthly_Shelter_Costs": df1.iloc[tenant_costs_index, i + 1],
            "Percent_Spending_30_Percent_Or_More_On_Shelter": df1.iloc[
                tenant_percent_spending, i + 1
            ],
        }
    )
    owner_data.append(
        {
            "Household_Type": "Owner",
            "Ward_ID": ward_id,
            "Total_Households": df1.iloc[owner_households, i + 1],
            "Average_Monthly_Shelter_Costs": df1.iloc[owner_costs_index, i + 1],
            "Percent_Spending_30_Percent_Or_More_On_Shelter": df1.iloc[
                owner_percent_spending, i + 1
            ],
        }
    )
# Combine the tenant and owner data
combined_data2 = tenant_data + owner_data
# print(combined_data2[:20])
# create the DataFrame
ShelterDimension2016 = pd.DataFrame(combined_data2)
ShelterDimension2021 = pd.DataFrame(combined_data)

ShelterDimension2016["Year"] = 2016
ShelterDimension2021["Year"] = 2021

# merge the datasets
ShelterDimension = pd.concat(
    [ShelterDimension2016, ShelterDimension2021], ignore_index=True
)

# clean data and change data types
ShelterDimension["Average_Monthly_Shelter_Costs"] = (
    ShelterDimension["Average_Monthly_Shelter_Costs"]
    .str.replace("$", "")
    .str.replace(",", "")
    .astype(int)
)
ShelterDimension["Percent_Spending_30_Percent_Or_More_On_Shelter"] = (
    ShelterDimension["Percent_Spending_30_Percent_Or_More_On_Shelter"]
    .str.replace("%", "")
    .astype(float)
)
ShelterDimension["Total_Households"] = ShelterDimension["Total_Households"].astype(int)
ShelterDimension["Household_Type"] = ShelterDimension["Household_Type"].astype(str)
ShelterDimension["Ward_ID"] = ShelterDimension["Ward_ID"].astype(str)


  ShelterDimension["Average_Monthly_Shelter_Costs"]
