In [506]:
import pandas as pd
from sqlalchemy import create_engine, text

In [507]:
# Extracting building permit data
buildingPermits = pd.read_csv('clearedpermits2016.csv',low_memory=False)
buildingPermitsActive = pd.read_csv('building-permits-active-permits.csv', low_memory=False)
buildingPermits2017 = pd.read_csv('Cleared Building Permits since 2017.csv', low_memory=False)

# Extracting demolition data
demolition = pd.read_csv('DemolitionDimension.csv',low_memory=False)

In [508]:
#standardizing dataframes to prepare for merge
buildingPermitsActive.drop(['_id', 'REVISION_NUM'], axis=1, inplace=True)
buildingPermits.drop(['REVISION_NUM'], axis=1, inplace=True)
buildingPermits2017.drop(['_id', 'REVISION_NUM'], axis=1, inplace=True)

buildingPermitsActive['COMPLETED_DATE'] = pd.to_datetime(buildingPermitsActive['COMPLETED_DATE'])
buildingPermits['COMPLETED_DATE'] = pd.to_datetime(buildingPermits['COMPLETED_DATE'])
buildingPermits2017['COMPLETED_DATE'] = pd.to_datetime(buildingPermits2017['COMPLETED_DATE'])

#merge the datasets
combined_buildingPermits = pd.concat([buildingPermits, buildingPermitsActive, buildingPermits2017], ignore_index=True)

In [509]:
# Print a Series with the data type of each column
print(combined_buildingPermits.dtypes)

PERMIT_NUM                                object
PERMIT_TYPE                               object
STRUCTURE_TYPE                            object
WORK                                      object
STREET_NUM                                object
STREET_NAME                               object
STREET_TYPE                               object
STREET_DIRECTION                          object
POSTAL                                    object
GEO_ID                                   float64
WARD_GRID                                 object
APPLICATION_DATE                          object
ISSUED_DATE                               object
COMPLETED_DATE                    datetime64[ns]
STATUS                                    object
DESCRIPTION                               object
CURRENT_USE                               object
PROPOSED_USE                              object
DWELLING_UNITS_CREATED                   float64
DWELLING_UNITS_LOST                      float64
EST_CONST_COST      

In [510]:
#dropping columns that are not needed
columns_to_drop = ['CURRENT_USE', 'PROPOSED_USE', 'DWELLING_UNITS_CREATED', 'DWELLING_UNITS_LOST']
start_col = 'ASSEMBLY'
end_col = 'BUILDER_NAME'

start_col_index = combined_buildingPermits.columns.get_loc(start_col)
end_col_index = combined_buildingPermits.columns.get_loc(end_col) + 1  # +1 to include 'BUILDER_NAME'

# extend the list with the range of columns
columns_to_drop.extend(combined_buildingPermits.columns[start_col_index:end_col_index])

# drop the columns
combined_buildingPermits.drop(columns=columns_to_drop, axis=1, inplace=True)


In [511]:
# Print a Series with the data type of each column
print(combined_buildingPermits.dtypes)

PERMIT_NUM                  object
PERMIT_TYPE                 object
STRUCTURE_TYPE              object
WORK                        object
STREET_NUM                  object
STREET_NAME                 object
STREET_TYPE                 object
STREET_DIRECTION            object
POSTAL                      object
GEO_ID                     float64
WARD_GRID                   object
APPLICATION_DATE            object
ISSUED_DATE                 object
COMPLETED_DATE      datetime64[ns]
STATUS                      object
DESCRIPTION                 object
EST_CONST_COST              object
dtype: object


In [512]:
# analyzing which structure types, statuses, and work types are useful in analysis

print("STRUCTURE TYPES:")
unique_structure_types = combined_buildingPermits['STRUCTURE_TYPE'].unique()
print(unique_structure_types)

print("WORK TYPES:")
unique_work_types = combined_buildingPermits['WORK'].unique()
print(unique_work_types)



STRUCTURE TYPES:
['SFD - Detached' 'Office' 'SFD: P/D/F/E/R Drains' 'SFD - Semi-Detached'
 'Group A & B' 'Restaurant Greater Than 30 Seats'
 'Industrial Warehouse/Hazardous Building' 'Commercial/Industrial Use'
 ' Mixed Comm/Res ' 'Electromagnetic Locks' 'Apartment Building'
 'HVAC Alt. add on Sys. or Ductwork Alt.' 'Nursing Home Facility'
 'Multiple Unit Building' 'Home for the Aged' 'Retaining Wall'
 'SFD Garages' 'Retail Store' 'Open Public Swimming Pool'
 'Mixed Use/Res w Non Res' 'Piping(all other bldgs):Outside Water..'
 'Elementary School' 'Secondary School'
 'P/D/F/E/R Drains: all other buildings' 'Non-Residential Building'
 'Hospital' 'Hair, Barber and Other Salon'
 'Parking Garage Repairs (all other)' 'SFD - Townhouse'
 'Restaurant 30 Seats or Less ' 'Grandstand' 'Residential Porches'
 'Residential Decks' 'Repair Garage'
 'College/Trade/Tech School/Training Cent.' 'Place of Worship'
 'Converted House' 'Industrial Manufacturing Plant' 'Duplex/Semi-Detached'
 'Medical/Dental Of

In [513]:
# removing statuses that aren't needed in analysis
statuses_to_remove = [
    "Pending Cancellation",
    "Application Withdrawn",
    "Superseded",
    "Refused",
    "Abandoned",
    "Not Accepted",
    "VIOLATION",
    "Work Suspended",
    "Refusal Notice",
    "Cancelled"
]

combined_buildingPermits = combined_buildingPermits[~combined_buildingPermits['STATUS'].isin(statuses_to_remove)]


In [514]:
# dropping STRUCTURE_TYPES not useful to analysis
# STRUCTURE TYPE
not_included = [
    'Office', 'Industrial Processing Plant', 'Gas Station/Car Wash/Repair Garage', 'Personal Service Shop', 'Industrial Manufacturing Plant', 'SFD: P/D/F/E/R Drains', 'Industrial Warehouse/Hazardous Building', 
    'Electromagnetic Locks', 'HVAC Alt. add on Sys. or Ductwork Alt.', 'Nursing Home Facility',
    'Home for the Aged', 'Retaining Wall', 'Industrial', 'SFD Garages', 'Piping(all other bldgs):Outside Water..',
    'P/D/F/E/R Drains: all other buildings', 'Laboratory', 'Water and Sewage Pumping Stations', 'Warehouse', 'Parking Garage Repairs (all other)', 'Grandstand',
    'Residential Porches', 'Residential Decks', 'Repair Garage', 'Converted House', 'Industrial - Shell',
    'Laundromat', 'Third Party', 'Storage Room', 'Convent/Monastery', 'Police Station with Detention', 'Manufacturing - MMPF ',
    'Manufacturing - MMPF', 'Undertaking Premises', 'Crematorium/Cemetary Structure', 'Jails/Detention Facility', 
    'Self-Service Storage Building', 'Triplex/Semi-Detached', 'Courtroom', 'Distillery', 'Car Dealership', 'Power Plant',
    'Dry Cleaning/Laundry Plant', 'Printing Plant', 'Dry Cleaning Depot', 'Long Term Care Facility', 'Tree  Declaration Form',
    'Police Station with No Detention', 'Live/Work Unit', 'Unknown', 'Group D & E', 'SFD Access. Structures',
    'Fire Alarms', 'Standpipes', 'Piping(SF) Water Serv., Sanitary/Storm', 'Fireplaces', 'Registration and Discharge  of Unsafe Order', 'Farm Building',
 'MGO Memo To ', 'HVAC Alt. Boiler/Furn Rplmt. or A/C', 'Exterior Storage Tank', 'Canopy w/o enclosure', 'Sprinklers',
    'Underpinning', 'Spray Painting Operation', 'Group F (< 230 m2)', 'Piping(SF):Repair/Rplmt/Add. Pool Drain',
    'HVAC: Special Ventilation System', 'Basements - Finishing - in Dwellings/TH', 'Mixed Comm/Inst./Res',
    'SFD/TH HVAC', 'Balcony Repairs', 'Repairs/Re-cladding Walls, Re-roofing', 'Temporary Buildings',
    'Trailers', 'Parking Garage Repairs (slab)', 'Sales Pavilions', 'Mixed Industrial Use',
    'SFD/TH Heat. Vent. only', 'Commercial/Institutional Use', 'Industrial/Institutional Use', 'Tent',
    'Mixed Assembly Use', 'Window Replacements (except SFD)', 'Communication Tower', 'SFD/TH Boiler/Furn. Replac.',
    'Residential Carports', 'Group F (> 230m2)', 'Portable Classroom',
    'Re-roofing with structural work', 'Multiple-Use Building', 'Mixed Institutional', 'Fire Doors Retrofit', 'Piping(all other bldgs):Inside San/Storm', 'Exhibition Hall(With Sales)', 'Exhibition Hall(Without Sales)', 'Mixed Inst/Res', 'Other School',
    'House', 'Mixed Comm/Inst/Ind/Res', 'Air Supported Stuctures', 'Home Office', 'Mixed Ind/Comm/Res',
    'Industrial Chemical Plant', 'Municipal Shelter', 'Penthouse/Mechanical Room', 'Lecture Hall', 'Subdivision', 'Public Health', 'ZR - Licensing LPR Notice',
    'HAP Folder', "ZR - Examiner's Notice", 'Tree Declaration Form', 'Municipal Road Damage Deposit Form',
    'Demolition Permit Application Checklist',
    'Parks Levy Appraisal Request', 'Registration and Discharge of Unsafe Order', 'ZR Folder - Planning Source',
    'MGO Memo To', 'Search Titles', 'HP Property DM Folder', 'Sump Pump Program', 'Supermarket',
    'Laneway / Rear Yard Suite', 'Toronto Fire Notifications', 'Laneway / Rear Yard Suites',
    'Television Studio(with audience)', 'HVAC for other Group C', 'Backflow Prevention Devices',
    'Manholes, Catch Basin, Interceptors, Smp', 'SFD/TH A/C Unit Addition', 'Tent (permits for certified)',
    'Balcony Guards'
]

combined_buildingPermits = combined_buildingPermits[~combined_buildingPermits['STRUCTURE_TYPE'].isin(not_included)]


In [515]:
# removing work types not needed in analysis
no_work_types = [
    'Install/Alter Plumbing - only', 'Demolition', 'Partial Demolition',
    'Partial Permit - Shoring', ' Fixtures/Roof Drains: SFD', 'Addition(s) ', 'Porch', 'Garage', 'Deck',
    'Install/Alter HVAC - only', 'Alter:  Add on /Ductwork', 'Balcony/Guard Repairs',
    'Building Permit Related(PS)', 'Window Replacement', 'Re-Roofing/Re-Cladding', 'Special Ventilation System',
    'Other Proposal', 'Carport', 'Change of Use', 'HVAC: Groups  D & E', ' Fixtures/Roof Drains: Other ', 'Finishing Basements',
    'Fixtures/Roof Drains: Other', 'Certified Portables', 'Non-Certified Portables', 'MGO 565 Remediation',
    'Multiple Projects', 'New Laneway / Rear Yard Suite', ' Backflow Prevention Devices (Water only)',
    'Fixtures/Roof Drains: SFD', 'Septic System:  Sewage System',
    'Communication Tower', 'City Planning',
    'Building Permit Related(MS)',
    'Partial Permit - Foundation',
    'Electromagnetic Locks',
    'Alter: Add on /Ductwork',
    'Other(BA)',
    'HVAC: Parking Garages',
    'Canopy',
    'Other(SR)',
    'Fire Damage',
    'Walk-Out Stair',
    'HVAC: Groups A & B',
    'Piping: SFD/Semi',
    'Fire Alarm',
    'Air Conditioning: SFD/Semi/TH',
    'Garage Repair/Reconstruction',
    'Piping: Other Buildings',
    'Sprinklers',
    'Manholes/Catch Basins/Sumps/Interceptors',
    'Underpinning',
    'HVAC: Groups D & E',
    'Partial Permit - Structural Framing',
    'Certified Tents',
    'Other(DS)',
    'Building Permit Related (DR)',
    'Heat/Ventilation: SFD/Semi/TH',
    'Canopy w/o Enclosure',
    'Sales Pavilions',
    'Interior Demolition',
    'Site Service',
    'Septic System: Sewage System',
    'Other(PS)',
    'Boiler/Furnace: SFD/Semi/TH',
    'Other(FS)',
    'Other(MS)',
    'Inside and Outside Drains',
    'Other Temporary Tents',
    'Building Permit Related(FS)',
    'Other(TS)',
    'Alter: Boiler/Furnace/AC Replacement',
    'Shoring',
    'Backflow Prevention Devices (Water only)',
    'Temporary Structures',
    'HVAC: SFD/Semi/TH',
    'Retaining Wall',
    'HVAC: Other Group C Buildings',
    'Emergency Lighting',
    'Solar Domestic Hot Water (Res)',
    'Sign Building Permit Related',
    'Crane Runway',
    'Alternative Solution',
    'Solar Collector',
    'Standpipes',
    'Party Wall Admin Permits',
    'Back Water Valve (Sewer only)',
    'HVAC: Group F > 230 Sq M',
    'Pool Fence Enclosure',
    'Fire Doors Retrofit',
    'Exterior Tank & Support',
    'Fireplace/Wood Stoves',
    'Trailers',
    'HVAC: Group F up to 230 Sq M',
    'Material Evaluation',
    'Unknown',
    'HVAC',
    'Install/Alter Plumbing & HVAC only',
    'Accessory Structure',
    'Partial Permit - Other',
    'Addition',
    'Satellite Dish',
    'Pedestrian Bridge',
    'Holding Tank: Sewage System',
    'HVAC: Laboratories',
    'Ceilings (Add or Replace)',
    'Other'
]

combined_buildingPermits = combined_buildingPermits[~combined_buildingPermits['WORK'].isin(no_work_types)]


In [516]:
print("STRUCTURE TYPES:")
unique_structure_types = combined_buildingPermits['STRUCTURE_TYPE'].unique()
print(unique_structure_types)

STRUCTURE TYPES:
['SFD - Detached' 'SFD - Semi-Detached' 'Group A & B'
 'Restaurant Greater Than 30 Seats' ' Mixed Comm/Res '
 'Commercial/Industrial Use' 'Multiple Unit Building'
 'Open Public Swimming Pool' 'Mixed Use/Res w Non Res' 'Elementary School'
 'Secondary School' 'Hospital' 'Apartment Building'
 'Hair, Barber and Other Salon' 'SFD - Townhouse' 'Place of Worship'
 'Retail Store' 'Medical/Dental Office' 'Restaurant 30 Seats or Less '
 'Parking Garage' 'Bank' 'College/Trade/Tech School/Training Cent.'
 'Multiple Use/Non Residential' nan 'Museum' 'Other' 'Motel/Hotel'
 'Performing Arts Centre' 'Fitness Centre' 'Club'
 'Rental and Service Establishment' 'Community Hall' 'Stacked Townhouses'
 'Recreational' 'Motion Picture Theatre' 'Library' 'University' 'Triplex'
 'Transit Station,Subway, Bus Terminal'
 'Child Care Facility/DayCare Centre' 'Duplex/Semi-Detached'
 '3+ Unit - Semi-detached' 'Apartment Hotel' 'Retail Mall/Plaza'
 'Art Gallery' 'Duplex ' '2 Unit - Detached' '2 Unit -

In [517]:
Amenities_Structure_Types_Sample = [
    'Performing Arts Centre',
    'Fitness Centre',
    'Club',
    'Motel/Hotel'
    'Library',
    'Art Gallery',
    'Hair, Barber and Other Salon',
    'Restaurant 30 Seats or Less',
    'Museum',
    'Retail Mall/Plaza',
    'Indoor Swimming Pool',
    'Television Studio(no audience)',
    'Gymnasium',
    'Amusement Park Structure',
    'Dance Hall',
    'Auditorium',
    'Stadium',
    'Radio Station',
    'Recreational'
]


Transporation_Structure_Types_Sample = ['Transit Station,Subway, Bus Terminal']

Social_Development = [
    'Elementary School', 
    'College/Trade/Tech School/Training Cent.', 
    'Place of Worship', 
    'University', 
    'Secondary School', 
    'Hospital', 
    'Student Residence', 
    'Child Care Facility/DayCare Centre'
]

real_estate_structure_types = [
    'SFD - Detached',
    'SFD - Semi-Detached',
    'Group A & B',
    'Mixed Comm/Res',
    'Multiple Unit Building',
    'Mixed Use/Res w Non Res',
    'Apartment Building',
    'Parking Garage',
    'Multiple Use/Non Residential',
    'Stacked Townhouses',
    'Triplex',
    'Duplex/Semi-Detached',
    '3+ Unit - Semi-detached',
    'Apartment Hotel',
    'Duplex',
    '2 Unit - Detached',
    '2 Unit - Semi-detached',
    'Boarding/Lodging House',
    '2 Unit - Townhouse',
    '3+ Unit - Detached',
    'Non-Residential Building',
    'Other (New Housing)',
    '3+ Unit - Townhouse'
]




In [518]:
print("WORK TYPES:")
unique_work_types = combined_buildingPermits['WORK'].unique()
print(unique_work_types)


WORK TYPES:
['Addition to Existing Building'
 'Addition/Alteration to Existing Building' 'New Building'
 'Alteration to Existing Building' 'Interior Alterations'
 'Accessory Building(s)' nan 'Second Suite (New)' 'New Building-Certified'
 'New Building - Lead' 'MGO Remediation' 'New Building Certified - Lead'
 'Green Roof' 'New Building - By Renovation']


In [519]:

# changing date types
combined_buildingPermits['ISSUED_DATE'] = pd.to_datetime(combined_buildingPermits['ISSUED_DATE'])
combined_buildingPermits['APPLICATION_DATE'] = pd.to_datetime(combined_buildingPermits['APPLICATION_DATE'])

# changing int types
combined_buildingPermits['EST_CONST_COST'] = combined_buildingPermits['EST_CONST_COST'].str.replace(',', '')  # Remove commas
combined_buildingPermits['EST_CONST_COST'] = pd.to_numeric(combined_buildingPermits['EST_CONST_COST'], errors='coerce')
combined_buildingPermits['EST_CONST_COST'] = combined_buildingPermits['EST_CONST_COST'].fillna(0).astype(int)
combined_buildingPermits['GEO_ID'] = pd.to_numeric(combined_buildingPermits['GEO_ID'], errors='coerce').astype('Int64')

# changing string types
combined_buildingPermits['PERMIT_NUM'] = combined_buildingPermits['PERMIT_NUM'].astype(str)
combined_buildingPermits['PERMIT_TYPE'] = combined_buildingPermits['PERMIT_TYPE'].astype(str)
combined_buildingPermits['WORK'] = combined_buildingPermits['WORK'].astype(str)
combined_buildingPermits['STREET_NAME'] = combined_buildingPermits['STREET_NAME'].astype(str)
combined_buildingPermits['STREET_TYPE'] = combined_buildingPermits['STREET_TYPE'].astype(str)
combined_buildingPermits['STREET_NUM'] = combined_buildingPermits['STREET_NUM'].astype(str)
combined_buildingPermits['STREET_DIRECTION'] = combined_buildingPermits['STREET_DIRECTION'].astype(str)
combined_buildingPermits['POSTAL'] = combined_buildingPermits['POSTAL'].astype(str)
combined_buildingPermits['WARD_GRID'] = combined_buildingPermits['WARD_GRID'].astype(str)
combined_buildingPermits['DESCRIPTION'] = combined_buildingPermits['DESCRIPTION'].astype(str)
combined_buildingPermits['STATUS'] = combined_buildingPermits['STATUS'].astype(str)

# change format of column names
combined_buildingPermits.columns = ['_'.join(word.capitalize() for word in col.split('_')) for col in combined_buildingPermits.columns]


In [520]:
# handling null values
columns_with_null = combined_buildingPermits.isnull().any()
print(columns_with_null)

Permit_Num          False
Permit_Type         False
Structure_Type       True
Work                False
Street_Num          False
Street_Name         False
Street_Type         False
Street_Direction    False
Postal              False
Geo_Id               True
Ward_Grid           False
Application_Date    False
Issued_Date          True
Completed_Date       True
Status              False
Description         False
Est_Const_Cost      False
dtype: bool


In [521]:
# dont include any records where Structure_Type is null
combined_buildingPermits = combined_buildingPermits.dropna(subset=['Structure_Type'])

In [522]:
# dont include any records where Application_Date or Issued_Date is null
combined_buildingPermits = combined_buildingPermits.dropna(subset=['Application_Date'])
combined_buildingPermits = combined_buildingPermits.dropna(subset=['Issued_Date'])

In [523]:
# handling duplicate values for Permit_Num
print("number of rows before removing duplicates:", len(combined_buildingPermits))

# Calculate the number of duplicate Permit_Num values 
number_of_duplicates = combined_buildingPermits['Permit_Num'].duplicated().sum()
print("duplicate permit ids:", number_of_duplicates)

# Remove duplicates by keeping only the first occurrence of each 'Permit_Num'
combined_buildingPermits = combined_buildingPermits.drop_duplicates(subset='Permit_Num', keep='first')

print("number of rows after removing duplicates:", len(combined_buildingPermits))

number of rows before removing duplicates: 90480
duplicate permit ids: 16710
number of rows after removing duplicates: 73770


In [524]:
# filtering to find sample data that paints a more concentrated image for gentrification analysis

amenities_BuildingPermits = combined_buildingPermits[combined_buildingPermits['Structure_Type'].isin(Amenities_Structure_Types_Sample)&
    (combined_buildingPermits['Est_Const_Cost'] > 500)]
print(len(amenities_BuildingPermits))

transporation_BuildingPermits = combined_buildingPermits[combined_buildingPermits['Structure_Type'].isin(Transporation_Structure_Types_Sample)&
    (combined_buildingPermits['Est_Const_Cost'] > 500)]
print(len(transporation_BuildingPermits))

social_development_BuildingPermits = combined_buildingPermits[combined_buildingPermits['Structure_Type'].isin(Social_Development)&
    (combined_buildingPermits['Est_Const_Cost'] > 500)]
print(len(social_development_BuildingPermits))

realestate_BuildingPermits = combined_buildingPermits[
    combined_buildingPermits['Structure_Type'].isin(real_estate_structure_types) &
    (combined_buildingPermits['Est_Const_Cost'] > 500000)
]
print(len(realestate_BuildingPermits))

710
379
3026
7782


In [525]:
# exporting samples to csv

# # Export amenities_BuildingPermits to CSV
# amenities_BuildingPermits.to_csv('amenities_sample.csv', index=False)

# # Export transporation_BuildingPermits to CSV
# transporation_BuildingPermits.to_csv('transporation_sample.csv', index=False)

# # Export social_development_BuildingPermits to CSV
# social_development_BuildingPermits.to_csv('social_development_sample.csv', index=False)

# # Export realestate_BuildingPermits to CSV
# realestate_BuildingPermits.to_csv('realestate_sample.csv', index=False)

combined_buildingPermits = pd.concat([amenities_BuildingPermits, transporation_BuildingPermits, social_development_BuildingPermits, realestate_BuildingPermits], ignore_index=True)
print(len(combined_buildingPermits))

11897


In [526]:
# adding Ward_ID column to populate later once geospatial analysis is complete
combined_buildingPermits['Ward_ID'] = None

In [527]:
# generating surrogate keys
combined_buildingPermits['Permit_Key'] = range(1, len(combined_buildingPermits) + 1)

# Move 'Permit_Key' to the first position
cols = ['Permit_Key'] + [col for col in combined_buildingPermits.columns if col != 'Permit_Key']

# Reorder the DataFrame
combined_buildingPermits = combined_buildingPermits[cols]

In [528]:
combined_buildingPermits[:10]

Unnamed: 0,Permit_Key,Permit_Num,Permit_Type,Structure_Type,Work,Street_Num,Street_Name,Street_Type,Street_Direction,Postal,Geo_Id,Ward_Grid,Application_Date,Issued_Date,Completed_Date,Status,Description,Est_Const_Cost,Ward_ID
0,1,00 330311 CMB,Non-Residential Building Permit,"Hair, Barber and Other Salon",Addition/Alteration to Existing Building,3360,YONGE,ST,,M4N,7793316.0,N1603,2000-08-22,2002-06-13,2016-10-27,Closed,Build second storey addition at rear. See prev...,10000,
1,2,02 151529 BLD,Building Additions/Alterations,Museum,Interior Alterations,100,GARRISON,RD,,M5V,807841.0,S1909,2002-06-27,2002-06-27,2016-07-25,Closed,Interior alterations,60000,
2,3,04 111348 BLD,Building Additions/Alterations,Performing Arts Centre,Interior Alterations,6,NOBLE,ST,,M6K,8133590.0,S1405,2004-02-24,2004-05-27,2016-01-19,Closed,Interior Alterations to Industrial space to cr...,650000,
3,4,04 198435 BLD,Building Additions/Alterations,Fitness Centre,Interior Alterations,5485-5487,DUNDAS,ST,W,M9B,,W0504,2004-12-07,2005-02-02,2016-01-29,Closed,Interior Alterations to convert a vacant unit ...,10000,
4,5,05 115473 BLD,New Building,Club,New Building,2,MUGGS ISLAND,PK,,,14635094.0,S2814,2005-03-15,2005-06-10,2016-06-08,Closed,New clubhouse for yacht club,1400000,
5,6,06 195998 BLD,Building Additions/Alterations,Fitness Centre,Interior Alterations,3401,DUFFERIN,ST,,M6A,509769.0,N1502,2006-12-05,2006-12-21,2016-12-30,Closed,Premier Fitness - washroom renovation,35000,
6,7,06 197901 BLD,Building Additions/Alterations,Performing Arts Centre,Interior Alterations,6,NOBLE,ST,,M6K,8133590.0,S1405,2006-12-14,2007-03-12,2016-01-19,Closed,Interior alterations to dance studio. See BLD...,1000,
7,8,07 160282 BLD,Building Additions/Alterations,Recreational,Interior Alterations,50,WABASH,AVE,,M6R,11226744.0,S1403,2007-04-18,2007-05-02,2016-01-22,Closed,Interior alterations to existing 2 sty industr...,300000,
8,9,09 124927 BLD,Building Additions/Alterations,Club,Interior Alterations,100,INDIAN,RD,,M6R,8006978.0,S1404,2009-04-08,2009-04-28,2016-08-03,Closed,Interior alterations to existing basemen washr...,40000,
9,10,10 271932 BLD,Building Additions/Alterations,"Hair, Barber and Other Salon",Interior Alterations,180,QUEENS PLATE,DR,,M9W,9655496.0,W0202,2010-10-05,2011-01-19,2016-12-20,Closed,"Interior alteration for a nail salon, ""Rexdale...",50000,


In [529]:
# generating surrogate keys
demolition['Demolition_Key'] = demolition.index + 1

# Move 'Permit_Key' to the first position
cols = ['Demolition_Key'] + [col for col in demolition.columns if col != 'Demolition_Key']

# Reorder the DataFrame
demolition = demolition[cols]

In [530]:

# changing date types
demolition['City Council Approval Date'] = pd.to_datetime(demolition['City Council Approval Date'])

# changing string types
demolition['IBMS Address'] = demolition['IBMS Address'].astype(str)
demolition['Address of Existing Rental Building'] = demolition['Address of Existing Rental Building'].astype(str)
demolition['RH File Number'] = demolition['RH File Number'].astype(str)
demolition['Type'] = demolition['Type'].astype(str)

# changing int types
columns = [
    'Total Rental Homes for Demolition', 
    'Affordable Rental Homes for Demolition', 
    'Mid-Range Rental Homes for Demolition', 
    'High-End Rental Homes for Demolition', 
    'Total Rental Homes Replaced', 
    'Affordable Rental Homes Replaced', 
    'Mid-Range Rental Homes Replaced', 
    'High-End Rental Homes Replaced'
]

for column in columns:
    demolition[column] = pd.to_numeric(demolition[column], errors='coerce').fillna(0).astype('Int64')



In [531]:
demolition[:10]

Unnamed: 0,Demolition_Key,IBMS Address,Address of Existing Rental Building,RH File Number,City Council Approval Date,Type,Total Rental Homes for Demolition,Affordable Rental Homes for Demolition,Mid-Range Rental Homes for Demolition,High-End Rental Homes for Demolition,Total Rental Homes Replaced,Affordable Rental Homes Replaced,Mid-Range Rental Homes Replaced,High-End Rental Homes Replaced
0,1,75 MUTUAL ST,"75, 77, 83 MUTUAL ST",14 183555 STE 27 RH,2017-01-31,Demolition - 6 Rental Units or More,22,0,0,0,22,21,1,0
1,2,1996 BATHURST ST,"1996, 1998 and 2000 Bathurst Street",15 270570 STE 21 RH,2017-03-09,Demolition - 6 Rental Units or More,62,61,1,0,62,61,1,0
2,3,125 PARLIAMENT ST,125 Parliament Street and 50 Power Street,13 181698 STE 28 RH,2017-05-24,Demolition - 6 Rental Units or More,15,12,3,0,15,12,3,0
3,4,480 YONGE ST,"3 Grosvenor Street (492-494 Yonge Street), 484...",14 267359 STE 27 RH,2017-05-24,Demolition - 6 Rental Units or More,26,15,6,5,26,15,6,5
4,5,30 ERSKINE AVE,30 Erskine Avenue,13 116882 NNY 25 RH,2017-07-04,Demolition - 6 Rental Units or More,6,0,6,0,6,0,6,0
5,6,59 MUTUAL ST,"59, 63, 65, 67, 69 and 71 Mutual Street",16 101809 STE 27 RH,2017-07-04,Demolition - 6 Rental Units or More,12,7,4,1,12,7,4,1
6,7,703 SOUDAN AVE,"1674 Bayview Avenue, 703 and 707 Soudan Avenue...",14 227490 STE 22 RH,2017-07-04,Demolition - 6 Rental Units or More,38,14,21,3,38,14,21,3
7,8,40 MOCCASIN TRL,"50 and 60 Green Belt Drive, and 40 Moccasin Trail",13 173075 NNY 34 RH,2017-07-04,Demolition - 6 Rental Units or More,83,59,24,0,67,43,24,0
8,9,2525 BATHURST ST,2525 Bathurst Street,16 152046 NNY 16 RH,2017-10-02,Demolition - 6 Rental Units or More,33,6,26,1,33,6,26,1
9,10,650 KINGSTON RD,650-652 Kingston Road and 2 Main Street,15 215445 STE 32 RH,2017-11-07,Demolition - 6 Rental Units or More,11,7,4,0,11,7,4,0


In [532]:
# determining start and ends for date dimension
earliest_date_completed = combined_buildingPermits['Completed_Date'].min()
latest_date_completed = combined_buildingPermits['Completed_Date'].max()

earliest_date_issued = combined_buildingPermits['Issued_Date'].min()
latest_date_issued = combined_buildingPermits['Issued_Date'].max()

earliest_date_applied = combined_buildingPermits['Application_Date'].min()
latest_date_applied = combined_buildingPermits['Application_Date'].max()

earliest_date_approval = demolition['City Council Approval Date'].min()
latest_date_approval = demolition['City Council Approval Date'].max()

print(f"Earliest Completed Date: {earliest_date_completed}")
print(f"Latest Completed Date: {latest_date_completed}")
print(f"Earliest Issued Date: {earliest_date_issued}")
print(f"Latest Issued Date: {latest_date_issued}")
print(f"Earliest Applied Date: {earliest_date_applied}")
print(f"Latest Applied Date: {latest_date_applied}")
print(f"Earliest Approved Date: {earliest_date_approval}")
print(f"Latest Approved Date: {latest_date_approval}")


Earliest Completed Date: 2016-01-04 00:00:00
Latest Completed Date: 2024-03-07 00:00:00
Earliest Issued Date: 1999-06-10 00:00:00
Latest Issued Date: 2024-03-07 00:00:00
Earliest Applied Date: 1998-08-14 00:00:00
Latest Applied Date: 2024-02-22 00:00:00
Earliest Approved Date: 2017-01-31 00:00:00
Latest Approved Date: 2023-12-13 00:00:00


In [533]:
dates = {
    "Earliest Completed Date",
    "Latest Completed Date",
    "Earliest Issued Date",
    "Latest Issued Date",
    "Earliest Applied Date",
    "Latest Applied Date",
    "Earliest Approved Date",
    "Latest Approved Date"
}

In [534]:
# generate date range with minimum date and maximum date to cover the entire range
start_date = '1998-08-14'
end_date = '2024-03-07'

dates = pd.date_range(start=start_date, end=end_date)

In [535]:
DateDimension = pd.DataFrame(dates, columns=['Date'])

# Extract date parts
DateDimension['Year'] = DateDimension['Date'].dt.year
DateDimension['Month'] = DateDimension['Date'].dt.month
DateDimension['Day'] = DateDimension['Date'].dt.day
DateDimension['Quarter'] = DateDimension['Date'].dt.quarter
DateDimension['WeekOfYear'] = DateDimension['Date'].dt.isocalendar().week
DateDimension['DayOfWeek'] = DateDimension['Date'].dt.dayofweek
DateDimension['DayName'] = DateDimension['Date'].dt.day_name()
DateDimension['IsWeekend'] = DateDimension['DayOfWeek'].isin([5, 6]).astype(int)

In [536]:
# generating surrogate keys
DateDimension['Date_Key'] = DateDimension.index + 1

# Move 'Permit_Key' to the first position
cols = ['Date_Key'] + [col for col in DateDimension.columns if col != 'Date_Key']

# Reorder the DataFrame
DateDimension = DateDimension[cols]

In [537]:
DateDimension[:10]

Unnamed: 0,Date_Key,Date,Year,Month,Day,Quarter,WeekOfYear,DayOfWeek,DayName,IsWeekend
0,1,1998-08-14,1998,8,14,3,33,4,Friday,0
1,2,1998-08-15,1998,8,15,3,33,5,Saturday,1
2,3,1998-08-16,1998,8,16,3,33,6,Sunday,1
3,4,1998-08-17,1998,8,17,3,34,0,Monday,0
4,5,1998-08-18,1998,8,18,3,34,1,Tuesday,0
5,6,1998-08-19,1998,8,19,3,34,2,Wednesday,0
6,7,1998-08-20,1998,8,20,3,34,3,Thursday,0
7,8,1998-08-21,1998,8,21,3,34,4,Friday,0
8,9,1998-08-22,1998,8,22,3,34,5,Saturday,1
9,10,1998-08-23,1998,8,23,3,34,6,Sunday,1


In [538]:
# Create a date to Date_Key mapping
date_to_date_key = pd.Series(DateDimension['Date_Key'].values, index=DateDimension['Date']).to_dict()

# Map each date in combined_buildingPermits to a Date_Key
combined_buildingPermits['Application_Date_Key'] = combined_buildingPermits['Application_Date'].map(date_to_date_key).astype('Int64')
combined_buildingPermits['Issued_Date_Key'] = combined_buildingPermits['Issued_Date'].map(date_to_date_key).astype('Int64')
combined_buildingPermits['Completed_Date_Key'] = combined_buildingPermits['Completed_Date'].map(date_to_date_key).astype('Int64')

# Map Approval_Date in demolition_dimension to a Date_Key
demolition['Approval_Date_Key'] = demolition['City Council Approval Date'].map(date_to_date_key)

In [539]:
development_fact_table = combined_buildingPermits[['Permit_Key', 'Application_Date_Key', 'Issued_Date_Key', 'Completed_Date_Key']].copy()

# Print the first 25 rows of the development_fact_table, ensuring for each Permit_key, there is associated keys for each date field
print(development_fact_table[:20])

    Permit_Key  Application_Date_Key  Issued_Date_Key  Completed_Date_Key
0            1                   740             1400                6650
1            2                  1414             1414                6556
2            3                  2021             2114                6368
3            4                  2308             2365                6378
4            5                  2406             2493                6509
5            6                  3036             3052                6714
6            7                  3045             3133                6368
7            8                  3170             3184                6371
8            9                  3891             3911                6565
9           10                  4436             4542                6704
10          11                  4670             4682                6523
11          12                  4871             4919                6367
12          13                  4996  

In [540]:
# Count and print the number of empty rows for 'Completed_Date_Key'
empty_completed_date_key_count = development_fact_table['Completed_Date_Key'].isna().sum()
print(f"Number of empty rows for 'Completed_Date_Key': {empty_completed_date_key_count}")



Number of empty rows for 'Completed_Date_Key': 6919


In [541]:
# Filter the DataFrame to only include rows where 'Completed_Date_Key' is NaN
empty_completed_date_rows = development_fact_table[development_fact_table['Completed_Date_Key'].isna()]

# Print the rows with empty 'Completed_Date_Key'
print(empty_completed_date_rows)


      Permit_Key  Application_Date_Key  Issued_Date_Key  Completed_Date_Key
37            38                   568              649                <NA>
38            39                   607              683                <NA>
39            40                   615              621                <NA>
40            41                   630              748                <NA>
41            42                   634              679                <NA>
...          ...                   ...              ...                 ...
8864        8865                  9310             9332                <NA>
8865        8866                  9311             9337                <NA>
8866        8867                  9322             9337                <NA>
8867        8868                   420              820                <NA>
8868        8869                   441              664                <NA>

[6919 rows x 4 columns]


In [542]:
# create a mapping from Date_Key to actual Date
date_key_to_date = pd.Series(DateDimension['Date'].values, index=DateDimension['Date_Key']).to_dict()

# Calculate the duration in days between Application Date and Issued Date directly using the mapping
development_fact_table['Application_to_Issuance_Duration'] = (
    development_fact_table['Issued_Date_Key'].map(date_key_to_date).sub(
    development_fact_table['Application_Date_Key'].map(date_key_to_date))
).dt.days

print(development_fact_table[:20])


    Permit_Key  Application_Date_Key  Issued_Date_Key  Completed_Date_Key  \
0            1                   740             1400                6650   
1            2                  1414             1414                6556   
2            3                  2021             2114                6368   
3            4                  2308             2365                6378   
4            5                  2406             2493                6509   
5            6                  3036             3052                6714   
6            7                  3045             3133                6368   
7            8                  3170             3184                6371   
8            9                  3891             3911                6565   
9           10                  4436             4542                6704   
10          11                  4670             4682                6523   
11          12                  4871             4919                6367   

In [543]:
# Create a temporary DataFrame to work with the years and durations
temp_df = development_fact_table.copy()

# Use the mapping to add the year information directly within the calculation step
temp_df['Year'] = temp_df['Application_Date_Key'].map(date_key_to_date).dt.year

# Calculate the average duration by year
avg_duration_by_year = temp_df.groupby('Year')['Application_to_Issuance_Duration'].mean().reset_index(name='Avg_App_to_Issuance_Duration_by_Year')

# merge this average back with the original fact table based on the application year 
development_fact_table['Application_Year_Temp'] = development_fact_table['Application_Date_Key'].map(date_key_to_date).dt.year

# Merge using this temporary year column
development_fact_table = pd.merge(development_fact_table,
                     avg_duration_by_year,
                     left_on='Application_Year_Temp',
                     right_on='Year',
                     how='left')

# Drop the temporary columns after the merge
development_fact_table.drop(['Application_Year_Temp', 'Year'], axis=1, inplace=True)

# Print or use your final table as needed
print(development_fact_table.head())




   Permit_Key  Application_Date_Key  Issued_Date_Key  Completed_Date_Key  \
0           1                   740             1400                6650   
1           2                  1414             1414                6556   
2           3                  2021             2114                6368   
3           4                  2308             2365                6378   
4           5                  2406             2493                6509   

   Application_to_Issuance_Duration  Avg_App_to_Issuance_Duration_by_Year  
0                               660                            137.988095  
1                                 0                            100.780488  
2                                93                            280.966443  
3                                57                            280.966443  
4                                87                            128.469388  


In [544]:
print(development_fact_table[:20])

    Permit_Key  Application_Date_Key  Issued_Date_Key  Completed_Date_Key  \
0            1                   740             1400                6650   
1            2                  1414             1414                6556   
2            3                  2021             2114                6368   
3            4                  2308             2365                6378   
4            5                  2406             2493                6509   
5            6                  3036             3052                6714   
6            7                  3045             3133                6368   
7            8                  3170             3184                6371   
8            9                  3891             3911                6565   
9           10                  4436             4542                6704   
10          11                  4670             4682                6523   
11          12                  4871             4919                6367   