In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Read the synthetic GTA housing CSV into a DataFrame and preview the
# first 12 rows.
raw_csv_path = "gta_housing_synthetic.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=12)

Unnamed: 0,price,bedrooms,bathrooms,sqft,house_type,year_built,neighbourhood,lot_size,garage,distance_to_downtown_km
0,1954952,4,5,1727,Townhouse,1985,North York,6307,1,9.1
1,1954952,4,5,1727,Townhouse,1985,North York,6307,1,9.1
2,1759465,3,3,2808,Detached,1986,Mississauga,7706,0,5.4
3,2269001,5,1,3691,Detached,1950,Scarborough,4655,0,24.9
4,1590051,5,3,1409,Detached,1964,Scarborough,5173,0,20.8
5,1119006,2,3,1103,Semi-Detached,1986,Mississauga,1909,1,4.2
6,1350224,3,5,974,Detached,1962,Mississauga,5697,1,34.7
7,1887906,3,4,2008,Townhouse,1957,Dowtown,7952,1,17.6
8,1911478,3,2,3632,Condo,1954,Scarborough,55,0,18.7
9,2001456,5,4,1512,Semi-Detached,2009,Etobicoke,2672,2,33.0


In [4]:
# Clean the dataset: report missing values, drop duplicate rows, filter out
# implausible prices / construction years, normalise the condo fields and
# derive a house_age column.

# Thresholds for the sanity filters below; every bound is exclusive.
MIN_PRICE = 100000
MAX_PRICE = 5000000
MIN_YEAR_BUILT = 1900
# Upper bound for year_built and the baseline year for house_age. Kept as a
# single constant so the filter and the age calculation cannot drift apart.
REFERENCE_YEAR = 2025

# Show how many values are missing in each column. Only the last expression
# of a cell is displayed automatically, so this must be printed explicitly.
print(df.isnull().sum())

# Remove exact duplicate rows (reassign rather than inplace=True so this
# step does not mutate the frame object created by the loading cell).
df = df.drop_duplicates()

# Keep rows whose price is strictly between $100k and $5M ...
price_ok = (df['price'] > MIN_PRICE) & (df['price'] < MAX_PRICE)
# ... and whose construction year is strictly between 1900 and 2025.
year_ok = (df['year_built'] > MIN_YEAR_BUILT) & (df['year_built'] < REFERENCE_YEAR)

# Both masks are computed on the same frame, so AND-ing them is equivalent to
# applying the two filters one after the other. The explicit .copy() makes
# the result an independent frame, so the assignments below unambiguously
# modify df itself and cannot trigger pandas' SettingWithCopyWarning.
df = df[price_ok & year_ok].copy()

# All condos must have a lot size of 0 and no garage.
df.loc[df['house_type'] == 'Condo', ['lot_size', 'garage']] = 0

# Derive the age of each house relative to REFERENCE_YEAR.
df['house_age'] = REFERENCE_YEAR - df['year_built']

df.head(12)


Unnamed: 0,price,bedrooms,bathrooms,sqft,house_type,year_built,neighbourhood,lot_size,garage,distance_to_downtown_km,house_age
0,1954952,4,5,1727,Townhouse,1985,North York,6307,1,9.1,40
2,1759465,3,3,2808,Detached,1986,Mississauga,7706,0,5.4,39
3,2269001,5,1,3691,Detached,1950,Scarborough,4655,0,24.9,75
4,1590051,5,3,1409,Detached,1964,Scarborough,5173,0,20.8,61
5,1119006,2,3,1103,Semi-Detached,1986,Mississauga,1909,1,4.2,39
6,1350224,3,5,974,Detached,1962,Mississauga,5697,1,34.7,63
7,1887906,3,4,2008,Townhouse,1957,Dowtown,7952,1,17.6,68
8,1911478,3,2,3632,Condo,1954,Scarborough,0,0,18.7,71
9,2001456,5,4,1512,Semi-Detached,2009,Etobicoke,2672,2,33.0,16
10,1551768,4,4,1087,Detached,2022,Scarborough,779,1,28.9,3


In [5]:

# "Downtown" is misspelt as "Dowtown" in some rows of the neighbourhood
# column; map the misspelling onto the correct label.
df['neighbourhood'] = df['neighbourhood'].replace({'Dowtown': 'Downtown'})

# Verify the fix: the misspelt label must no longer appear among the distinct
# neighbourhood values. (A bare .unique() call in the middle of a cell is
# never displayed, so the check is made explicit here.)
assert 'Dowtown' not in df['neighbourhood'].unique(), "misspelt neighbourhood label still present"

df

Unnamed: 0,price,bedrooms,bathrooms,sqft,house_type,year_built,neighbourhood,lot_size,garage,distance_to_downtown_km,house_age
0,1954952,4,5,1727,Townhouse,1985,North York,6307,1,9.1,40
2,1759465,3,3,2808,Detached,1986,Mississauga,7706,0,5.4,39
3,2269001,5,1,3691,Detached,1950,Scarborough,4655,0,24.9,75
4,1590051,5,3,1409,Detached,1964,Scarborough,5173,0,20.8,61
5,1119006,2,3,1103,Semi-Detached,1986,Mississauga,1909,1,4.2,39
...,...,...,...,...,...,...,...,...,...,...,...
995,2066024,6,4,2338,Condo,1961,North York,0,0,7.1,64
996,1596250,5,1,770,Semi-Detached,1992,Downtown,2849,1,3.6,33
997,1359802,1,1,2280,Condo,2010,North York,0,0,26.3,15
998,1245943,1,3,1252,Condo,1980,Downtown,0,0,18.6,45


In [6]:
# Sanity check: list the distinct house_type labels so that any misspelt or
# unexpected category would be visible in the cell output.
house_type_labels = df['house_type'].unique()
house_type_labels


array(['Townhouse', 'Detached', 'Semi-Detached', 'Condo'], dtype=object)

In [7]:
# Write the cleaned frame to disk; index=False keeps the row index out of
# the CSV file.
clean_csv_path = "gta_housing_clean.csv"
df.to_csv(clean_csv_path, index=False)