# Cleaning Dataset and filtering out only house sales in Toronto

In [1]:
import pandas as pd

In [2]:
# House Sales in Ontario and read it into pandas
home_sales_df = pd.read_csv("Resources/properties.csv")
home_sales_df.head()

Unnamed: 0.1,Unnamed: 0,Address,AreaName,Price ($),lat,lng
0,0,86 Waterford Dr Toronto,Richview,999888,43.679882,-79.544266
1,1,#80 - 100 BEDDOE DR Hamilton,Chedoke Park B,399900,43.25,-79.904396
2,2,213 Bowman Street Hamilton,Ainslie Wood East,479000,43.25169,-79.919357
3,3,102 NEIL Avenue Hamilton,Greenford,285900,43.227161,-79.767403
4,6,#1409 - 230 King St Toronto,Downtown,362000,43.651478,-79.368118


In [3]:
# Filtering out unwanted column
home_sales_df = home_sales_df[["AreaName","Price ($)","lat","lng","Address"]]
home_sales_df.head()

Unnamed: 0,AreaName,Price ($),lat,lng,Address
0,Richview,999888,43.679882,-79.544266,86 Waterford Dr Toronto
1,Chedoke Park B,399900,43.25,-79.904396,#80 - 100 BEDDOE DR Hamilton
2,Ainslie Wood East,479000,43.25169,-79.919357,213 Bowman Street Hamilton
3,Greenford,285900,43.227161,-79.767403,102 NEIL Avenue Hamilton
4,Downtown,362000,43.651478,-79.368118,#1409 - 230 King St Toronto


In [4]:
# Insert a City column by spliting it from Address column
city=[]
for address in home_sales_df["Address"]:
    city.append(address.rsplit(" ",1)[1])

home_sales_df["City"]=city
home_sales_df.head(20)

Unnamed: 0,AreaName,Price ($),lat,lng,Address,City
0,Richview,999888,43.679882,-79.544266,86 Waterford Dr Toronto,Toronto
1,Chedoke Park B,399900,43.25,-79.904396,#80 - 100 BEDDOE DR Hamilton,Hamilton
2,Ainslie Wood East,479000,43.25169,-79.919357,213 Bowman Street Hamilton,Hamilton
3,Greenford,285900,43.227161,-79.767403,102 NEIL Avenue Hamilton,Hamilton
4,Downtown,362000,43.651478,-79.368118,#1409 - 230 King St Toronto,Toronto
5,Old East York,1488000,43.686375,-79.328918,254A Monarch Park Ave Toronto,Toronto
6,Fairbank,25,43.691193,-79.461662,532 Caledonia Rd Toronto,Toronto
7,Central Hamilton,254900,43.258541,-79.867615,11 REBECCA Street Hamilton,Hamilton
8,Gourley,364900,43.216557,-79.904991,617 BRIGADOON Drive Hamilton,Hamilton
9,Red Hill,304900,43.215706,-79.794189,51 - 255 MOUNT ALBION Road Hamilton,Hamilton


In [5]:
# Sort for Toronto only homes
sort_home_sales = home_sales_df.City == "Toronto"
toronto_home_sales = home_sales_df[sort_home_sales]
toronto_home_sales.head()

Unnamed: 0,AreaName,Price ($),lat,lng,Address,City
0,Richview,999888,43.679882,-79.544266,86 Waterford Dr Toronto,Toronto
4,Downtown,362000,43.651478,-79.368118,#1409 - 230 King St Toronto,Toronto
5,Old East York,1488000,43.686375,-79.328918,254A Monarch Park Ave Toronto,Toronto
6,Fairbank,25,43.691193,-79.461662,532 Caledonia Rd Toronto,Toronto
11,Wallace Emerson,113,43.664101,-79.439751,47 Armstrong Ave Toronto,Toronto


In [9]:
# Clean up Address column by removing "Toronto"
toronto_home_sales["Address"] = toronto_home_sales.Address.str.replace(" Toronto" , "")
toronto_home_sales.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,AreaName,Price ($),lat,lng,Address,City
0,Richview,999888.0,43.679882,-79.544266,86 Waterford Dr,Toronto
4,Downtown,362000.0,43.651478,-79.368118,#1409 - 230 King St,Toronto
5,Old East York,1488000.0,43.686375,-79.328918,254A Monarch Park Ave,Toronto
6,Fairbank,25.0,43.691193,-79.461662,532 Caledonia Rd,Toronto
11,Wallace Emerson,113.0,43.664101,-79.439751,47 Armstrong Ave,Toronto


In [7]:
# Export the Toronto home sales data into a csv
toronto_home_sales.to_csv("output_data/toronto_home_prices.csv")