Pulling out the values needed to generate market trend plots

In [2]:
# Import
import pandas as pd

In [4]:
# Load the cleaned CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="output_data/clean_house_crime_data.csv")
df.head(n=5)

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime
0,2,h,1480000.0,2.5,1.0,1.0,202.0,2016,2,1053.0
1,2,h,1035000.0,2.5,1.0,0.0,156.0,2016,3,1053.0
2,4,h,1600000.0,2.5,1.0,2.0,120.0,2016,3,1053.0
3,2,h,941000.0,2.5,1.0,0.0,181.0,2016,6,1053.0
4,3,h,1876000.0,2.5,2.0,0.0,245.0,2016,6,1053.0


# Line graph - Landsize

In [5]:
# Inspect the smallest and largest Landsize values (printed one per line)
# to decide where the bin edges should go
print(df["Landsize"].min(), df["Landsize"].max(), sep="\n")

0.0
433014.0


In [6]:
# Bin edges for Landsize: a -1 lower edge so that a value of 0 lands inside the
# first interval, then one edge every 200 up to 3000.
# NOTE(review): the largest Landsize printed above (433014.0) lies beyond the last
# edge; pd.cut leaves out-of-range values as NaN, so those rows get no group.
bins = [-1, *range(200, 3001, 200)]
# One label per interval, derived from consecutive edges: "0-200", "201-400", ...
group_labels = [f"{lower + 1}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]


In [7]:
# Label every row with the landsize bin it falls into and store the result
# as a new "landsize_group" column on the DataFrame.
# NOTE(review): `bins` and `group_labels` are rebound later in this notebook for
# the distance bins, so this cell must run while they still hold the landsize values.
df["landsize_group"] = pd.cut(x=df["Landsize"], bins=bins, labels=group_labels)
df.head(n=5)

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime,landsize_group
0,2,h,1480000.0,2.5,1.0,1.0,202.0,2016,2,1053.0,201-400
1,2,h,1035000.0,2.5,1.0,0.0,156.0,2016,3,1053.0,0-200
2,4,h,1600000.0,2.5,1.0,2.0,120.0,2016,3,1053.0,0-200
3,2,h,941000.0,2.5,1.0,0.0,181.0,2016,6,1053.0,0-200
4,3,h,1876000.0,2.5,2.0,0.0,245.0,2016,6,1053.0,201-400


In [8]:
# Count how many houses fall into each landsize bin.
# "landsize_group" is categorical (it comes from pd.cut), so `observed` is
# spelled out: observed=False keeps every bin in the result and avoids relying
# on the implicit default, which newer pandas releases deprecate and change.
land_count_df = df.groupby("landsize_group", observed=False).count()
land_count_df

Unnamed: 0_level_0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime
landsize_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0-200,3966,3966,3966,3966,3966,3966,3966,3966,3966,3966
201-400,3285,3285,3285,3285,3285,3285,3285,3285,3285,3285
401-600,3819,3819,3819,3819,3819,3819,3819,3819,3819,3819
601-800,4473,4473,4473,4473,4473,4473,4473,4473,4473,4473
801-1000,1017,1017,1017,1017,1017,1017,1017,1017,1017,1017
1001-1200,311,311,311,311,311,311,311,311,311,311
1201-1400,108,108,108,108,108,108,108,108,108,108
1401-1600,60,60,60,60,60,60,60,60,60,60
1601-1800,49,49,49,49,49,49,49,49,49,49
1801-2000,33,33,33,33,33,33,33,33,33,33


In [10]:
# Mean of every numeric variable per landsize bin, rounded to whole numbers.
# numeric_only=True: the frame also holds the string column "Type"; older pandas
# silently dropped such columns from the mean, pandas >= 2.0 raises TypeError.
# observed=False: keep all landsize bins for the categorical grouping key
# instead of depending on the version-specific default.
grouped_ls_df = (
    df.groupby("landsize_group", observed=False)
    .mean(numeric_only=True)
    .round(0)
)
grouped_ls_df

Unnamed: 0_level_0,Rooms,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime
landsize_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0-200,2.0,796182.0,7.0,1.0,1.0,73.0,2016.0,6.0,1337.0
201-400,3.0,1112400.0,10.0,2.0,1.0,296.0,2017.0,6.0,1078.0
401-600,3.0,1067106.0,13.0,2.0,2.0,523.0,2017.0,6.0,1170.0
601-800,4.0,1241597.0,14.0,2.0,2.0,681.0,2017.0,6.0,958.0
801-1000,4.0,1465269.0,14.0,2.0,2.0,877.0,2017.0,6.0,964.0
1001-1200,3.0,1336483.0,14.0,2.0,2.0,1073.0,2017.0,5.0,1046.0
1201-1400,3.0,1834727.0,12.0,2.0,2.0,1283.0,2017.0,6.0,1056.0
1401-1600,3.0,1550438.0,12.0,2.0,2.0,1499.0,2017.0,5.0,1186.0
1601-1800,3.0,1268006.0,11.0,1.0,2.0,1696.0,2017.0,5.0,1476.0
1801-2000,3.0,977136.0,11.0,2.0,1.0,1909.0,2017.0,5.0,896.0


In [11]:
# Rounded mean price of each landsize bin, returned as a plain Python list
grouped_ls_df.loc[:, "Price"].to_list()

[796182.0,
 1112400.0,
 1067106.0,
 1241597.0,
 1465269.0,
 1336483.0,
 1834727.0,
 1550438.0,
 1268006.0,
 977136.0,
 1191927.0,
 677952.0,
 808277.0,
 814200.0,
 837500.0]

# Line graph - Bedrooms

In [12]:
# Number of non-null entries per column for each bedroom count ("Rooms")
grouped_rooms_counts = df.groupby(by="Rooms").count()
grouped_rooms_counts

Unnamed: 0_level_0,Type,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime,landsize_group
Rooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,687,687,687,687,687,687,687,687,687,670
2,3886,3886,3886,3886,3886,3886,3886,3886,3886,3814
3,7815,7815,7815,7815,7815,7815,7815,7815,7815,7776
4,4022,4022,4022,4022,4022,4022,4022,4022,4022,4004
5,861,861,861,861,861,861,861,861,861,852
6,96,96,96,96,96,96,96,96,96,91
7,14,14,14,14,14,14,14,14,14,12
8,10,10,10,10,10,10,10,10,10,9
10,2,2,2,2,2,2,2,2,2,2
12,1,1,1,1,1,1,1,1,1,1


In [13]:
# Mean of every numeric variable per bedroom count, rounded to whole numbers.
# numeric_only=True: at this point the frame holds the string column "Type" and
# the categorical "landsize_group"; older pandas silently dropped them from the
# mean, pandas >= 2.0 raises TypeError instead.
grouped_rooms_df = df.groupby("Rooms").mean(numeric_only=True).round(0)
grouped_rooms_df

Unnamed: 0_level_0,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime
Rooms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,435319.0,6.0,1.0,1.0,385.0,2016.0,6.0,1580.0
2,790963.0,8.0,1.0,1.0,426.0,2017.0,6.0,1267.0
3,1053469.0,12.0,1.0,2.0,598.0,2017.0,6.0,1124.0
4,1381954.0,13.0,2.0,2.0,709.0,2017.0,6.0,974.0
5,1850381.0,12.0,3.0,2.0,894.0,2017.0,6.0,842.0
6,1937099.0,13.0,3.0,3.0,1014.0,2017.0,5.0,730.0
7,1775607.0,11.0,3.0,3.0,1218.0,2017.0,5.0,840.0
8,1462425.0,14.0,4.0,4.0,1750.0,2017.0,7.0,680.0
10,1607500.0,11.0,6.0,2.0,462.0,2017.0,8.0,245.0
12,710000.0,32.0,5.0,3.0,960.0,2017.0,10.0,1518.0


In [14]:
# Rounded mean price for each bedroom count, returned as a plain Python list
grouped_rooms_df.loc[:, "Price"].to_list()

[435319.0,
 790963.0,
 1053469.0,
 1381954.0,
 1850381.0,
 1937099.0,
 1775607.0,
 1462425.0,
 1607500.0,
 710000.0]

# Line graph - Bathroom

In [15]:
# Number of non-null entries per column for each bathroom count
grouped_bath_counts = df.groupby(by="Bathroom").count()
grouped_bath_counts

Unnamed: 0_level_0,Rooms,Type,Price,Distance,Car,Landsize,Year,Month,Crime,landsize_group
Bathroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,34,34,34,34,34,34,34,34,34,34
1.0,8923,8923,8923,8923,8923,8923,8923,8923,8923,8853
2.0,6949,6949,6949,6949,6949,6949,6949,6949,6949,6874
3.0,1280,1280,1280,1280,1280,1280,1280,1280,1280,1268
4.0,151,151,151,151,151,151,151,151,151,146
5.0,43,43,43,43,43,43,43,43,43,43
6.0,9,9,9,9,9,9,9,9,9,8
7.0,2,2,2,2,2,2,2,2,2,2
8.0,2,2,2,2,2,2,2,2,2,2
9.0,1,1,1,1,1,1,1,1,1,1


In [16]:
# Mean of every numeric variable per bathroom count, rounded to whole numbers.
# numeric_only=True: at this point the frame holds the string column "Type" and
# the categorical "landsize_group"; older pandas silently dropped them from the
# mean, pandas >= 2.0 raises TypeError instead.
grouped_bath_df = df.groupby("Bathroom").mean(numeric_only=True).round(0)
grouped_bath_df

Unnamed: 0_level_0,Rooms,Price,Distance,Car,Landsize,Year,Month,Crime
Bathroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2.0,894559.0,7.0,0.0,256.0,2016.0,6.0,1080.0
1.0,3.0,880595.0,10.0,1.0,478.0,2017.0,6.0,1246.0
2.0,3.0,1196687.0,13.0,2.0,660.0,2017.0,6.0,1025.0
3.0,4.0,1747261.0,11.0,2.0,1021.0,2017.0,6.0,850.0
4.0,5.0,2636136.0,11.0,3.0,834.0,2017.0,6.0,716.0
5.0,5.0,2667488.0,12.0,2.0,762.0,2017.0,5.0,909.0
6.0,5.0,2361222.0,16.0,4.0,1803.0,2017.0,8.0,505.0
7.0,6.0,3425000.0,10.0,5.0,736.0,2016.0,8.0,397.0
8.0,6.0,1480000.0,8.0,6.0,952.0,2016.0,6.0,2890.0
9.0,10.0,2315000.0,10.0,2.0,612.0,2017.0,9.0,90.0


In [17]:
# Rounded mean price for each bathroom count, returned as a plain Python list
grouped_bath_df.loc[:, "Price"].to_list()

[894559.0,
 880595.0,
 1196687.0,
 1747261.0,
 2636136.0,
 2667488.0,
 2361222.0,
 3425000.0,
 1480000.0,
 2315000.0]

# Line graph - Distance to CBD

In [18]:
# Inspect the smallest and largest Distance values (printed one per line)
# to decide where the bin edges should go
print(df["Distance"].min(), df["Distance"].max(), sep="\n")

0.0
48.1


In [19]:
# Bin edges for Distance: a -1 lower edge so that a value of 0 lands inside the
# first interval, then one edge every 5 up to 50 (the maximum printed above is 48.1).
bins = [-1, *range(5, 51, 5)]
# One label per interval, derived from consecutive edges: "0-5", "6-10", ...
# NOTE(review): Distance is fractional (e.g. 2.5), so a value such as 5.5 falls
# into the interval labelled "6-10"; the labels are integer-style shorthand.
group_labels = [f"{lower + 1}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]

In [21]:
# Label every row with the distance bin it falls into and store the result
# as a new "kms_group" column on the DataFrame
df["kms_group"] = pd.cut(x=df["Distance"], bins=bins, labels=group_labels)
df.head(n=5)

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime,landsize_group,kms_group
0,2,h,1480000.0,2.5,1.0,1.0,202.0,2016,2,1053.0,201-400,0-5
1,2,h,1035000.0,2.5,1.0,0.0,156.0,2016,3,1053.0,0-200,0-5
2,4,h,1600000.0,2.5,1.0,2.0,120.0,2016,3,1053.0,0-200,0-5
3,2,h,941000.0,2.5,1.0,0.0,181.0,2016,6,1053.0,0-200,0-5
4,3,h,1876000.0,2.5,2.0,0.0,245.0,2016,6,1053.0,201-400,0-5


In [22]:
# Mean of every numeric variable per distance bin, rounded to whole numbers.
# numeric_only=True: the frame also holds the string column "Type" and the
# categorical "landsize_group"; older pandas silently dropped them from the
# mean, pandas >= 2.0 raises TypeError instead.
# observed=False: keep all distance bins for the categorical grouping key
# instead of depending on the version-specific default.
grouped_dist_df = (
    df.groupby("kms_group", observed=False)
    .mean(numeric_only=True)
    .round(0)
)
grouped_dist_df

Unnamed: 0_level_0,Rooms,Price,Distance,Bathroom,Car,Landsize,Year,Month,Crime
kms_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0-5,2.0,1185662.0,3.0,1.0,1.0,645.0,2017.0,6.0,1447.0
6-10,3.0,1224077.0,7.0,2.0,2.0,472.0,2017.0,6.0,1169.0
11-15,3.0,1046433.0,12.0,2.0,2.0,547.0,2017.0,6.0,927.0
16-20,3.0,958513.0,17.0,2.0,2.0,648.0,2017.0,6.0,913.0
21-25,3.0,808006.0,22.0,2.0,2.0,685.0,2017.0,6.0,1310.0
26-30,3.0,661854.0,27.0,2.0,2.0,587.0,2017.0,6.0,704.0
31-35,3.0,560663.0,32.0,2.0,2.0,1667.0,2017.0,6.0,1601.0
36-40,3.0,735575.0,37.0,2.0,2.0,1112.0,2017.0,6.0,2899.0
41-45,4.0,571690.0,43.0,2.0,3.0,1171.0,2017.0,6.0,739.0
46-50,4.0,720314.0,46.0,2.0,2.0,12324.0,2017.0,6.0,765.0


In [23]:
# Rounded mean price of each distance bin, returned as a plain Python list
grouped_dist_df.loc[:, "Price"].to_list()

[1185662.0,
 1224077.0,
 1046433.0,
 958513.0,
 808006.0,
 661854.0,
 560663.0,
 735575.0,
 571690.0,
 720314.0]