In [139]:
# Import required libraries
from math import pi

import pandas as pd
from bokeh.io import output_file, show, output_notebook, curdoc
# Layout helpers for stacking figures in columns, rows and grids
from bokeh.layouts import column, row, gridplot
# Data source, tick formatters and the categorical range used by the plots below
from bokeh.models import ColumnDataSource, NumeralTickFormatter, DatetimeTickFormatter, FactorRange
from bokeh.plotting import figure
#from bokeh.resources import INLINE

# Route Bokeh output to the notebook so later show() calls render inline.
output_notebook()

In [113]:
# Load the cleaned Melbourne property data and print a column/dtype summary.
melb = pd.read_csv('../data/melb_clean.csv')
melb.info()

# Region subsets: properties located in the Northern and Southern regions.
in_region = melb["region"]
north = melb[in_region == "Northern"]
south = melb[in_region == "Southern"]

# Property-type subsets, split on the "type" code:
# "h" -> houses, "u" -> units, "t" -> townhouses.
kind = melb["type"]
houses = melb[kind == "h"]
units = melb[kind == "u"]
townhouses = melb[kind == "t"]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     13580 non-null  int64  
 1   rooms          13580 non-null  int64  
 2   type           13580 non-null  object 
 3   price          13580 non-null  float64
 4   date           13580 non-null  object 
 5   distance       13580 non-null  float64
 6   bedrooms       13580 non-null  float64
 7   bathrooms      13580 non-null  float64
 8   car            13518 non-null  float64
 9   land_area      13580 non-null  float64
 10  building_area  7130 non-null   float64
 11  year_built     8205 non-null   float64
 12  council_area   12211 non-null  object 
 13  region         13580 non-null  object 
dtypes: float64(8), int64(2), object(4)
memory usage: 1.5+ MB


## Colors, legend, and theme

In [110]:
# Switch the current document to Bokeh's "contrast" theme.
curdoc().theme = "contrast"

fig = figure(
    x_axis_label="Year Built",
    y_axis_label="Land Area (Meters Squared)",
    width=1200,
    height=1200,
)

# One circle renderer per region subset: (subset, colour, legend entry).
# NOTE(review): `radius` is expressed in data units rather than screen pixels,
# so the drawn size depends on the axis range -- confirm this is intended.
for subset, shade, label in ((north, "yellow", "North"), (south, "red", "South")):
    fig.circle(
        x=subset["year_built"],
        y=subset["land_area"],
        radius=0.5,
        color=shade,
        legend_label=label,
    )

show(fig)


In [None]:
# Year built versus distance from the CBD, one marker shape per property type.
fig = figure(
    x_axis_label="Year Built",
    y_axis_label="Distance from CBD (km)",
    width=1200,
    height=1200,
)

# All three series go through scatter() so every marker is sized in screen
# pixels. Previously houses used circle(radius=0.8), whose radius is measured
# in data units, which made the house markers scale with the axis range while
# the square and triangle markers stayed a fixed pixel size.
fig.scatter(x=houses["year_built"], y=houses["distance"], marker="circle", legend_label="House", color="purple")
fig.scatter(x=units["year_built"], y=units["distance"], marker="square", legend_label="Unit", color="red")
fig.scatter(x=townhouses["year_built"], y=townhouses["distance"], marker="triangle", legend_label="Townhouse", color="green")

show(fig)

## Customizing Axes

In [103]:
# Mean building area per sale date.
# The dates are parsed BEFORE grouping: format="mixed" infers the format of each
# string separately, so the same calendar day may be spelled in more than one
# way. Grouping on the raw strings would then yield several rows for one day.
# NOTE(review): if the CSV stores day-first dates (e.g. 3/12/2016 meaning
# 3 December), dayfirst=True is also needed -- confirm against the raw file.
prop_size = (
    melb.assign(date=pd.to_datetime(melb["date"], format="mixed"))
    .groupby("date", as_index=False)["building_area"]
    .mean()
    .sort_values(by="date")  # the line glyph connects points in row order
)
source = ColumnDataSource(data=prop_size)

# Datetime x-axis so the ticks are rendered as dates.
fig = figure(
    x_axis_label="Date",
    y_axis_label="Building Size (Meters Squared)",
    x_axis_type="datetime",
    width=1200,
    height=1200,
)

# Line of building_area versus date, read from the column data source.
fig.line(x='date', y='building_area', line_color="red", line_width=4, line_alpha=0.6, source=source)

show(fig)

In [104]:
# Total sales (sum of price) per sale date, stored as melb_sales.
# The dates are parsed BEFORE grouping: format="mixed" infers the format of each
# string separately, so the same calendar day may be spelled in more than one
# way. Grouping on the raw strings would then split one day's sales across rows.
# NOTE(review): if the CSV stores day-first dates (e.g. 3/12/2016 meaning
# 3 December), dayfirst=True is also needed -- confirm against the raw file.
melb_sales = (
    melb.assign(date=pd.to_datetime(melb["date"], format="mixed"))
    .groupby("date", as_index=False)["price"]
    .sum()
    .sort_values(by="date")  # the line glyph connects points in row order
)
source = ColumnDataSource(data=melb_sales)

fig = figure(
    x_axis_label="Date",
    y_axis_label="Sales",
    x_axis_type="datetime",
    width=1200,
    height=1200,
)

# Line of price versus date, read from the column data source.
fig.line(x='date', y='price', line_color="red", line_width=4, line_alpha=0.6, source=source)

# Month-scale ticks show a three-letter month and a four-digit year.
fig.xaxis[0].formatter = DatetimeTickFormatter(months="%b %Y")

# "$0a" abbreviates the dollar amounts on the y-axis (e.g. millions as "m").
fig.yaxis[0].formatter = NumeralTickFormatter(format="$0a")

show(fig)

## Subplots

### Subplots by column

In [109]:
# Mean land_area and building_area for every region.
melb_sales = melb.groupby("region", as_index=False)[["land_area", "building_area"]].mean()
source = ColumnDataSource(data=melb_sales)

# Categories (and their left-to-right order) for the shared categorical x-axis.
regions = ["Eastern", "Southern", "Western", "Northern"]

# Two figures over the same categories: one for building size, one for land size.
building_size = figure(x_range=regions, x_axis_label="Region", y_axis_label="Building Size (Meters Squared)")
land_size = figure(x_range=regions, x_axis_label="Region", y_axis_label="Land Size (Meters Squared)")

# One bar per region in each figure, both reading from the same source.
for panel, measure in ((building_size, "building_area"), (land_size, "land_area")):
    panel.vbar(x="region", top=measure, source=source)

# Stack the two figures vertically.
show(column(building_size, land_size))


### Subplots by row

In [121]:
# Both figures read from a single source built from the full melb frame.
source = ColumnDataSource(data=melb)

building_size = figure(
    x_axis_label="Building Area (Meters Squared)",
    y_axis_label="Sales",
)
distance = figure(
    x_axis_label="Distance from CBD (km)",
    y_axis_label="Sales",
)

# Price on the y-axis against building area (red) and distance (blue).
# NOTE(review): `radius` is expressed in data units, and the two x-axes use
# different units (square metres vs km) -- confirm the same 0.5 suits both.
building_size.circle(x="building_area", y="price", radius=0.5, color="red", source=source)
distance.circle(x="distance", y="price", radius=0.5, color="blue", source=source)

# Give each figure its own "$0a" formatter so prices are shown abbreviated.
for panel in (building_size, distance):
    panel.yaxis[0].formatter = NumeralTickFormatter(format="$0a")

# Place the two figures side by side.
show(row(building_size, distance))

### Grid Subplot

In [120]:
# Build one price-versus-building-area figure per region, then lay them out
# in a two-column grid.
plots = []

for region in ("Northern", "Western", "Southern", "Eastern"):
    # Rows of melb belonging to the current region.
    df = melb[melb["region"] == region]
    source = ColumnDataSource(data=df)

    fig = figure(
        x_axis_label="Building Area (Meters Squared)",
        y_axis_label="Price",
    )
    # The legend entry carries the region name so each panel is identifiable.
    fig.circle(x="building_area", y="price", radius=0.5, legend_label=region, source=source)
    fig.yaxis[0].formatter = NumeralTickFormatter(format="$0a")

    plots.append(fig)

show(gridplot(plots, ncols=2))
  

## Visualizing Categorical Data

In [129]:
# Average sale price per region, most expensive region first.
regions = (
    melb.groupby("region", as_index=False)["price"]
    .mean()
    .sort_values("price", ascending=False)
)

# The categorical x_range fixes the left-to-right bar order, so it is given the
# regions in their sorted (descending price) order as a plain list of strings.
fig = figure(x_range=regions["region"].tolist(), x_axis_label="Region", y_axis_label="Sales")

# One bar per region; a width of 0.9 leaves a small gap between neighbours.
fig.vbar(x=regions["region"], top=regions["price"], width=0.9)

# "$0.0a" abbreviates the dollar amounts with one decimal place.
fig.yaxis[0].formatter = NumeralTickFormatter(format="$0.0a")

# Bokeh interprets a numeric orientation as radians, so a 45 degree tilt is
# pi / 4 (the previous value of 45 was read as 45 radians).
fig.xaxis.major_label_orientation = pi / 4

show(fig)

### Nested Categories

In [142]:
# Derive month-name and quarter labels from the sale date.
# assign() returns a new frame, so the shared `melb` frame is left untouched
# (the previous `melb_modified = melb` alias rewrote melb["date"] in place).
sale_dates = pd.to_datetime(melb["date"], format="mixed")
melb_modified = melb.assign(
    date=sale_dates,
    month=sale_dates.dt.month_name(),                 # "January" ... "December"
    quarter="Q" + sale_dates.dt.quarter.astype(str),  # "Q1" ... "Q4"
).sort_values(by="date")

# Nested (quarter, month) categories in calendar order; this list defines both
# the x-axis layout and the order in which bar heights must be supplied.
factors = [("Q1", "January"), ("Q1", "February"), ("Q1", "March"),
           ("Q2", "April"), ("Q2", "May"), ("Q2", "June"),
           ("Q3", "July"), ("Q3", "August"), ("Q3", "September"),
           ("Q4", "October"), ("Q4", "November"), ("Q4", "December")]

# Total price per (quarter, month), re-ordered to match `factors` row for row.
# groupby returns months alphabetically, so the totals must be aligned by label
# rather than by position; otherwise each bar would show another month's sales.
# Months with no sales in the data get a total of 0 so the lengths always agree.
grouped_melb = (
    melb_modified.groupby(["quarter", "month"])["price"]
    .sum()
    .reindex(pd.MultiIndex.from_tuples(factors, names=["quarter", "month"]), fill_value=0)
    .reset_index()
)

# Nested categorical x-axis built from the factors; y-axis shows total sales.
fig = figure(x_range=FactorRange(*factors), y_axis_label="Sales")

# One bar per (quarter, month); heights come from the aligned totals.
fig.vbar(x=factors, top=grouped_melb["price"], width=0.9)
fig.yaxis[0].formatter = NumeralTickFormatter(format="$0.0a")

# Bokeh interprets a numeric orientation as radians, so a 45 degree tilt is
# pi / 4 (the previous value of 45 was read as 45 radians).
fig.xaxis.major_label_orientation = pi / 4

show(fig)