In [1]:
import pandas as pd
import numpy as np
import sys
import warnings
import altair as alt

warnings.filterwarnings('ignore')
sys.path.append('../')
alt.data_transformers.disable_max_rows()

from src.cleaning import remove_few_entries
from src.cleaning import MINIMUM_ENTRIES_ACCEPTED
from src.metrics import get_stats_on_df
from IPython.display import display


In [2]:
mdata = pd.read_csv('../data/subset.csv')
mdata.head()

Unnamed: 0,id,ranking_score,geography_name,sq_meters,price,year_of_construction,ad_type,floor,rooms,subtype,price_per_sqrm
0,42911697,47.0,northern sub,567,200000,2005,simple,1,4,apartment,353.0
1,43018583,134.7,northern sub,642,550000,1992,simple,ground-floor,5,detached,857.0
2,39670968,71.0,northern sub,320,500000,2009,simple,ground-floor,2,detached,1562.0
3,39740833,122.0,northern sub,95,230000,2007,simple,1,3,apartment,2421.0
4,39742998,126.7,northern sub,195,370000,2007,simple,ground-floor,4,maisonette,1897.0


Because we will split our dataset to subcategories we have to ensure that our sample is representative. 

Thereby, any sub-category that includes less than 20 values will be exlcuded.
The number 20 is elected as the minimum accepted sample for statistical tests. 

Even though, there are no tests taking place at the moment, for future re-usability of the dataset it will be optimal to be cleaned this way.

In [3]:
mdata = remove_few_entries(mdata,'subtype',threshold=MINIMUM_ENTRIES_ACCEPTED)

Now for each different subtype we will calculate the basic metrics with respect to price_per_sqr and sqr_per_room

Our dataset is free of NA values and outliers that can corrupt the final result based on 3 times the standard deviation

This metrics will be:

- Mean
- Standard Deviation
- Median
- 25% and 75% QR
- Max and Minimum values
- Skewness and Kyrtosis
- IQR for the internal range of the majority of condos
- Range

In [4]:
for geography_area in mdata['geography_name'].unique():
    subsets_df = mdata[mdata['geography_name'] == geography_area]
    print("Geographical area:  -- {} -- statistics on price per square meter".format(geography_area))
    display(get_stats_on_df(df=subsets_df, index_column_name='subtype',column_name='price_per_sqrm'))

Geographical area:  -- northern sub -- statistics on price per square meter


Unnamed: 0,mean,median,min,max,25%,75%,range,IQR
apartment,2755.723113,2469.0,8.0,9960.0,2000.0,3209.5,9952.0,1209.5
detached,2647.818086,2353.0,250.0,12500.0,1800.0,3067.5,12250.0,1267.5
maisonette,2518.674433,2256.0,849.0,8368.0,1915.0,2877.5,7519.0,962.5
building,2529.285714,2165.0,317.0,5893.0,1467.0,3708.5,5576.0,2241.5
villa,2044.6875,2034.5,1121.0,3235.0,1580.75,2461.0,2114.0,880.25
studio,1981.0,2000.0,1905.0,2000.0,2000.0,2000.0,95.0,0.0
apartment complex,2671.0,2857.0,1263.0,5333.0,1535.5,3086.5,4070.0,1551.0
other residential,3808.461538,3190.0,1268.0,10000.0,2027.0,5000.0,8732.0,2973.0


Geographical area:  -- south beach -- statistics on price per square meter


Unnamed: 0,mean,median,min,max,25%,75%,range,IQR
apartment,3450.033458,3176.0,4.0,11765.0,2640.0,4000.0,11761.0,1360.0
detached,3282.356936,2857.0,967.0,16667.0,2304.0,3829.5,15700.0,1525.5
maisonette,4031.410714,3750.0,850.0,10417.0,2707.5,5072.0,9567.0,2364.5
studio,2459.342105,2265.0,625.0,4545.0,1841.5,3093.75,3920.0,1252.25
building,2213.371795,1939.5,818.0,4630.0,1654.25,2584.5,3812.0,930.25
villa,3400.846154,3021.0,2200.0,5435.0,2727.0,3672.0,3235.0,945.0
other residential,3564.636364,3247.0,1444.0,6018.0,2545.75,4702.5,4574.0,2156.75
apartment complex,2231.454545,2063.0,1176.0,3600.0,1658.0,2666.5,2424.0,1008.5


Geographical area:  -- gentrification area -- statistics on price per square meter


Unnamed: 0,mean,median,min,max,25%,75%,range,IQR
apartment,2607.42772,2500.0,625.0,7018.0,2083.0,3065.0,6393.0,982.0
building,2565.52,2412.0,1261.0,4583.0,1846.0,3000.0,3322.0,1154.0
detached,2679.884615,2769.5,857.0,4333.0,1987.5,3187.25,3476.0,1199.75
studio,2283.5625,2390.5,833.0,4167.0,1824.75,2875.0,3334.0,1050.25
maisonette,2620.833333,2286.0,1864.0,3929.0,2093.25,3225.0,2065.0,1131.75
other residential,2376.875,2313.5,1846.0,3400.0,1908.25,2675.25,1554.0,767.0
apartment complex,3224.5,3224.5,3202.0,3247.0,3213.25,3235.75,45.0,22.5


Geographical area:  -- beesy neighborhood -- statistics on price per square meter


Unnamed: 0,mean,median,min,max,25%,75%,range,IQR
apartment,1286.395981,1146.0,9.0,3958.0,960.75,1584.25,3949.0,623.5
building,1067.986301,1047.0,571.0,2044.0,861.0,1144.0,1473.0,283.0
maisonette,1808.8,1667.0,806.0,4688.0,1282.0,2237.5,3882.0,955.5
studio,1067.333333,1000.0,300.0,3200.0,699.0,1107.5,2900.0,408.5
detached,1462.028169,1250.0,429.0,5405.0,906.0,1854.0,4976.0,948.0
apartment complex,1116.0,1184.0,600.0,1571.0,735.0,1490.0,971.0,755.0
other residential,1196.0625,1206.5,649.0,1635.0,1064.5,1407.25,986.0,342.75


### Descriptive Viusalizations

In [49]:
display(mdata)

Unnamed: 0,id,ranking_score,geography_name,sq_meters,price,year_of_construction,ad_type,floor,rooms,subtype,price_per_sqrm
0,42911697,47.0,northern sub,567,200000,2005,simple,1,4,apartment,353.0
1,43018583,134.7,northern sub,642,550000,1992,simple,ground-floor,5,detached,857.0
2,39670968,71.0,northern sub,320,500000,2009,simple,ground-floor,2,detached,1562.0
3,39740833,122.0,northern sub,95,230000,2007,simple,1,3,apartment,2421.0
4,39742998,126.7,northern sub,195,370000,2007,simple,ground-floor,4,maisonette,1897.0
...,...,...,...,...,...,...,...,...,...,...,...
13540,43079210,101.0,beesy neighborhood,80,75000,1960,simple,3,2,apartment,938.0
13541,43079212,103.5,beesy neighborhood,69,70000,1975,simple,2,2,apartment,1014.0
13542,43079508,118.1,beesy neighborhood,75,120000,1974,simple,3,1,apartment,1600.0
13543,40498665,78.7,beesy neighborhood,800,600000,1930,simple,ground-floor,1,building,750.0


In [131]:

box_selector=alt.selection_multi(fields=['subtype','geography_name'], nearest=False, on='click',)
color_changer =alt.condition(box_selector, 'count():Q', alt.value('lightgray'), title = 'Fire incidents',
                       scale=alt.Scale(scheme="yelloworangered"),
                           legend=alt.Legend(orient = "left", offset = 0))


barchart = alt.Chart(mdata).mark_bar().encode(
    x= alt.X('geography_name:N', title='Geographic area'),
    y = 'count():Q'
).properties(
    width = 250,
    height = 250,
    title = 'Distribution of Condos per Geographic Area'
)

heatmap = alt.Chart(mdata).mark_rect().encode(
    x= alt.X('subtype:N', title='Type of Condo'),
    y = alt.Y('geography_name:N', title='Geographic Area'),
    color = alt.Color('mean(price_per_sqrm):Q', title='Average price Euro/m^2',scale=alt.Scale(scheme="blues")),
    tooltip = ['min(price_per_sqrm)','mean(price_per_sqrm)','max(price_per_sqrm)','count()']
).properties(
    width = 250,
    height = 200,
    title = 'Heatmap of Geographic area vs Subtype - interactive'
).add_selection(box_selector)

heatmap_rooms = alt.Chart(mdata).mark_rect().encode(
    x= alt.X('subtype:N', title='Type of Condo'),
    y = alt.Y('rooms:O', title='Rooms'),
    color = alt.Color('mean(price_per_sqrm):Q', title='Average price Euro/m^2',scale=alt.Scale(scheme="blues")),
    tooltip = ['mean(price_per_sqrm)','min(price_per_sqrm)','max(price_per_sqrm)']
).properties(
    width = 250,
    height = 250,
    title = 'Heatmap of Rooms according to subtype'
).transform_filter(
    box_selector
)

heatmap_floor = alt.Chart(mdata).mark_rect().encode(

    x = alt.X('floor:O', title='Floor'),
    y= alt.Y('subtype:N', title='Subtype'),
    color = alt.Color('mean(price_per_sqrm):Q',),
    tooltip = ['mean(price_per_sqrm)','min(price_per_sqrm)','max(price_per_sqrm)']
).transform_filter(
    box_selector
)


scatterplot_details = alt.Chart(mdata).mark_point(color='green').encode(
    x=alt.X('price:Q', title='Price'),
    y=alt.Y('sq_meters:Q', title='Area in m^2'),
    tooltip=['id', 'price', 'sq_meters', 'rooms', 'floor']
).properties(
    title='Condos based on selected attributes'
).transform_filter(
    box_selector
)

dashboard = (barchart | heatmap | heatmap_rooms) & (heatmap_floor | scatterplot_details)

## Interactive Dashboard

To finilize this task and to help Marketing team to get On-Demand statistics we created an interactive dashboard where the user can click on the heatmap and 
get the results on the rest of the graphs. The top-left bargraph has no effect of this interactive process though. 

Furthermore, the user can multi-select on the heatmap on the top-middle position to select multiple areas, whilst on the scatterplot additional details for each condo are shown.

To reset any selection the user has to double click anywhere on the heatmap

In [132]:
dashboard