# Housing Dataset Visualization Demonstration

In [61]:
import numpy as np
import pandas as pd
import glob
import os
import xgboost
import csv as csv
import ipychart as ipc
from ipychart import Chart
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score,KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# from sklearn.grid_search import GridSearchCV   # Performing grid search
from scipy.stats import skew
from collections import OrderedDict

### Initialize data and variables

In [62]:
# Load the combined listings export. ZIP codes are identifiers rather than
# quantities, so the column is read as text; a later cell looks a ZIP up by the
# string label '27577', which only works when the values are strings.
dataFrame = pd.read_csv(
    "./CombinedData.csv",
    header=0,
    low_memory=False,
    dtype={"ZIP OR POSTAL CODE": str},
)
# dataFrame.insert(0, 'ID', range(0,len(dataFrame)))
# Shuffle the rows. The fixed seed makes a fresh "Restart & Run All" reproduce the
# same row order (and therefore the same preview below).
dataFrame = dataFrame.sample(frac=1, random_state=42).reset_index(drop=True)
print(dataFrame.head(5))

         SOLD DATE              PROPERTY TYPE              ADDRESS  \
0   August-29-2022  Single Family Residential        239 Raper Cir   
1   August-15-2022  Single Family Residential  1259 Kent Downs Ave   
2   August-19-2022  Single Family Residential     1119 Blueview Dr   
3  November-3-2021  Single Family Residential      5205 Seward Cir   
4    April-27-2022  Single Family Residential          5316 Cox Ln   

             CITY STATE OR PROVINCE ZIP OR POSTAL CODE   PRICE  BEDS  BATHS  \
0       Lexington                NC              27295  315500   3.0    2.0   
1         Concord                NC              28027  373000   4.0    2.5   
2  Black Mountain                NC              28704  440000   3.0    2.0   
3       Pfafftown                NC              27106  212000   3.0    2.0   
4         Liberty                NC              27205  430000   3.0    2.0   

           LOCATION  SQUARE FEET  LOT SIZE  YEAR BUILT  $/SQUARE FEET  \
0  Cardinal Terrace          Na

In [63]:
# Drop the long Redfin URL column.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
# (`axis=1` was redundant alongside `columns=` and is omitted.)
REDFIN_URL_COLUMN = 'URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)'
dataFrame = dataFrame.drop(columns=[REDFIN_URL_COLUMN], errors="ignore")

### General ZIP code distribution chart

In [65]:
# Count plot of the listings over the ZIP OR POSTAL CODE column; the chart object
# is the cell's last expression so the notebook renders it.
zipCountChart = ipc.countplot(x='ZIP OR POSTAL CODE', data=dataFrame)
zipCountChart

Chart(layout=Layout(align_self='stretch', height='auto'))

### Targeted information on line plot

For this example, let's assume we're interested in how price varies with sale date. Although the rough overall housing trend shown in the example below may be useful to a user, being able to narrow the chart to a specific state, ZIP code, or city would be far more beneficial to the end user.

In [66]:
# Data labels are switched off ('display': False); the remaining keys describe how
# a label would be styled.
datalabels_arguments = dict(
    display=False,
    borderWidth=1,
    anchor='end',
    align='end',
    borderRadius=5,
    color='#fff',
)

# PRICE against SOLD DATE for the whole dataset, split by STATE OR PROVINCE via `hue`.
line_dataset_options = {'fill': False, 'datalabels': datalabels_arguments}
ipc.lineplot(
    data=dataFrame,
    x='SOLD DATE',
    y='PRICE',
    hue='STATE OR PROVINCE',
    dataset_options=line_dataset_options,
    colorscheme='office.Parallax6',
)

Chart(layout=Layout(align_self='stretch', height='auto'))

We can fix this problem by filtering the data before sending it to a chart. The cell below demonstrates this by finding only the houses sold in North Carolina in our dataset: the frame is indexed by STATE OR PROVINCE and the rows labelled 'NC' are selected with `.loc`.

In [67]:
# Index the frame by state, then print every row labelled 'NC'.
houses_by_state = dataFrame.set_index('STATE OR PROVINCE')
print(houses_by_state.loc['NC'])

                         SOLD DATE              PROPERTY TYPE  \
STATE OR PROVINCE                                               
NC                  August-29-2022  Single Family Residential   
NC                  August-15-2022  Single Family Residential   
NC                  August-19-2022  Single Family Residential   
NC                 November-3-2021  Single Family Residential   
NC                   April-27-2022  Single Family Residential   
...                            ...                        ...   
NC                    July-28-2022  Single Family Residential   
NC                  August-12-2022  Single Family Residential   
NC                             NaN                Condo/Co-op   
NC                 January-31-2022  Single Family Residential   
NC                    March-8-2022  Single Family Residential   

                                ADDRESS            CITY ZIP OR POSTAL CODE  \
STATE OR PROVINCE                                                           

In the following instance we locate all properties in North Carolina by checking the values in the STATE OR PROVINCE column, and do the same for one city (CITY) and one ZIP code (ZIP OR POSTAL CODE). Included in this example is a simple passing of options, structured as Python dicts, to our chart via a variable. This is done to mirror as closely as possible what the chart.js options would look like on the front-end, and was deliberately kept simplistic for demonstration.

In [90]:
# Data labels are switched off ('display': False); the remaining keys describe how
# a label would be styled. The same dict is shared by all three charts below.
datalabels_arguments = {'display': False, 'borderWidth': 1, 'anchor': 'end',
                        'align': 'end', 'borderRadius': 5, 'color': '#fff'}


def _rows_matching(frame, column, value):
    """Return the rows of ``frame`` whose ``column`` equals ``value``.

    ``column`` becomes the index but is also kept as a regular column
    (``drop=False``) so it can still be passed as the chart ``hue``.

    The lookup uses a one-element list (``.loc[[value]]``) so the result is always
    a DataFrame. The scalar form ``.loc[value]`` collapses to a Series when exactly
    one row matches, whereas the charts below are given a DataFrame.

    Raises:
        KeyError: if no row has ``value`` in ``column``.
    """
    return frame.set_index(column, drop=False).loc[[value]]


def _chart_options(title):
    """Build a chart options dict with a visible title and a visible legend.

    A fresh dict is returned on every call, so the charts do not share options.
    """
    return {
        'plugins': {
            'title': {
                'display': True,
                'text': title,
            },
            'legend': {
                'display': True,
                # NOTE(review): 'fontSize' looks like Chart.js 2 naming while the
                # 'plugins' layout is Chart.js 3 style - confirm the legend font
                # size is actually applied.
                'labels': {'boxWidth': 30, 'fontSize': 14},
            },
        }
    }


def _price_by_sale_date(frame, hue, options):
    """Line plot of PRICE against SOLD DATE for ``frame``, split by ``hue``."""
    return ipc.lineplot(
        data=frame,
        x='SOLD DATE',
        y='PRICE',
        hue=hue,
        dataset_options={'fill': False, 'datalabels': datalabels_arguments},
        colorscheme='office.Parallax6',
        options=options,
    )


# One filtered frame per level of detail: state, city, ZIP code.
onlyNCHouses = _rows_matching(dataFrame, 'STATE OR PROVINCE', 'NC')
onlyGboroHouses = _rows_matching(dataFrame, 'CITY', 'Greensboro')
onlyZIPHouses = _rows_matching(dataFrame, 'ZIP OR POSTAL CODE', '27577')

state_options = _chart_options('Houses in North Carolina')
city_options = _chart_options('Houses in Greensboro, NC')
zip_options = _chart_options('Houses in Zip Code 27577')

stateChart = _price_by_sale_date(onlyNCHouses, 'STATE OR PROVINCE', state_options)
cityChart = _price_by_sale_date(onlyGboroHouses, 'CITY', city_options)
zipChart = _price_by_sale_date(onlyZIPHouses, 'ZIP OR POSTAL CODE', zip_options)

# Only the state chart is rendered here; the city and ZIP charts are displayed by
# later cells.
stateChart

Chart(layout=Layout(align_self='stretch', height='auto'))

### Repeating process for zip code

As this process has already been explained, what follows is the same kind of chart for the houses in ZIP code 27577.

In [91]:
# Render the ZIP-code chart; `zipChart` is assigned in the earlier cell that also
# builds `stateChart` and `cityChart`, so that cell must have been run first.
zipChart

Chart(layout=Layout(align_self='stretch', height='auto'))

And we do the same for cities.

In [89]:
# Render the city chart; `cityChart` is assigned in the earlier cell that also
# builds `stateChart` and `zipChart`, so that cell must have been run first.
cityChart

Chart(layout=Layout(align_self='stretch', height='auto'))