# Interactive Visualization Assignment (Plotly + IPyWidgets)

In [1]:
import pandas as pd
import plotly.express as px
from ipywidgets import interact
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MaxAbsScaler

The following makes your Colab session behave more like a Jupyter notebook so that Plotly and ipywidgets objects display correctly. If you are using Jupyter, comment it out. (It needs to be called at the top of every cell that outputs a plot that uses widgets.)

```
configure_plotly_browser_state()
```

From [this StackOverflow answer](https://stackoverflow.com/a/47230966).

In [2]:
def configure_plotly_browser_state():
  """Prepare the current output cell so Plotly figures and widgets render in Colab.

  Emits an HTML snippet that loads RequireJS and maps the ``plotly`` module
  to the Plotly CDN bundle, then initialises Plotly's notebook mode.
  Returns nothing; the effect is entirely on the cell's output area.
  """
  # Import from the public IPython.display API and bring `display` in
  # explicitly, so the function does not depend on the builtin that only
  # exists inside an IPython kernel.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=False)

### Import the housing.csv data set.

In [3]:
# Read the housing data set directly from its hosted CSV file.
housing_csv_url = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/housing.csv'
data = pd.read_csv(housing_csv_url)

### Separate out all the numeric fields into their own data set.

Remove the Id field, as it should be categorical, and the GarageYrBlt and LotFrontage fields, which contain many nulls. Drop any remaining nulls, but only for fields that are numeric. Hint: use subset. Then create a new dataframe with only the numeric fields in it.

In [4]:
# Drop the identifier column and the two null-heavy columns.
data = data.drop(columns=['Id', 'GarageYrBlt', 'LotFrontage'])
# Select numeric columns positively. Excluding only object dtype would let
# any other non-numeric dtype (bool, datetime, category, dedicated string
# dtype) slip through to the scaler.
num_cols = list(data.select_dtypes(include='number').columns)
# Remove rows with a null in any numeric column; nulls in the other
# columns are deliberately left in place.
data = data.dropna(subset=num_cols)
# Numeric-only view of the cleaned rows, in the same row order as `data`.
nums = data[num_cols]

### Normalize the numeric data using Scikit-Learn's MaxAbsScaler.

In [5]:
# Fit the scaler on the numeric columns and wrap the transformed array in a
# DataFrame again.
masc = MaxAbsScaler()
nums_sc = pd.DataFrame(
  masc.fit_transform(nums),
  columns=nums.columns,
  # Keep the original row labels: after dropna the index of `nums` has gaps,
  # and a fresh 0..n-1 index would no longer line up with `data`.
  index=nums.index,
)

### Iteratively K-Means cluster the normalized data and generate an interactive line chart showing the average silhouette score for each number of clusters (2 through 20).

In [6]:
# Average silhouette score for each candidate number of clusters (2 through 20).
scores = []

for k in range(2, 21):
  # A fixed seed and an explicit n_init make the scores repeatable from run
  # to run and independent of the library's default number of restarts.
  kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
  clusters = kmeans.fit_predict(nums_sc)
  scores.append(silhouette_score(nums_sc, clusters))

In [7]:
# Pair every candidate cluster count with its silhouette score, one row per k.
kmeans_scores = pd.DataFrame({
  'Clusters': range(2, 21),
  'Scores': scores,
})

In [8]:
# Line chart of silhouette score against the number of clusters.
fig = px.line(
  kmeans_scores,
  x='Clusters',
  y='Scores',
  title='Avg. Silhouette Score for K Clusters',
)
# Centre the title horizontally.
fig.update_layout(title_x=0.5)

fig.show()

### Choose a number of clusters, run KMeans with that value for k on the scaled data, and add a column to the original housing data set containing the cluster that each record is assigned to.

In [9]:
# Final clustering with the chosen k. The fixed seed and explicit n_init keep
# the cluster labels, and every chart built from them, the same on each run.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# The predicted labels are attached to the housing rows as a new column.
data['Cluster'] = kmeans.fit_predict(nums_sc)

### Create an interactive bar chart that shows the average SalePrice of a property by cluster.

You will need to aggregate the data by cluster and average the sale prices before generating your visualization.

In [10]:
# Per-cluster mean of every numeric column. `data` still holds text columns,
# and pandas >= 2.0 raises a TypeError when asked to average those, so the
# aggregation is limited to numeric columns explicitly.
cluster_price = data.groupby(by='Cluster').mean(numeric_only=True)
# Expose the cluster id as a regular column for plotting, shifted to start at 1.
# NOTE(review): the year-sold charts further down plot the raw 0-based labels.
cluster_price['Cluster'] = (cluster_price.index + 1)

In [11]:
# Bar chart of the average sale price for each cluster.
fig = px.bar(
  data_frame=cluster_price,
  x='Cluster',
  y='SalePrice',
  title='Avg. Price by cluster',
)
fig.show()

### Create another bar chart where the bars are broken down and color-coded by the year the property was sold.

You will need to convert the YrSold field to be categorical in order to separate the bars based on that field.

In [21]:
# Mean of every numeric column for each (cluster, year sold) pair, with both
# keys returned as ordinary columns. numeric_only=True is required because
# `data` still holds text columns, which pandas >= 2.0 refuses to average.
cluster_yrs = (
  data.groupby(['Cluster', 'YrSold'])
  .mean(numeric_only=True)
  .reset_index()
)
# Turn the year into text so Plotly treats it as a discrete category (one
# colour per year) instead of a continuous numeric scale.
cluster_yrs['YrSold'] = cluster_yrs['YrSold'].astype(str)

In [13]:
# cluster_yrs = data.groupby(by=['Cluster', 'YrSold']).agg({'SalePrice':'mean'})
# cluster_yrs['Cluster'] = cluster_yrs.index.get_level_values('Cluster')
# cluster_yrs['YrSold'] = cluster_yrs.index.get_level_values('YrSold')
# cluster_yrs =  cluster_yrs.droplevel(level=0)
# cluster_yrs.index = cluster_yrs.index.set_names(['index'])
# cluster_yrs['YrSold'] = cluster_yrs['YrSold'].astype(object)

In [22]:
# Grouped bars: within each cluster, one bar per sale year.
fig = px.bar(
  cluster_yrs,
  x='Cluster',
  y='SalePrice',
  color='YrSold',
  barmode='group',
  template=None,
  title='Avg. Sale Price by Cluster and Year Sold',
)

fig.show()

### Add a drop-down widget to the multi-bar chart you created above that lets you choose between 4 numeric fields to represent on the Y axis. 

In [28]:
# Display the numeric-only frame (presumably to pick candidate fields for the drop-down below).
nums

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2,548,0,61,0,0,0,0,0,2,2008,208500
1,20,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,2,460,298,0,0,0,0,0,0,5,2007,181500
2,60,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2,608,0,42,0,0,0,0,0,9,2008,223500
3,70,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,3,642,0,35,272,0,0,0,0,2,2006,140000
4,60,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,3,836,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,2000,0.0,0,0,953,953,953,694,0,1647,0,0,2,1,3,1,7,1,2,460,0,40,0,0,0,0,0,8,2007,175000
1456,20,13175,6,6,1978,1988,119.0,790,163,589,1542,2073,0,0,2073,1,0,2,0,3,1,7,2,2,500,349,0,0,0,0,0,0,2,2010,210000
1457,70,9042,7,9,1941,2006,0.0,275,0,877,1152,1188,1152,0,2340,0,0,2,0,4,1,9,2,1,252,0,60,0,0,0,0,2500,5,2010,266500
1458,20,9717,5,6,1950,1996,0.0,49,1029,0,1078,1078,0,0,1078,1,0,1,0,2,1,5,0,1,240,366,0,112,0,0,0,0,4,2010,142125


In [46]:
@interact(Metric=['LotArea', 'OverallQual', 'SalePrice', 'TotalBsmtSF'])
def barchart(Metric):
  """Show a grouped bar chart of the chosen metric by cluster and year sold.

  Metric: name of the `cluster_yrs` column to plot on the Y axis; the
  drop-down offers the four options listed in the decorator.
  """
  chart_title = f'Avg. {Metric} by Cluster and Year Sold'
  grouped_bars = px.bar(
    cluster_yrs,
    x='Cluster',
    y=Metric,
    color='YrSold',
    barmode='group',
    template=None,
    title=chart_title,
  )
  grouped_bars.show()

interactive(children=(Dropdown(description='Metric', options=('LotArea', 'OverallQual', 'SalePrice', 'TotalBsm…

### Create a scatter plot that shows the relationship between SalePrice and LotArea, color-coded by cluster. Add a slider that filters the data by the year the property was sold.

In [41]:
# removing a couple outliers to make graph more readable
data = data[data['LotArea'] < 100000]

In [48]:
# Slider bounds are cast to plain ints so the widget is always an integer slider.
@interact(YrSold=(int(data['YrSold'].min()), int(data['YrSold'].max())))
def barchart(YrSold):
  """Scatter SalePrice against LotArea for the houses sold in one year.

  YrSold: the year picked on the slider; only rows with that exact value in
  the `YrSold` column are plotted.

  NOTE(review): this reuses the name `barchart` from the earlier drop-down
  cell and shadows it; kept unchanged here so existing references still work.
  """
  # Sort by cluster so the legend entries appear in a stable order.
  filtered = data[data['YrSold'] == YrSold].sort_values('Cluster')
  # Cluster ids are labels, not quantities: as text they get one distinct
  # colour each instead of a continuous colour scale.
  filtered = filtered.assign(Cluster=filtered['Cluster'].astype(str))
  fig = px.scatter(filtered, x='LotArea', y='SalePrice', color='Cluster',
                   title=f'Sales Price and Lot Area for houses sold in {YrSold}')
  fig.show()

interactive(children=(IntSlider(value=2008, description='YrSold', max=2010, min=2006), Output()), _dom_classes…