# Interactive Visualization Assignment (Plotly + IPyWidgets)

In [1]:
import pandas as pd
import plotly.express as px
from ipywidgets import interact
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MaxAbsScaler
import plotly.graph_objects as go

The following makes your Colab session act more like a Jupyter notebook so that Plotly and ipywidgets objects display correctly. If you are using Jupyter, comment it out. (It needs to be called at the top of every cell that outputs a plot that uses widgets.)

```
configure_plotly_browser_state()
```

From [this StackOverflow answer](https://stackoverflow.com/a/47230966).

In [None]:
  # Colab-only display workaround, kept commented out so it does not run here.
  # NOTE(review): every line after the `def` sits at the same comment indent as
  # the `def` itself, so the function body must be re-indented under the `def`
  # before this cell is re-enabled.
  # def configure_plotly_browser_state():
  # import IPython
  # display(IPython.core.display.HTML('''
  #     <script src="/static/components/requirejs/require.js"></script>
  #     <script>
  #       requirejs.config({
  #         paths: {
  #           base: '/static/base',
  #           plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
  #         },
  #       });
  #     </script>
  #     '''))
  # from plotly.offline import init_notebook_mode
  # init_notebook_mode(connected=False)

### Import the housing.csv data set.

In [2]:
def get_df(url):
  """Read the CSV located at ``url`` and return it as a pandas DataFrame.

  ``url`` is handed straight to ``pd.read_csv``, so anything that function
  accepts (URL, local path, file-like object) works here.
  """
  return pd.read_csv(url)

In [6]:
# Download the housing CSV and preview its first rows.
url = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/housing.csv'
data = get_df(url)  # raw frame; later cells read from `data`
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,...,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,...,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,...,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,...,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,...,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


### Separate out all the numeric fields into their own data set.

Remove the Id field, as it should be categorical, and the GarageYrBlt and LotFrontage fields, which contain many nulls. Drop any remaining nulls, but only for fields that are numeric. Hint: use subset. Then create a new dataframe with only the numeric fields in it.

In [9]:
# Build the numeric-only frame used for scaling and clustering.
# select_dtypes(include='number') matches every integer/float width, whereas
# comparing dtypes against the built-in int/float only matches the platform's
# default widths.
# Id is an identifier rather than a measurement; GarageYrBlt and LotFrontage
# are removed because they contain many nulls. Rows that still contain a null
# in any remaining numeric column are dropped.
nums = (
    data.select_dtypes(include='number')
    .drop(columns=['Id', 'GarageYrBlt', 'LotFrontage'])
    .dropna()
)
nums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452 entries, 0 to 1459
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1452 non-null   int64  
 1   LotArea        1452 non-null   int64  
 2   OverallQual    1452 non-null   int64  
 3   OverallCond    1452 non-null   int64  
 4   YearBuilt      1452 non-null   int64  
 5   YearRemodAdd   1452 non-null   int64  
 6   MasVnrArea     1452 non-null   float64
 7   BsmtFinSF1     1452 non-null   int64  
 8   BsmtFinSF2     1452 non-null   int64  
 9   BsmtUnfSF      1452 non-null   int64  
 10  TotalBsmtSF    1452 non-null   int64  
 11  1stFlrSF       1452 non-null   int64  
 12  2ndFlrSF       1452 non-null   int64  
 13  LowQualFinSF   1452 non-null   int64  
 14  GrLivArea      1452 non-null   int64  
 15  BsmtFullBath   1452 non-null   int64  
 16  BsmtHalfBath   1452 non-null   int64  
 17  FullBath       1452 non-null   int64  
 18  HalfBath

In [11]:
# Dimensions of the raw frame (rows, columns), for comparison with `nums`.
data.shape

(1460, 81)

### Normalize the numeric data using Scikit-Learn's MaxAbsScaler.

In [14]:
# Scale the numeric frame with MaxAbsScaler; the scaled values are stored in
# `nummas`, which the clustering cells below consume.
scale = MaxAbsScaler()
nummas = scale.fit_transform(nums)

### Iteratively K-Means cluster the normalized data and generate an interactive line chart showing the average silhouette score for each number of clusters (2 through 20).

In [19]:
# Fit K-Means for every cluster count from 2 through 20 and record the average
# silhouette score obtained for each count.
K = range(2, 21)  # range() excludes its stop value, so 21 is needed to reach k=20
metric = 'minkowski'

scores = []
for k in K:
  # fit_predict fits the model and returns the labels in one pass, so a
  # separate .fit() call is unnecessary. A fixed random_state (and explicit
  # n_init) makes the scores repeatable between runs.
  kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
  y_pred = kmeanModel.fit_predict(nummas)

  scores.append({
      'cluster': k,
      'score': silhouette_score(nummas, y_pred, metric=metric),
  })

# Build the frame once from the collected records; DataFrame.append was
# removed in pandas 2.0 and re-copied the frame on every iteration.
results = pd.DataFrame(scores, columns=['cluster', 'score'])

In [20]:
# Line chart of the silhouette score per cluster count. The original centred
# the title position without ever supplying a title, so the figure was unlabelled.
fig = px.line(
    results,
    x='cluster',
    y='score',
    template='none',
    title='Average silhouette score by number of clusters',
    labels={'cluster': 'Number of clusters (k)', 'score': 'Average silhouette score'},
)
fig.update_layout(title_x=0.5)  # centre the title horizontally
fig.show()

### Choose a number of clusters, run KMeans with that value for k on the scaled data, and add a column to the original housing data set containing the cluster that each record is assigned to.

In [23]:
# Fit the final model with the chosen number of clusters. fit_predict both fits
# and labels in one pass; the seed keeps the cluster assignment stable across runs.
kmeanModel = KMeans(n_clusters=2, n_init=10, random_state=42)
y_pred = kmeanModel.fit_predict(nummas)

# nummas was produced from nums row-for-row, so the labels line up positionally.
nums['clusters'] = y_pred
nums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452 entries, 0 to 1459
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1452 non-null   int64  
 1   LotArea        1452 non-null   int64  
 2   OverallQual    1452 non-null   int64  
 3   OverallCond    1452 non-null   int64  
 4   YearBuilt      1452 non-null   int64  
 5   YearRemodAdd   1452 non-null   int64  
 6   MasVnrArea     1452 non-null   float64
 7   BsmtFinSF1     1452 non-null   int64  
 8   BsmtFinSF2     1452 non-null   int64  
 9   BsmtUnfSF      1452 non-null   int64  
 10  TotalBsmtSF    1452 non-null   int64  
 11  1stFlrSF       1452 non-null   int64  
 12  2ndFlrSF       1452 non-null   int64  
 13  LowQualFinSF   1452 non-null   int64  
 14  GrLivArea      1452 non-null   int64  
 15  BsmtFullBath   1452 non-null   int64  
 16  BsmtHalfBath   1452 non-null   int64  
 17  FullBath       1452 non-null   int64  
 18  HalfBath

### Create an interactive bar chart that shows the average SalePrice of a property by cluster.

You will need to aggregate the data by cluster and average the sale prices before generating your visualization.

In [32]:
# Average SalePrice per cluster.
avg = nums.groupby(by='clusters')['SalePrice'].mean()

# Plot the aggregated averages. Passing the raw `nums` rows instead would stack
# every individual sale, so each bar would show the cluster total, not its mean.
fig = px.bar(
    avg.reset_index(),
    x='clusters',
    y='SalePrice',
    title='Average SalePrice by cluster',
)
fig.show()

### Create another bar chart where the bars are broken down and color-coded by the year the property was sold.

You will need to convert the YrSold field to be categorical in order to separate the bars based on that field.

In [35]:
# Treat the sale year as a category so each year gets its own discrete colour.
nums['YrSold'] = nums['YrSold'].astype('category')

# Average SalePrice per (cluster, year). Plotting the raw rows would stack every
# sale inside each bar and show totals rather than averages.
# observed=True skips cluster/year combinations that have no rows.
avg_by_year = (
    nums.groupby(['clusters', 'YrSold'], observed=True)['SalePrice']
    .mean()
    .reset_index()
)

fig = px.bar(avg_by_year, x='clusters', y='SalePrice', color='YrSold')
fig.update_layout(barmode='group')  # one bar per year, side by side
fig.show()

### Add a drop-down widget to the multi-bar chart you created above that lets you choose between 4 numeric fields to represent on the Y axis. 

In [45]:
@interact(
    Metric=['BsmtFullBath', 'GarageArea', 'SalePrice', 'OverallCond'],
    YearBuilt=(nums['YearBuilt'].min(), nums['YearBuilt'].max()),
)
def barchart(Metric, YearBuilt):
  """Draw a grouped bar chart of the mean of ``Metric`` per cluster and sale year.

  Parameters
  ----------
  Metric : str
      Column of ``nums`` to average; chosen from the drop-down.
  YearBuilt : int
      Only properties whose ``YearBuilt`` equals this slider value are included.
  """
  filtered = nums[nums['YearBuilt'] == YearBuilt]

  # Average only the selected column instead of every column in the frame;
  # observed=True leaves out cluster/year combinations with no rows.
  grouped = (
      filtered.groupby(['clusters', 'YrSold'], observed=True)[Metric]
      .mean()
      .reset_index()
  )
  # Plain object dtype keeps the year a discrete colour in the plot.
  grouped['YrSold'] = grouped['YrSold'].astype('object')

  fig = px.bar(grouped, x='clusters', y=Metric, color='YrSold', template='none')
  fig.update_layout(barmode='group')
  fig.show()



interactive(children=(Dropdown(description='Metric', options=('BsmtFullBath', 'GarageArea', 'SalePrice', 'Over…

### Create a scatter plot that shows the relationship between SalePrice and LotArea, color-coded by cluster. Add a slider that filters the data by the year the property was sold.

In [58]:
# An ordered categorical is required for .min()/.max(), which set the slider range.
nums['YrSold'] = nums['YrSold'].cat.as_ordered()

@interact(YrSold=(nums['YrSold'].min(), nums['YrSold'].max()))
def scatterplot(YrSold):
  """Scatter SalePrice against LotArea for properties sold in ``YrSold``.

  Parameters
  ----------
  YrSold : int
      Sale year selected with the slider; only rows with this year are plotted.
  """
  filtered = nums[nums['YrSold'] == YrSold]
  # `clusters` holds integer labels. Cast them to strings so each cluster gets
  # its own discrete colour instead of a position on a continuous colour scale.
  filtered = filtered.assign(clusters=filtered['clusters'].astype(str))

  fig = px.scatter(
      filtered,
      x="SalePrice",
      y="LotArea",
      color="clusters",
      template="none",
  )

  # Thin black outline so overlapping markers stay distinguishable.
  fig.update_traces(marker_line_color="black", marker_line_width=1)

  fig.show()

interactive(children=(IntSlider(value=2008, description='YrSold', max=2010, min=2006), Output()), _dom_classes…