# Interactive Visualization Assignment (Plotly + IPyWidgets)

In [1]:
import pandas as pd
import plotly.express as px
from ipywidgets import interact
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MaxAbsScaler

The following makes your colab session act more like a jupyter notebook for displaying plotly and ipywidgets objects correctly. If you are using jupyter, comment it out. (It needs to be called at the top of every cell that outputs a plot that uses widgets.)

```
configure_plotly_browser_state()
```

From [this StackOverflow answer](https://stackoverflow.com/a/47230966).

In [2]:
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=False)

### Import the housing.csv data set.

In [3]:
data = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/housing.csv"
)
data.shape

(1460, 81)

### Separate out all the numeric fields into their own data set.

Remove the Id field as it should be categorical and the GarageYrBlt and LotFrontage fields that contains many nulls. Drop any remaining nulls, but only for fields that are numeric. Hint: use subset. Then create a new dataframe with only the numeric fields in it.

In [7]:
df_num = data.select_dtypes(include='number')
df_num = df_num.dropna()
df_num.shape

(1121, 38)

### Normalize the numeric data using Scikit-Learn's MaxAbsScaler.

In [8]:
scale = MaxAbsScaler()
X_scale = pd.DataFrame(scale.fit_transform(df_num), columns=df_num.columns)

In [9]:
X_scale.isnull().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

In [10]:
X_scale.dropna(inplace=True)

### Iteratively K-Means cluster the normalized data and generate an interactive line chart showing the average silhouette score for each number of clusters (2 through 20).

In [11]:
n_clusters = range(2, 21)
scores = []
for k in n_clusters:
    kmeans = KMeans(n_clusters=k)
    clusters = kmeans.fit_predict(X_scale)
    score = silhouette_score(X_scale, clusters)
    scores.append(score)
scores_df = pd.DataFrame(columns=["Clusters", "Silhouette Score"])

In [12]:
scores_df.Clusters = n_clusters
scores_df["Silhouette Score"] = scores
scores_df

Unnamed: 0,Clusters,Silhouette Score
0,2,0.146997
1,3,0.118765
2,4,0.11148
3,5,0.110838
4,6,0.11662
5,7,0.111552
6,8,0.110052
7,9,0.104845
8,10,0.106751
9,11,0.109803


In [13]:
px.line(
    scores_df,
    x="Clusters",
    y="Silhouette Score",
    title="Average Silhouette Score for k Clusters",
)

### Choose a number of clusters, run KMeans with that value for k on the scaled data, and add a column to the original housing data set containing the cluster that each record is assigned to.

In [17]:
kmeans = KMeans(n_clusters=18)
df_num["Cluster"] = kmeans.fit_predict(X_scale)
df_num.reset_index()
df_num

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,Cluster
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,0,150,856,856,854,0,1710,1,0,2,1,3,1,8,0,2003.0,2,548,0,61,0,0,0,0,0,2,2008,208500,3
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,0,284,1262,1262,0,0,1262,0,1,2,0,3,1,6,1,1976.0,2,460,298,0,0,0,0,0,0,5,2007,181500,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,0,434,920,920,866,0,1786,1,0,2,1,3,1,6,1,2001.0,2,608,0,42,0,0,0,0,0,9,2008,223500,3
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,0,540,756,961,756,0,1717,1,0,1,0,3,1,7,1,1998.0,3,642,0,35,272,0,0,0,0,2,2006,140000,11
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,0,490,1145,1145,1053,0,2198,1,0,2,1,4,1,9,1,2000.0,3,836,192,84,0,0,0,0,0,12,2008,250000,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,953,953,953,694,0,1647,0,0,2,1,3,1,7,1,1999.0,2,460,0,40,0,0,0,0,0,8,2007,175000,4
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,589,1542,2073,0,0,2073,1,0,2,0,3,1,7,2,1978.0,2,500,349,0,0,0,0,0,0,2,2010,210000,9
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,877,1152,1188,1152,0,2340,0,0,2,0,4,1,9,2,1941.0,1,252,0,60,0,0,0,0,2500,5,2010,266500,17
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,0,1078,1078,0,0,1078,1,0,1,0,2,1,5,0,1950.0,1,240,366,0,112,0,0,0,0,4,2010,142125,6


### Create an interactive bar chart that shows the average SalePrice of a property by cluster.

You will need to aggregate the data by cluster and average the sale prices before generating your visualization.

In [18]:
px.bar(
    df_num.groupby(by=["Cluster"]).mean()["SalePrice"].reset_index(),
    x="Cluster",
    y="SalePrice",
    title="Average Sale Price Score by Cluster",
)

### Create another bar chart where the bars are broken down and color-coded by the year the property was sold.

You will need to convert the YrSold field to be categorical in order to separate the bars based on that field.

In [19]:
df_num["YrSold"] = df_num.YrSold.astype(str)

cluser_avg_sale_price = (
    df_num.groupby(by=["Cluster", "YrSold"]).mean()["SalePrice"].reset_index()
)
cluser_avg_sale_price

Unnamed: 0,Cluster,YrSold,SalePrice
0,0,2006,137272.222222
1,0,2007,143612.500000
2,0,2008,150081.818182
3,0,2009,149225.000000
4,0,2010,116150.000000
...,...,...,...
85,17,2006,158587.444444
86,17,2007,158567.857143
87,17,2008,175400.000000
88,17,2009,166395.454545


In [20]:
fig = px.bar(
    cluser_avg_sale_price,
    x="Cluster",
    y="SalePrice",
    color="YrSold",
    title="Average Sale Price by Cluster and Year Sold",
)
fig.update_layout(barmode="group")
fig.show()

### Add a drop-down widget to the multi-bar chart you created above that lets you choose between 4 numeric fields to represent on the Y axis. 

In [21]:
@interact(Metric=["SalePrice", "LotArea", "OverallQual", "OverallCond"])
def barchart(Metric):

    cluser_avg_metric = (
        df_num.groupby(by=["Cluster", "YrSold"]).mean()[Metric].reset_index()
    )

    fig = px.bar(
        cluser_avg_metric,
        x="Cluster",
        y=Metric,
        color="YrSold",
        title=f"Average {Metric} by Cluster and Year Sold",
    )
    fig.update_layout(barmode="group")
    fig.show()

interactive(children=(Dropdown(description='Metric', options=('SalePrice', 'LotArea', 'OverallQual', 'OverallCâ€¦

### Create a scatter plot that shows the relationship between SalesPrice and LotArea, color-coded by cluster. Add a slider that filters the data by the year the property was sold.

In [22]:
df_num.Cluster = df_num.Cluster.astype(str)
df_num.YrSold = df_num.YrSold.astype(int)

@interact(Year = (df_num.YrSold.min(), df_num.YrSold.max()))
def barchart(Year):
    fig = px.scatter(
        df_num.loc[lambda x: x.YrSold == Year],
        x="SalePrice",
        y="LotArea",
        color="Cluster",
        title=f"Sale Price vs Lot Area for Year {Year}",
    )
    
    fig.show()

interactive(children=(IntSlider(value=2008, description='Year', max=2010, min=2006), Output()), _dom_classes=(â€¦