In [1]:
# !pip install dask[dataframe] --upgrade --quiet
# !pip install dask-ml --quiet
# !pip install aiohttp --quiet
# !pip install joblib --quiet
# !pip install dask distributed --upgrade --quiet
# !pip install distributed>=2.4.0 --quiet
# !pip install -U ipykernel --quiet

## Big Data Day 2 Afternoon Assignment

In this assignment, we will learn about machine learning with Dask. We will use the market basket dataset loaded below and cluster our data.

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
basket = dd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Data%20Sets%20Big%20Data/Mall_Customers.csv', dtype={"Gender": "category"})

In [16]:
basket.compute()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


To cluster our data, we must first examine and process it. The first step is to check for missing data. Do this in the cell below. If there is missing data, drop all rows containing missing data.

In [17]:
# Answer below:
basket.compute().info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   CustomerID              200 non-null    int64   
 1   Gender                  200 non-null    category
 2   Age                     200 non-null    int64   
 3   Annual Income (k$)      200 non-null    int64   
 4   Spending Score (1-100)  200 non-null    int64   
dtypes: category(1), int64(4)
memory usage: 6.7 KB


In [18]:
basket.dropna().compute().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   CustomerID              200 non-null    int64   
 1   Gender                  200 non-null    category
 2   Age                     200 non-null    int64   
 3   Annual Income (k$)      200 non-null    int64   
 4   Spending Score (1-100)  200 non-null    int64   
dtypes: category(1), int64(4)
memory usage: 8.1 KB


Next, we will get rid of the customer ID column. Drop the column and assign the resulting dataframe to a new variable.

In [38]:
# Answer below:
df = basket.drop(columns='CustomerID')
df

Unnamed: 0_level_0,Gender,Age,Annual Income (k$),Spending Score (1-100)
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,category[unknown],int64,int64,int64
,...,...,...,...


We will now create a dummy variable from the gender variable.

In [39]:
# Answer below:
dummies = dd.get_dummies(data=df.categorize(), columns=['Gender'], drop_first=True)

In [40]:
dummies.compute()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
0,19,15,39,1
1,21,15,81,1
2,20,16,6,0
3,23,16,77,0
4,31,17,40,0
...,...,...,...,...
195,35,120,79,0
196,45,126,28,0
197,32,126,74,1
198,32,137,18,1


Since the data has different scales, we will scale all columns using min max scaling. We will write our own min max scaling function since the minmax scaler will produce numpy arrays instead of dask arrays. Recall that min max scaling requires finding the min and the max. We subtract the min from each observation and divide by the difference between the max and the min. Complete the function below.

In [44]:
df.columns

Index(['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)'], dtype='object')

In [38]:
df['Annual Income (k$)'].min().compute()
df['Annual Income (k$)'].max().compute()

137

In [5]:
# Answer below:
def dask_min_max(x):
  max_ = dummies[x].max().compute()
  min_ = dummies[x].min().compute()
  out = (dummies[x] - min_) / (max_ - min_)
  dummies[x] = out
  return dummies

Transform all columns using the scaler you wrote above.

In [6]:
# Answer below:
scaled = dask_min_max(['Annual Income (k$)','Spending Score (1-100)','Age'])


In [7]:
scaled.compute()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Male
0,0.019231,0.000000,0.387755,1
1,0.057692,0.000000,0.816327,1
2,0.038462,0.008197,0.051020,0
3,0.096154,0.008197,0.775510,0
4,0.250000,0.016393,0.397959,0
...,...,...,...,...
195,0.326923,0.860656,0.795918,0
196,0.519231,0.909836,0.275510,0
197,0.269231,0.909836,0.744898,1
198,0.269231,1.000000,0.173469,1


Import the dask clustering function and cluster the data. Use 3 clusters and print the centroids.

In [15]:
!pip install scikit-learn==0.23.2 --quiet

In [9]:
from dask_ml.cluster import KMeans

In [16]:
# Answer below
clust = KMeans(n_clusters=3)

In [46]:
clust.fit(scaled.compute())
clust.cluster_centers_

array([[0.18717949, 0.38870674, 0.65510204, 1.        ],
       [0.38650412, 0.36270492, 0.51557945, 0.        ],
       [0.66234347, 0.38543652, 0.30659706, 1.        ]])

In [47]:
clust.fit(dummies.compute())
clust.cluster_centers_

array([[40.32520325, 44.15447154, 49.82926829,  0.40650407],
       [32.69230769, 86.53846154, 82.12820513,  0.46153846],
       [40.39473684, 87.        , 18.63157895,  0.52631579]])