In [1]:
import numpy
import pandas
import joblib

# Instructions

- Read **the train data** from the CSV file and properly set the index
- Use `joblib` to load the trained model and print out the model parameters
- Put the model parameters in a DataFrame named `cluster_centers` with proper indices

In [2]:
# Load the training features and use the `id` column as the row index.
data_train = pandas.read_csv('./data/features.train.csv', index_col='id')
data_train

Unnamed: 0_level_0,feature_1,feature_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
253,2.1592,0.0103
667,-5.1157,0.8180
85,1.8528,-0.0038
969,-6.3364,-1.9423
75,-5.8828,-2.6060
...,...,...
835,2.1841,0.0434
192,-8.5461,-8.3446
629,-7.6355,-6.3077
559,2.0941,1.1079


In [3]:
# Load the already-trained model from disk; the bare `model` expression below
# displays its repr as the cell output.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
model = joblib.load('model/model.joblib')
model

In [4]:
# Print the model's fitted cluster centers as a raw array: one row per
# cluster, one value per feature.
print(f"model.cluster_centers_:\n {model.cluster_centers_}")

model.cluster_centers_:
 [[-4.84026432  0.107323  ]
 [-8.80518377 -5.36452868]
 [ 1.7568625   0.42275588]]


In [5]:
# One row per cluster and one column per feature; the row index is labelled
# `cluster` so that it reads as the cluster id.
cluster_centers = (
    pandas.DataFrame(model.cluster_centers_, columns=['feature_1', 'feature_2'])
    .rename_axis(index='cluster')
)
cluster_centers

Unnamed: 0_level_0,feature_1,feature_2
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-4.840264,0.107323
1,-8.805184,-5.364529
2,1.756862,0.422756


# Instructions
- Use `.predict` to calculate clusters for all training samples
- Properly put the clusters into `pandas.DataFrame`, with a column named `cluster`

In [6]:
# Predict the cluster label of every training sample from its two features.
clusters_from_sklearn = model.predict(
    data_train.loc[:, ['feature_1', 'feature_2']]
)
clusters_from_sklearn


array([2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 0, 1, 2, 2, 2, 1, 1, 2, 0, 1, 2, 1,
       2, 1, 2, 0, 2, 1, 2, 0, 2, 1, 1, 1, 2, 0, 2, 1, 2, 0, 2, 1, 0, 0,
       1, 2, 0, 1, 2, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 1, 0, 2, 0, 0, 2,
       2, 0, 2, 0, 2, 1, 2, 2, 1, 0, 0, 0, 2, 0, 0, 1, 0, 1, 2, 0, 2, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 2, 0, 1, 1, 1, 2, 1, 0, 2,
       1, 1, 2, 1, 1, 2, 2, 1, 0, 2, 1, 2, 1, 2, 0, 0, 2, 0, 1, 2, 1, 0,
       0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 1, 2, 0, 2, 2, 1, 0, 2, 0, 1, 0, 2,
       0, 0, 0, 1, 2, 2, 1, 0, 1, 1, 1, 0, 0, 2, 2, 1, 0, 2, 1, 2, 1, 1,
       0, 2, 0, 1, 1, 1, 0, 0, 0, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1,
       2, 2, 0, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 0, 2, 0, 2, 2, 1, 2, 1, 0,
       0, 1, 0, 2, 2, 0, 1, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 0, 2,
       1, 1, 1, 1, 1, 1, 0, 2, 0, 0, 2, 2, 1, 0, 1, 2, 1, 2, 1, 2, 2, 0,
       2, 1, 1, 2, 2, 0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 1, 1, 0, 0, 1, 2, 2,
       0, 0, 0, 0, 0, 2, 1, 0, 1, 1, 1, 1, 2, 0, 2,

In [7]:
# Wrap the predicted labels in a one-column DataFrame that reuses the training
# data's `id` index, so every label stays attached to its sample.
# Note: this rebinds `clusters_from_sklearn` from the raw array returned by
# `.predict` to a DataFrame.
clusters_from_sklearn = pandas.DataFrame(
    data = clusters_from_sklearn,
    index = data_train.index,
    columns = ['cluster']
)
clusters_from_sklearn

Unnamed: 0_level_0,cluster
id,Unnamed: 1_level_1
253,2
667,0
85,2
969,0
75,0
...,...
835,2
192,1
629,1
559,2


# Instructions
- Write a function that calculates the distances from a given sample point to the cluster centers
  - Hint: use the DataFrame `cluster_centers`, `.apply`
- Use the function to calculate the distances from a sample point at [0., 0.]
  - Also, indicate which cluster the sample belongs to
- Now, **manually calculate** the clusters for all training samples
  - Also, properly put the data into `pandas.DataFrame`, with a column named `cluster`
- Verify that the clusters from `.predict` and manual calculations are equal


In [8]:
def calculate_distances_from_cluster_centers(sample, centers=None):
    """Return the Euclidean distance from one sample to every cluster center.

    Parameters
    ----------
    sample : sequence or pandas.Series
        Feature values of a single point. A plain sequence is matched to the
        columns of `centers` by position; a Series is matched by label.
    centers : pandas.DataFrame, optional
        One row per cluster, one column per feature. When omitted, the
        notebook-level `cluster_centers` frame is used.

    Returns
    -------
    pandas.Series
        One distance per row of `centers`, indexed like `centers`.
    """
    if centers is None:
        # Resolved at call time so the function follows whatever the notebook
        # currently holds in `cluster_centers`.
        centers = cluster_centers
    # Per-feature offsets between the sample and each center.
    offsets = sample - centers
    # Square, sum across the feature columns, then take the root.
    return numpy.sqrt(numpy.square(offsets).sum(axis='columns'))

In [9]:
# Distances from the sample at the origin to every cluster center. The sample
# belongs to the cluster with the smallest distance, which is reported
# explicitly before the distances themselves are displayed.
distances_from_origin = calculate_distances_from_cluster_centers([0, 0])
print(f"The sample at [0, 0] belongs to cluster {distances_from_origin.idxmin()}")
distances_from_origin


cluster
0     4.841454
1    10.310646
2     1.807011
dtype: float64

In [10]:
# Apply the distance function to every training sample (one row at a time);
# the result has one row per sample and one column per cluster.
distances_from_calculations = data_train.loc[:, ['feature_1', 'feature_2']].apply(
    calculate_distances_from_cluster_centers,
    axis='columns',
)
distances_from_calculations

cluster,0,1,2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
253,7.000137,12.210917,0.576190
667,0.762185,7.199719,6.883918
85,6.693987,11.930215,0.437212
969,2.537593,4.219780,8.431749
75,2.906717,4.018682,8.218139
...,...,...,...
835,7.024655,12.247859,0.571352
192,9.228663,2.991312,13.528398
629,6.997561,1.502575,11.554891
559,7.006180,12.676227,0.763644


In [11]:
# For each sample (row), take the column label holding the smallest distance.
# The columns of `distances_from_calculations` are the cluster ids, so this
# yields the nearest cluster for every training sample.
clusters_from_calculation = distances_from_calculations.idxmin(axis='columns')

In [12]:
# Turn the per-sample nearest-cluster labels into a one-column DataFrame named
# `cluster`, the same layout as `clusters_from_sklearn`.
# Note: this rebinds `clusters_from_calculation` to a DataFrame.
# NOTE(review): presumably this relies on the incoming Series being unnamed;
# a named Series may be matched against `columns` by name instead — TODO confirm.
clusters_from_calculation = pandas.DataFrame(
    clusters_from_calculation,
    columns = ['cluster']
)
clusters_from_calculation

Unnamed: 0_level_0,cluster
id,Unnamed: 1_level_1
253,2
667,0
85,2
969,0
75,0
...,...
835,2
192,1
629,1
559,2


In [13]:
# Element-wise comparison of the two label frames, reduced per column.
clusters_match = (clusters_from_sklearn == clusters_from_calculation).all()
# Fail loudly on a full re-run instead of relying on someone reading the
# displayed output.
assert clusters_match.all(), "clusters from .predict and manual calculation differ"
clusters_match

cluster    True
dtype: bool