# 准备环境

In [None]:
%%bash

# Install PyAlink unless it is already present.
# pip3 is used for BOTH the presence check and the install, so the two steps
# always look at the same Python environment.
if [ "$(pip3 freeze | grep -c pyalink)" -gt 0 ]
then
    echo "PyAlink already installed."
else
    # Ask a geolocation service about this machine's public IP in order to pick
    # a package index. -s hides curl's progress meter and --max-time bounds the
    # wait when the service is unreachable; on any failure IP_LOCATION is left
    # empty and the default index is used. The URL is quoted because it
    # contains '?', which the shell would otherwise treat as a glob character.
    IP_LOCATION=$(curl -s --max-time 10 "http://ip-api.com/json/?fields=1" || true)
    echo "$IP_LOCATION"
    echo "Installing PyAlink..."
    if [[ "$IP_LOCATION" == *"China"* ]]
    then
        echo "Use mirror in China"
        INDEX_ARGS=(-i https://mirrors.aliyun.com/pypi/simple/)
    else
        echo "Use default mirror"
        INDEX_ARGS=()
    fi
    # Report success only when pip actually succeeded; otherwise fail the cell
    # so the problem is visible here instead of as an ImportError later on.
    if pip3 install pyalink "${INDEX_ARGS[@]}"
    then
        echo "PyAlink installed."
    else
        echo "PyAlink installation failed." >&2
        exit 1
    fi
fi

In [None]:
# Star-import PyAlink's public names. The functions called below and the
# operator classes used by later cells (BatchOperator, VectorAssembler, KMeans,
# Pipeline) are not imported anywhere else in this notebook, so they
# presumably all come from this import.
from pyalink.alink import *
# NOTE(review): resetEnv() presumably discards any previously configured Alink
# environment so that this cell can be re-run safely; confirm against the
# PyAlink documentation.
resetEnv()
# NOTE(review): presumably starts a local execution environment, with the
# first argument (1) being the parallelism; TODO confirm.
useLocalEnv(1, config=None)

# 数据准备

In [None]:
# Build a six-row toy dataset: an id column plus three feature columns.
# Rows 0-2 have feature values near 0 and rows 3-5 have values near 9.
import numpy as np
import pandas as pd

# Column-name constants shared with the later cells.
FEATURE_COLS = ["f0", "f1", "f2"]
VECTOR_COL = "vec"
PRED_COL = "pred"

data = np.array([
    [0, 0.0, 0.0, 0.0],
    [1, 0.1, 0.1, 0.1],
    [2, 0.2, 0.2, 0.2],
    [3, 9.0, 9.0, 9.0],
    [4, 9.1, 9.1, 9.1],
    [5, 9.2, 9.2, 9.2],
])

# Name the array's columns directly; the order is id, f0, f1, f2.
df = pd.DataFrame(data, columns=["id"] + FEATURE_COLS)

# Hand the frame to Alink together with an explicit all-double schema.
inOp = BatchOperator.fromDataframe(
    df,
    schemaStr='id double, f0 double, f1 double, f2 double',
)

# 数据预处理

In [None]:
# Configure the stage that reads the FEATURE_COLS columns and writes its
# result to the VECTOR_COL column.
vectorAssembler = VectorAssembler()
vectorAssembler = vectorAssembler.setSelectedCols(FEATURE_COLS)
vectorAssembler = vectorAssembler.setOutputCol(VECTOR_COL)

# 聚类训练

In [None]:
# Configure the KMeans stage: read from VECTOR_COL, use k = 2, and write the
# prediction to PRED_COL.
kMeans = KMeans()
kMeans = kMeans.setVectorCol(VECTOR_COL)
kMeans = kMeans.setK(2)
kMeans = kMeans.setPredictionCol(PRED_COL)

# 数据预测

In [None]:
# Chain the two stages: vector assembly first, then KMeans.
pipeline = Pipeline()
pipeline = pipeline.add(vectorAssembler)
pipeline = pipeline.add(kMeans)

# Fit on the sample data, then apply the fitted pipeline to the same data.
pipelineModel = pipeline.fit(inOp)
predictions = pipelineModel.transform(inOp)

# Last expression of the cell: collect at most 9 rows for display.
predictions.firstN(9).collectToDataframe()