# 准备环境

In [1]:
# Bring the PyAlink API into the notebook namespace; every later cell relies on
# the names this wildcard import provides.
from pyalink.alink import *

# Drop any previously configured environment, then start a local one with a
# parallelism of 1. The call stays last so the cell still echoes its result.
resetEnv()
useLocalEnv(parallelism=1, config=None)


Use one of the following command to start using pyalink:
使用以下一条命令来开始使用 pyalink：
 - useLocalEnv(parallelism, flinkHome=None, config=None)
 - useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.
使用 resetEnv() 来重置运行环境，并切换到另一个。

JVM listening on 127.0.0.1:60201


JavaObject id=o6

# 数据准备

In [2]:
## Location and schema of the review-rating CSV
URL = "http://alink-dataset.cn-hangzhou.oss.aliyun-inc.com/csv/review_rating_train.csv"
SCHEMA_STR = "review_id bigint, rating5 bigint, rating3 bigint, review_context string"

## Column names shared by the feature, training and evaluation cells
LABEL_COL = "rating5"
TEXT_COL = "review_context"
VECTOR_COL = "vec"
PRED_COL = "pred"
PRED_DETAIL_COL = "predDetail"

## CSV reader: fields are delimited by the literal "_alink_" token and the
## quote character is set to None (presumably disabling quote handling --
## confirm against the CsvSourceBatchOp docs).
source = (
    CsvSourceBatchOp()
    .setFilePath(URL)
    .setSchemaStr(SCHEMA_STR)
    .setFieldDelimiter("_alink_")
    .setQuoteChar(None)
)

## Split with fraction 0.9: the main output is used for training, side output 0
## for testing (presumably the remaining rows -- confirm).
splitter = SplitBatchOp().setFraction(0.9)
trainData = splitter.linkFrom(source)
testData = trainData.getSideOutput(0)

# 特征工程

In [3]:
# Text-processing stages. The first two read and overwrite TEXT_COL; the last
# one reads TEXT_COL and writes its result to VECTOR_COL.
segmenter = Segment().setSelectedCol(TEXT_COL)
stopWordsFilter = StopWordsRemover().setSelectedCol(TEXT_COL)
hashVectorizer = (
    DocHashCountVectorizer()
    .setFeatureType("WORD_COUNT")
    .setSelectedCol(TEXT_COL)
    .setOutputCol(VECTOR_COL)
)

# Assemble the stages in execution order: segment -> remove stop words -> vectorize.
pipeline = Pipeline().add(segmenter).add(stopWordsFilter).add(hashVectorizer)

# 模型训练

In [4]:
## Naive Bayes classifier: reads the feature vector and label columns...
naiveBayes = NaiveBayes().setVectorCol(VECTOR_COL).setLabelCol(LABEL_COL)
## ...and writes the predicted label plus the prediction-detail column.
naiveBayes = naiveBayes.setPredictionCol(PRED_COL).setPredictionDetailCol(PRED_DETAIL_COL)

## Append the classifier to the feature pipeline and fit on the training split.
## NOTE(review): pipeline.add() appears to modify `pipeline` in place, so
## re-running this cell without re-running the pipeline cell would probably
## append a second NaiveBayes stage -- confirm and rebuild the pipeline if so.
trainingPipeline = pipeline.add(naiveBayes)
model = trainingPipeline.fit(trainData)

# 数据预测评估

In [5]:
## Apply the fitted pipeline model to the held-out split.
predict = model.transform(testData)

## Multi-class evaluator configured with the label column and the
## prediction-detail column produced by the classifier.
evaluator = (
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL)
    .setPredictionDetailCol(PRED_DETAIL_COL)
)

## Link the predictions into the evaluator and collect the metrics object.
metrics = evaluator.linkFrom(predict).collectMetrics()

# 打印评估结果

In [6]:
# Report table: (caption, metric getter, positional arguments for the getter).
# Getters are only invoked inside the loop, so each metric is fetched right
# before its line is printed, in the order listed here.
metricReport = [
    ("ConfusionMatrix:", metrics.getConfusionMatrix, ()),
    ("LabelArray:", metrics.getLabelArray, ()),
    ("LogLoss:", metrics.getLogLoss, ()),
    ("Accuracy:", metrics.getAccuracy, ()),
    ("Kappa:", metrics.getKappa, ()),
    ("MacroF1:", metrics.getMacroF1, ()),
    # Metrics restricted to the label "1"
    ("Label 1 Accuracy:", metrics.getAccuracy, ("1",)),
    ("Label 1 Kappa:", metrics.getKappa, ("1",)),
    ("Label 1 Precision:", metrics.getPrecision, ("1",)),
]

for caption, getter, getterArgs in metricReport:
    print(caption, getter(*getterArgs))


ConfusionMatrix: [[4987, 327, 229, 204, 292], [28, 1223, 164, 147, 108], [1, 1, 269, 10, 11], [0, 0, 0, 10, 0], [0, 2, 1, 2, 83]]
LabelArray: ['5', '4', '3', '2', '1']
LogLoss: 2.330945631084851
Accuracy: 0.8114582047166317
Kappa: 0.6190950197563011
MacroF1: 0.5123859853163818
Label 1 Accuracy: 0.9486356340288925
Label 1 Kappa: 0.27179135595030096
Label 1 Precision: 0.9431818181818182
