In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/community/neo4j/graph_paysim.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在Colab中运行
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/master/notebooks/community/neo4j/graph_paysim.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      查看GitHub上的代码
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/laeg/vertex-ai-samples/main/notebooks/community/neo4j/graph_paysim.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">在Vertex AI工作台中打开
    </a>
</td>
</table>

# 概述
在本笔记本中，您将学习如何使用Neo4j AuraDS创建图特征。然后，您将使用这些新特征在Vertex AI上解决一个分类问题。

## 数据集
此笔记本使用了经修改以适配Neo4j图数据库的PaySim数据集版本。PaySim是一个合成欺诈数据集。目标是识别给定交易是否构成欺诈。[数据集的原始版本](https://github.com/EdgarLopezPhD/PaySim)包含表格数据。

Neo4j对该数据集进行了修改，生成了一个图数据集，详见[此处](https://github.com/voutilad/PaySim)。我们预先生成了该数据集的一个副本，您可以在[此处](https://storage.googleapis.com/neo4j-datasets/paysim.dump)下载。您需要下载该数据集，然后将其上传到Neo4j AuraDS。AuraDS是一个在GCP上以服务形式提供的图数据科学工具。有关注册和上传数据集的说明可在[此处](https://github.com/neo4j-partners/aurads-paysim)找到。

## 费用
本教程使用谷歌云的收费组件：

* 云存储
* Vertex AI

了解[Vertex AI定价](https://cloud.google.com/vertex-ai/pricing)和[云存储定价](https://cloud.google.com/storage/pricing)，并使用[定价计算器](https://cloud.google.com/products/calculator/)基于您的预期使用量生成成本估算。

# 设置

## 设置您的开发环境
我们建议您在此笔记本中使用Colab。

## 设置您的Google Cloud项目

**无论您使用的是哪种笔记本环境，以下步骤都是必需的。**

1. [选择或创建一个Google Cloud项目](https://console.cloud.google.com/cloud-resource-manager)。当您第一次创建账户时，您将获得$300的免费信用额度，可用于支付计算/存储成本。

1. [确保为您的项目启用计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

1. [启用Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)。

1. 如果您在本地运行此笔记本，您需要安装[Cloud SDK](https://cloud.google.com/sdk)。

1. 在下面的单元格中输入您的项目ID。然后运行单元格，以确保Cloud SDK在本笔记本中的所有命令中使用正确的项目。

**注意**：Jupyter会将以`!`为前缀的行视为shell命令，并将以`$`为前缀的Python变量插入这些命令中。

## 安装额外的软件包
首先，您还需要安装一些软件包。

In [None]:
!pip install --quiet --upgrade graphdatascience==1.0.0

In [None]:
!pip install --quiet google-cloud-storage

In [None]:
!pip install --quiet google.cloud.aiplatform

## （仅限Colab）重新启动内核
在安装了额外的软件包之后，您需要重新启动笔记本内核，以便它可以找到这些软件包。当您运行此操作时，您可能会收到内核崩溃的通知。您可以忽略该通知。

In [None]:
import IPython

# Restart the kernel so that the freshly installed packages can be found.
# The notebook front end may report this as a kernel crash; that is expected.
IPython.Application.instance().kernel.do_shutdown(True)

# 使用Neo4j进行工作

## 定义与Neo4J相关的变量

您需要在下面输入来自您的AuraDS实例的凭据。您可以按照这个[步骤](https://github.com/neo4j-partners/aurads-paysim)获取您的凭据。

对于AuraDS，`DB_NAME`始终为`neo4j`。它不同于您在AuraDS控制台中为数据库租户指定的名称。

In [None]:
# Connection settings for the AuraDS instance. Replace the placeholders with
# the credentials of your own instance before running the cells below.
DB_URL = "neo4j+s://XXXXX.databases.neo4j.io"
DB_USER = "neo4j"
# Placeholder only: do not commit a real password in a shared notebook.
DB_PASS = "YOUR PASSWORD"
# Database name handed to gds.set_database() further down.
DB_NAME = "neo4j"

在这一部分，我们将连接到Neo4j并查看数据库。我们将使用Neo4j的图数据科学库在数据集中生成一些新特征。最后，我们将把数据加载到Pandas数据框中，以便将其放入GCP特征存储。

## 探索数据库

In [None]:
import pandas as pd
from graphdatascience import GraphDataScience

In [None]:
# Create the Graph Data Science client for the AuraDS instance.
# aura_ds=True makes the client apply the recommended non-default settings of
# the Python driver for AuraDS automatically.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS), aura_ds=True)

In [None]:
# Select the database named by DB_NAME on the client.
gds.set_database(DB_NAME)

现在，让我们稍微探索一下数据库中的数据，以了解我们需要处理的内容。

In [None]:
# node labels
# For each label yielded by db.labels(), a count query is assembled as a string
# and executed through apoc.cypher.run; the result has one (label, freq) row
# per label.
result = gds.run_cypher(
    """
CALL db.labels() YIELD label
CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as freq', {})
YIELD value
RETURN label, value.freq AS freq
"""
)

# Render the query result in the notebook.
display(result)

In [None]:
# relationship types
# Same approach as for the node labels: one dynamically built count query per
# relationship type, with the rows ordered by descending frequency.
result = gds.run_cypher(
    """
CALL db.relationshipTypes() YIELD relationshipType as type
CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as freq', {})
YIELD value
RETURN type AS relationshipType, value.freq AS freq
ORDER by freq DESC
"""
)

# Render the query result in the notebook.
display(result)

In [None]:
# transaction types
# Sums t.amount per transaction label (CashIn, CashOut, Payment, Debit,
# Transfer) and returns it as TotalMarketValue.
# NOTE(review): globalSum, globalCnt, scaleFactor and txCnt are computed by the
# query but never appear in the RETURN clause.
result = gds.run_cypher(
    """
      MATCH (t:Transaction)
      WITH sum(t.amount) AS globalSum, count(t) AS globalCnt
      WITH *, 10^3 AS scaleFactor
      UNWIND ['CashIn', 'CashOut', 'Payment', 'Debit', 'Transfer'] AS txType
        CALL apoc.cypher.run('MATCH (t:' + txType + ')
          RETURN sum(t.amount) as txAmount, count(t) AS txCnt', {})
        YIELD value
      RETURN txType,value.txAmount AS TotalMarketValue
    """
)

# Render the query result in the notebook.
display(result)

## 使用Neo4j创建基于图嵌入的新特征
首先，我们将使用Neo4j图数据科学（GDS）在内存中创建数据的图表示。

请注意，如果您收到一个说图已经存在的错误，那很可能是因为您之前已经运行过这段代码。您可以使用本笔记本中清理部分的命令来销毁它。

In [None]:
# Project an in-memory graph named "client_graph" from two Cypher queries.
# We get a tuple back with an object that represents the graph projection and the results of the GDS call
# - Node query: one row per Client with its id and the properties
#   num_transactions, total_transaction_amnt and is_fraudster.
# - Relationship query: Client -> Transaction -> Client paths collapsed into
#   "TRANSACTED_WITH" relationships carrying the summed transaction amount.
G, results = gds.graph.project.cypher(
    "client_graph",
    "MATCH (c:Client) RETURN id(c) as id, c.num_transactions as num_transactions, c.total_transaction_amnt as total_transaction_amnt, c.is_fraudster as is_fraudster",
    'MATCH (c:Client)-[:PERFORMED]->(t:Transaction)-[:TO]->(c2:Client) return id(c) as source, id(c2) as target, sum(t.amount) as amount, "TRANSACTED_WITH" as type ',
)

# Render the result of the projection call.
display(results)

现在我们可以从该图中生成嵌入。这是一个可以用于预测的新特征。我们使用的是FastRP，它比Node2Vec功能更全面、性能更高。您可以在这里了解更多信息：https://neo4j.com/docs/graph-data-science/current/algorithms/fastrp/。

In [None]:
# Run FastRP on the projected graph G and write the embedding of every node to
# the node property "embedding" of the projection (mutateProperty).
# embeddingDimension is 16 here; the EMBEDDING_DIMENSION constant used in the
# Vertex AI part of the notebook has to stay in sync with it.
results = gds.fastRP.mutate(
    G,
    relationshipWeightProperty="amount",
    iterationWeights=[0.0, 1.00, 1.00, 0.80, 0.60],
    featureProperties=["num_transactions", "total_transaction_amnt"],
    propertyRatio=0.25,
    nodeSelfInfluence=0.15,
    embeddingDimension=16,
    randomSeed=1,  # fixed seed passed to the algorithm
    mutateProperty="embedding",
)

# Show the outcome of the FastRP call itself. The previous code displayed
# `result`, which still held the output of the last exploration query.
display(results)

最后，我们将其导出到数据框中。

In [None]:
# Stream the listed node properties of the projection G back to the client.
# The pivot in the next cell relies on the columns nodeId, nodeProperty and
# propertyValue being present in the returned frame.
node_properties = gds.graph.streamNodeProperties(
    G, ["embedding", "num_transactions", "total_transaction_amnt", "is_fraudster"]
)

# Preview the first rows.
node_properties.head()

现在我们需要把这个数据框调整成更能代表我们分类问题的形式。

In [None]:
# Go from one row per (node, property) to one row per node with one column per
# property, and turn nodeId from the index back into a regular column.
x = node_properties.pivot(
    index="nodeId",
    columns="nodeProperty",
    values="propertyValue",
).reset_index()
# Remove the leftover "nodeProperty" label from the column axis.
x.columns.name = None
x.head()

is_fraudster的值为0或1。如果值为-9223372036854775808，则表示未标记，我们将删除它。

In [None]:
# Value that marks a node without an is_fraudster label.
unlabeled_marker = -9223372036854775808
# Keep only the nodes that carry a label.
is_labeled = x["is_fraudster"] != unlabeled_marker
x = x.loc[is_labeled]
x.head()

请注意嵌入行是一个数组。为了使该数据集更易消化，我们应该将其展平为多个独立特征：embedding_0、embedding_1、... embedding_n。

In [None]:
FEATURES_FILENAME = "features.csv"

# Expand the per-node embedding list into embedding_0 ... embedding_n columns.
# The new frame is built on x's own index: x lost rows when the unlabeled nodes
# were filtered out, so a fresh 0..n-1 index would no longer line up with it
# and the index-based merge below would attach embeddings to the wrong nodes
# and silently drop rows.
embeddings = pd.DataFrame(
    x["embedding"].values.tolist(), index=x.index
).add_prefix("embedding_")
merged = x.drop(columns=["embedding"]).merge(
    embeddings, left_index=True, right_index=True
)

# Feature store input: nodeId plus the embedding columns only.
features_df = merged.drop(
    columns=["is_fraudster", "num_transactions", "total_transaction_amnt"]
)
# Training input: everything except the node identifier.
train_df = merged.drop(columns=["nodeId"])

features_df.to_csv(FEATURES_FILENAME, index=False)

这个数据集太小了，无法在Vertex AI上用于AutoML表格数据。为了演示，我们会将数据重复10次。在实际项目中请不要这样做。

In [None]:
TRAINING_FILENAME = "train.csv"

# Stack ten copies of the training frame on top of each other and write the
# result without the index column.
repeated_train_df = pd.concat([train_df] * 10)
repeated_train_df.to_csv(TRAINING_FILENAME, index=False)

这就是全部！数据框现在有一个好的数据集，我们可以在GCP Vertex AI中使用。

# 使用Vertex AI与Neo4j数据

## 定义 Google 云变量
您需要为您的 GCP 环境设置一些变量。PROJECT_ID 和 STORAGE_BUCKET 是最关键的。其他变量可能会使用给定的默认值。

In [None]:
# Edit these variables!
# Project that owns all Vertex AI and Cloud Storage resources used below.
PROJECT_ID = "YOUR-PROJECT-ID"
# Bucket that receives the generated CSV files.
STORAGE_BUCKET = "YOUR-BUCKET-NAME"

# You can leave these defaults
REGION = "us-central1"
# Prefix under which the CSV files are stored inside the bucket.
STORAGE_PATH = "paysim"
# Must equal the embeddingDimension passed to FastRP in the Neo4j section.
EMBEDDING_DIMENSION = 16
FEATURESTORE_ID = "paysim"
# Identifier of the entity type created inside the feature store.
ENTITY_NAME = "payer"

In [None]:
import os

# Publish the project ID through the GCLOUD_PROJECT environment variable.
os.environ["GCLOUD_PROJECT"] = PROJECT_ID

验证您的Google Cloud账户

In [None]:
# Authenticate through Colab when the notebook runs there.
# google.colab can only be imported inside Colab, so an ImportError simply
# means another environment is in use. Any error raised by the authentication
# itself is no longer swallowed and surfaces to the user.
try:
    from google.colab import auth as google_auth
except ImportError:
    pass
else:
    google_auth.authenticate_user()

上传到GCP云存储桶

要将数据传送到Vertex AI，首先需要将其以CSV格式放入一个存储桶中。

In [None]:
from google.cloud import storage

# Cloud Storage client used for the bucket and upload cells below.
# No project is passed explicitly here.
client = storage.Client()

In [None]:
# Reuse the bucket when it already exists so that re-running this cell does not
# fail on the create call; lookup_bucket returns None when the bucket cannot be
# found, in which case it is created.
bucket = client.lookup_bucket(STORAGE_BUCKET)
if bucket is None:
    bucket = client.create_bucket(STORAGE_BUCKET)

In [None]:
# Upload our files to that bucket
# Object names use "/" as separator on every platform, so the name is built
# explicitly instead of with os.path.join, which would use "\" on Windows.
for filename in [FEATURES_FILENAME, TRAINING_FILENAME]:
    upload_path = f"{STORAGE_PATH}/{filename}"
    blob = bucket.blob(upload_path)
    blob.upload_from_filename(filename)

## 使用 Vertex AI 训练并部署模型
我们将使用工程化特征来训练一个 AutoML 表格数据模型，然后部署到一个端点。

In [None]:
from google.cloud import aiplatform

# Point the Vertex AI SDK at the project and region configured above.
aiplatform.init(project=PROJECT_ID, location=REGION)

# gs:// URIs always use "/" as separator, so the URI is assembled explicitly
# rather than with os.path.join, which is platform dependent.
training_data_uri = f"gs://{STORAGE_BUCKET}/{STORAGE_PATH}/{TRAINING_FILENAME}"

# Register the uploaded training CSV as a tabular dataset and wait until the
# creation has finished.
dataset = aiplatform.TabularDataset.create(
    display_name="paysim",
    gcs_source=training_data_uri,
)
dataset.wait()

print(f'\tDataset: "{dataset.display_name}"')
print(f'\tname: "{dataset.resource_name}"')

In [None]:
# Input columns for AutoML: the two transaction aggregates followed by
# embedding_0 ... embedding_<EMBEDDING_DIMENSION - 1>, all declared numeric.
other_column_names = ["num_transactions", "total_transaction_amnt"]
embedding_column_names = [f"embedding_{i}" for i in range(EMBEDDING_DIMENSION)]
all_columns = other_column_names + embedding_column_names
column_specs = dict.fromkeys(all_columns, "numeric")

# Define the classification training job; nothing runs until job.run() is called.
job = aiplatform.AutoMLTabularTrainingJob(
    display_name="train-paysim-automl-1",
    optimization_prediction_type="classification",
    column_specs=column_specs,
)

In [None]:
# Train the AutoML model on the dataset with is_fraudster as the label.
# The data is split 80 / 10 / 10 into training, validation and test parts, and
# the training budget is 1000 milli node hours.
model = job.run(
    dataset=dataset,
    target_column="is_fraudster",
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    model_display_name="paysim-prediction-model",
    disable_early_stopping=False,
    budget_milli_node_hours=1000,
)

In [None]:
# Deploy the trained model; the returned endpoint is used for the online
# prediction and is deleted again in the cleanup section.
endpoint = model.deploy(machine_type="n1-standard-4")

## 将数据加载到Vertex AI特征存储
在本节中，我们将把包含新工程特征的数据框加载到Vertex AI特征存储中。

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreServiceClient

# Regional API endpoint shared by the feature store clients in this notebook.
api_endpoint = f"{REGION}-aiplatform.googleapis.com"
fs_client = FeaturestoreServiceClient(client_options={"api_endpoint": api_endpoint})

# Resource names of the location, the feature store and the entity type.
resource_path = fs_client.common_location_path(PROJECT_ID, REGION)
fs_path = fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID)
entity_path = fs_client.entity_type_path(
    PROJECT_ID,
    REGION,
    FEATURESTORE_ID,
    ENTITY_NAME,
)

首先，让我们检查一下特征存储是否已经存在。

In [None]:
from grpc import StatusCode


def check_has_resource(callable):
    """Report whether the lookup performed by ``callable`` finds its resource.

    Args:
        callable: Zero-argument function that performs the lookup.

    Returns:
        True when ``callable`` returns without raising, False when it raises
        an exception whose ``grpc_status_code`` equals ``StatusCode.NOT_FOUND``.

    Raises:
        Exception: Any other exception raised by ``callable`` is re-raised.
    """
    try:
        callable()
    except Exception as err:
        # Only a gRPC NOT_FOUND means "does not exist"; everything else is a
        # genuine failure and must reach the caller.
        is_not_found = (
            hasattr(err, "grpc_status_code")
            and err.grpc_status_code == StatusCode.NOT_FOUND
        )
        if is_not_found:
            return False
        raise err
    return True

In [None]:
# True when the feature store at fs_path can be fetched, False when the lookup
# reports NOT_FOUND (see check_has_resource).
feature_store_exists = check_has_resource(
    lambda: fs_client.get_featurestore(name=fs_path)
)

In [None]:
from google.cloud.aiplatform_v1.types import entity_type as entity_type_pb2
from google.cloud.aiplatform_v1.types import feature as feature_pb2
from google.cloud.aiplatform_v1.types import featurestore as featurestore_pb2
from google.cloud.aiplatform_v1.types import \
    featurestore_service as featurestore_service_pb2
from google.cloud.aiplatform_v1.types import io as io_pb2

# Create the feature store only when the existence check came back negative.
if not feature_store_exists:
    # Online serving is configured with a single fixed node.
    serving_config = featurestore_pb2.Featurestore.OnlineServingConfig(
        fixed_node_count=1
    )
    create_request = featurestore_service_pb2.CreateFeaturestoreRequest(
        parent=resource_path,
        featurestore_id=FEATURESTORE_ID,
        featurestore=featurestore_pb2.Featurestore(
            online_serving_config=serving_config,
        ),
    )
    create_lro = fs_client.create_featurestore(create_request)

    # result() blocks until the long-running operation has completed.
    print(create_lro.result())

In [None]:
entity_type_exists = check_has_resource(
    lambda: fs_client.get_entity_type(name=entity_path)
)

# The entity type and its features are created together, and only when the
# entity type is not there yet.
if not entity_type_exists:
    entity_type_request = featurestore_service_pb2.CreateEntityTypeRequest(
        parent=fs_path,
        entity_type_id=ENTITY_NAME,
        entity_type=entity_type_pb2.EntityType(
            description="Main entity type",
        ),
    )
    users_entity_type_lro = fs_client.create_entity_type(entity_type_request)
    print(users_entity_type_lro.result())

    # One DOUBLE feature per embedding dimension: embedding_0, embedding_1, ...
    feature_requests = []
    for i in range(EMBEDDING_DIMENSION):
        embedding_feature = feature_pb2.Feature(
            value_type=feature_pb2.Feature.ValueType.DOUBLE,
            description=f"Embedding {i} from Neo4j",
        )
        feature_requests.append(
            featurestore_service_pb2.CreateFeatureRequest(
                feature=embedding_feature,
                feature_id=f"embedding_{i}",
            )
        )

    create_features_lro = fs_client.batch_create_features(
        parent=entity_path,
        requests=feature_requests,
    )
    print(create_features_lro.result())

In [None]:
from google.protobuf.timestamp_pb2 import Timestamp

# One feature spec per embedding column of the features CSV.
feature_specs = [
    featurestore_service_pb2.ImportFeatureValuesRequest.FeatureSpec(
        id="embedding_{}".format(i)
    )
    for i in range(EMBEDDING_DIMENSION)
]

# All imported values share one timestamp: the current time with the
# sub-second part cleared.
# NOTE(review): presumably the import API rejects finer-grained timestamps - confirm.
feature_time = Timestamp()
feature_time.GetCurrentTime()
feature_time.nanos = 0

# gs:// URIs always use "/" as separator, so the URI is assembled explicitly
# rather than with os.path.join, which is platform dependent.
features_uri = f"gs://{STORAGE_BUCKET}/{STORAGE_PATH}/{FEATURES_FILENAME}"

# Start the import. Despite its name, import_request holds the long-running
# operation returned by the call; the "nodeId" column supplies the entity IDs.
import_request = fs_client.import_feature_values(
    featurestore_service_pb2.ImportFeatureValuesRequest(
        entity_type=entity_path,
        csv_source=io_pb2.CsvSource(gcs_source=io_pb2.GcsSource(uris=[features_uri])),
        entity_id_field="nodeId",
        feature_specs=feature_specs,
        worker_count=1,
        feature_time=feature_time,
    )
)

# result() blocks until the import has finished.
print(import_request.result())

使用特征存储中的特征发送预测。

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreOnlineServingServiceClient

# Client for online reads from the feature store; it targets the same regional
# endpoint as fs_client.
data_client = FeaturestoreOnlineServingServiceClient(
    client_options={"api_endpoint": api_endpoint}
)

In [None]:
# Retrieve Neo4j embeddings from feature store
from google.cloud.aiplatform_v1.types import FeatureSelector, IdMatcher
from google.cloud.aiplatform_v1.types import \
    featurestore_online_service as featurestore_online_service_pb2

# Select every embedding feature by its ID.
embedding_feature_ids = [f"embedding_{i}" for i in range(EMBEDDING_DIMENSION)]
feature_selector = FeatureSelector(id_matcher=IdMatcher(ids=embedding_feature_ids))

# Read the stored values of the entity with ID "5".
read_request = featurestore_online_service_pb2.ReadFeatureValuesRequest(
    entity_type=entity_path,
    entity_id="5",
    feature_selector=feature_selector,
)
fs_features = data_client.read_feature_values(read_request)

# Pair the feature IDs from the response header with the returned values,
# converted to strings.
returned_ids = [fd.id for fd in fs_features.header.feature_descriptors]
returned_values = [str(d.value.double_value) for d in fs_features.entity_view.data]
saved_embeddings = dict(zip(returned_ids, returned_values))

In [None]:
# Combine with other features. These might be sourced per transaction
# The keys must match the column names the model was trained on, hence
# "total_transaction_amnt" (the previous key "total_dollar_amnt" is not a
# training column).
all_features = {
    "num_transactions": "80",
    "total_transaction_amnt": "7484459.618641878",
}

# Add the embedding values read from the feature store.
all_features.update(saved_embeddings)

# A single prediction instance with every value passed as a string.
instances = [{key: str(value) for key, value in all_features.items()}]

In [None]:
# Send a prediction
# Send a prediction
# As the last expression of the cell, the response is displayed as its output.
endpoint.predict(instances=instances)

# 清理

## Neo4j 清理

要删除图数据科学表示的图，请运行以下命令：

In [None]:
# Drop the in-memory projection G ("client_graph"); this also clears the way
# for projecting a graph with the same name again.
gds.graph.drop(G)

## Google Cloud 清理

删除特征存储库并关闭端点

In [None]:
# Delete the feature store; force=True is set on the request and result()
# blocks until the deletion has completed.
delete_request = featurestore_service_pb2.DeleteFeaturestoreRequest(
    name=fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID),
    force=True,
)
fs_client.delete_featurestore(request=delete_request).result()

# The model deployed earlier is still attached to the endpoint. force=True
# undeploys it first; without it the delete call is rejected.
endpoint.delete(force=True)