In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

使用BQML检测安全日志中的异常行为

在Colab中运行GitHub仓库中的代码

在GitHub上查看代码

在Vertex AI Workbench中打开笔记本

**_注意_**: 这个笔记本已在以下环境中测试过：

* Python 版本 = 3.9

## 概述

这个Colab笔记本展示了如何使用BigQuery ML来检测Cloud Audit日志中的异常。我们将使用两种不同的预先构建的ML模型进行无监督的异常检测，即K均值聚类和自编码器，以帮助我们识别异常值，例如任何用户身份的不常见API使用。在审计日志中识别异常对于云管理员和运营商来说至关重要，以识别从特权升级到API滥用的潜在威胁。

### 目标

在本教程中，您将学习如何：

* 通过预处理云审计日志应用特征工程技术
* 在云审计日志中使用 BigQuery ML 进行无监督异常检测
* 训练和评估 ML 模型，如 K-means 聚类和自编码器
* 提取和分析异常值

本教程使用以下 Google Cloud ML 服务和资源：

- BigQuery
- 云存储
- 日志分析

### 先决条件
唯一的先决条件是[升级您现有的日志存储桶](https://cloud.google.com/logging/docs/buckets#upgrade-bucket)以使用Log Analytics（如果您尚未升级）。升级后，您将获得一个链接的BigQuery数据集，其中包含您自己的可查询日志数据。这是一个**只需点击一次且不会产生额外费用的步骤**。默认情况下，Cloud Audit管理员活动日志已启用，并免费提取和存储在每个项目的`_Required`存储桶中。

![仅需一次点击的先决条件](https://services.google.com/fh/files/misc/upgrade_log_bucket.png)

数据集

在本笔记本中，您将分析您自己的云审计日志，例如管理员活动日志；这类日志默认在每个谷歌云项目中启用并存储。与合成数据不同，分析您自己的真实数据可以带来切合实际的见解，但每个人得到的结果会有所不同。

费用

本教程使用 Google Cloud 的可计费组件：

* BigQuery

了解 [BigQuery 定价](https://cloud.google.com/bigquery/pricing) 并使用 [定价计算器](https://cloud.google.com/products/calculator/) 根据您的预期使用量生成成本估算。

## 在开始之前

### 设置您的谷歌云项目

**无论您使用的是哪种笔记本环境，都需要执行以下步骤。**

1. [选择或创建一个谷歌云项目](https://console.cloud.google.com/cloud-resource-manager)。当您首次创建帐户时，您将获得300美元的免费信用额用于支付计算/存储成本。

2. [确保为您的项目启用了计费](https://cloud.google.com/billing/docs/how-to/modify-project)。

3. [启用 BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com)。

4. 如果您是在本地运行此笔记本，您需要安装[Cloud SDK](https://cloud.google.com/sdk)。

### 设置您的项目ID

**如果您不知道您的项目ID**，请尝试以下操作：
* 运行 `gcloud config list`。
* 运行 `gcloud projects list`。
* 查看支持页面：[查找项目ID](https://support.google.com/googleapi/answer/7014113)。

In [None]:
# Google Cloud project in which the UDFs, training table and models are created.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Point the gcloud CLI at the chosen project.
! gcloud config set project {PROJECT_ID}
# Export the project id into the kernel's environment variables.
%env GOOGLE_CLOUD_PROJECT=$PROJECT_ID
# NOTE: `>` overwrites any existing ~/.bigqueryrc with this single setting.
!echo project_id = $PROJECT_ID > ~/.bigqueryrc

区域

您还可以更改 Vertex AI 使用的 `REGION` 变量。了解有关 [Vertex AI 区域](https://cloud.google.com/vertex-ai/docs/general/locations) 的更多信息。

In [None]:
# NOTE(review): REGION is not referenced by any later cell visible in this notebook.
REGION = "[your-region]"  # @param {type: "string"}

提供项目、BigQuery数据集和存储审计日志的BigQuery表格。您可以从[日志存储页面](https://console.cloud.google.com/logs/storage)找到日志存储桶的关联BigQuery数据集ID。

In [None]:
# Project, dataset and table that hold the audit logs; the three values are
# formatted into the FROM clause of the training-data extraction query.
logSourceProject = "[your-log-source-project-id]"  # @param {type:"string"} custom
logSourceBqDataset = "[your-log-source-dataset]"  # @param {type:"string"} custom
logSourceBqTable = "[your-log-source-table]"  # @param {type:"string"} custom

这是存储预处理训练数据集的BigQuery数据集和BigQuery表。

In [None]:
# Dataset that receives the UDFs, the training table and the models, and the
# name of the training table created inside it.
# NOTE(review): no visible cell creates this dataset — presumably it must
# already exist in PROJECT_ID before the UDF cells run; confirm.
BQ_DATASET_NAME = "bqml_approach"  # @param {type:"string"} custom
BQ_TABLE_NAME = "training_data"  # @param {type:"string"} custom

提供BQML模型名称；这些模型将被保存在上述提到的BQ数据集下。

In [None]:
# Names under which the K-means and autoencoder models are created in BQ_DATASET_NAME.
KMEANS_MODEL = "KMEANS_HTUNED"  # @param {type:"string"} custom
AUTO_ENCODER_MODEL = "AUTOENCODER_HTUNED"  # @param {type:"string"} custom

### 验证您的Google Cloud账户

根据您的Jupyter环境，您可能需要手动进行身份验证。请按照以下相关指示进行操作。

1. Vertex AI Workbench
* 无需操作，因为您已经通过身份验证。

2. 本地JupyterLab实例，取消注释并运行:

In [None]:
# ! gcloud auth login

3. Colab，运行以下单元格：

In [None]:
import sys

# Authenticate only when running inside Colab: the `google.colab` package is
# provided by that runtime, so an unconditional import would abort "Run all"
# in other environments (Vertex AI Workbench, local JupyterLab).
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### 导入库

In [None]:
import time

from google.cloud import bigquery

## 训练数据准备和分析

云审计日志包含大量重要信息，但其数量、速度和多样性使得难以进行规模化分析。每个日志条目都有一个相对复杂的架构，这使得在其原始格式下进一步分析更具挑战性。

在运行ML模型之前，您需要从这些日志中提取相关字段，并按**天**、**执行者**、**动作**和**源IP**进行汇总（计数）。由于我们主要关注识别用户的异常行为，这些特征都很重要，并且合在一起足以支撑我们的分析。

In [None]:
# Helper: run a SQL statement, wait for it to finish and return its rows as a DataFrame.
def execute_sql(sql_query: str):
    """Execute a SQL statement in BigQuery and wait for it to finish.

    Args:
        sql_query (str): SQL statement to execute.

    Returns:
        The result rows as a DataFrame, or ``None`` when the result
        contains no rows.

    Raises:
        RuntimeError: If submitting the query, waiting for it or converting
            the rows fails. The underlying exception is chained as
            ``__cause__`` so the full traceback stays available.
    """
    from google.cloud import bigquery

    try:
        client = bigquery.Client()
        start = time.time()
        query_job = client.query(sql_query)  # Make an API request.
        print("Query Executed.Waiting for completion")
        results = query_job.result()  # Waits for job to complete.
        end = time.time()
        print("Query Execution completed")
        print("Time taken to execute:", end - start)
        # Truthiness check covers both 0 and a missing (None) row count.
        if results.total_rows:
            return results.to_dataframe()
        return None
    except Exception as e:
        # Chain the cause instead of printing it, so nothing is lost or duplicated.
        raise RuntimeError(f"Can't execute the query {sql_query}") from e

以下UDF从审计日志条目中提取被操作资源的ID。在审计日志条目中，资源ID会根据资源类型记录在不同的资源标签字段中，因此需要这个UDF来规范化资源ID字段。

In [None]:
# Create the UDF that picks a resource ID out of a log entry's resource labels:
# it returns the first non-NULL label from the list below, or "other".
# NOTE(review): the `type` argument is accepted but not used by the function body.
UDF_NAME = "getResourceId"
resource_udf_path = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{UDF_NAME}"

sql = """
CREATE OR REPLACE FUNCTION `{udf}`(
  type STRING,
  labels JSON
)
RETURNS STRING
AS (
 COALESCE(
  JSON_VALUE(labels.email_id),     # service_account
  JSON_VALUE(labels.pod_id),       # container
  JSON_VALUE(labels.instance_id),  # gce_instance, spanner_instance, redis_instance, ...
  JSON_VALUE(labels.subnetwork_id),# gce_subnetwork,
  JSON_VALUE(labels.network_id),   # gce_network, gce_network_region, ...
  JSON_VALUE(labels.topic_id),     # pubsub_topic
  JSON_VALUE(labels.subscription_id), # pubsub_subscription
  JSON_VALUE(labels.endpoint_id),  # aiplatform.googleapis.com/Endpoint
  JSON_VALUE(labels.job_id),       # dataflow_step
  JSON_VALUE(labels.dataset_id),   # bigquery_dataset
  JSON_VALUE(labels.project_id),
  JSON_VALUE(labels.organization_id),
  JSON_VALUE(labels.id),
  "other")
);""".format(
    udf=resource_udf_path
)

execute_sql(sql)
print("Created UDF " + resource_udf_path)

以下UDF通过审计日志条目推断用户或系统操作发生的位置。例如，操作可能是通过云控制台、使用gcloud命令行界面，或通过Terraform脚本或其他未知的客户端或渠道发生的。

In [None]:
# Create the UDF that maps a request's user-agent string to a channel label
# ('Cloud Console', 'gcloud CLI', 'Terraform' or 'other') by prefix pattern.
UDF_NAME = "getChannelType"
channel_udf_path = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{UDF_NAME}"

sql = """CREATE OR REPLACE FUNCTION `{udf}`(
  caller_supplied_user_agent STRING
)
RETURNS STRING
AS (
  CASE
    WHEN caller_supplied_user_agent LIKE "Mozilla/%" THEN 'Cloud Console'
    WHEN caller_supplied_user_agent LIKE "google-cloud-sdk gcloud/%" THEN 'gcloud CLI'
    WHEN caller_supplied_user_agent LIKE "google-api-go-client/% Terraform/%" THEN 'Terraform'
    ELSE 'other'
  END
);""".format(
    udf=channel_udf_path
)

execute_sql(sql)
print("Created UDF " + channel_udf_path)

查询日志来源以提取感兴趣字段的训练数据

In [None]:
# Query to extract training data with fields of interest.
# Aggregates the audit-log entries of the last 360 days into one row per
# (day, principal, action, resource, container, channel, ip) combination, with
# `counter` holding the number of matching entries. resource_id and channel come
# from the two UDFs created in BQ_DATASET_NAME.
# The literal deliberately starts with a space: a later cell appends it directly
# after "... AS" to build a CREATE TABLE statement.
# NOTE(review): log_name is part of GROUP BY but not of the SELECT list, so rows
# that differ only by log_name show up as separate rows with identical key
# columns — confirm this is intended.
query_str = """ SELECT
    EXTRACT(DATE FROM timestamp) AS day,
    IFNULL(proto_payload.audit_log.authentication_info.principal_email, "unknown") as principal_email,
    IFNULL(proto_payload.audit_log.method_name, "unknown") as action,
    IFNULL(resource.type, "unknown") as resource_type,
    {3}.getResourceId(resource.type, resource.labels) AS resource_id,
    -- proto_payload.audit_log.resource_name as resource_name,
    SPLIT(log_name, '/')[SAFE_OFFSET(0)] as container_type,
    SPLIT(log_name, '/')[SAFE_OFFSET(1)] as container_id,
    {3}.getChannelType(proto_payload.audit_log.request_metadata.caller_supplied_user_agent) AS channel,
    IFNULL(proto_payload.audit_log.request_metadata.caller_ip, "unknown") as ip,
    COUNT(*) counter,
    -- ANY_VALUE(resource) as resource,           -- for debugging
    -- ANY_VALUE(proto_payload) as proto_payload  -- for debugging
  FROM  `{0}.{1}.{2}`
  WHERE
    -- log_id = "cloudaudit.googleapis.com/activity" AND
    timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 360 DAY)
  GROUP BY
    day, principal_email, action, resource_type, resource_id, container_type, container_id, channel, ip, log_name
  ORDER BY
    day DESC, principal_email, action""".format(
    logSourceProject, logSourceBqDataset, logSourceBqTable, BQ_DATASET_NAME
)

查看训练数据的数据框

In [None]:
# Run the extraction query in PROJECT_ID, pull the rows into a DataFrame and
# preview the first few of them.
client = bigquery.Client(project=PROJECT_ID)
training_query_job = client.query(query_str)
df = training_query_job.to_dataframe()
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the extracted data.
df.info()

在BQ中使用提取的数据创建一张表

In [None]:
# Materialise the extracted rows as the training table by prefixing the
# extraction query with a CREATE OR REPLACE TABLE ... AS clause.
create_training_data_table = (
    """ CREATE OR REPLACE TABLE `{}.{}.{}` AS""".format(
        PROJECT_ID, BQ_DATASET_NAME, BQ_TABLE_NAME
    )
    + query_str
)
# client.query() only submits the job. Block on .result() so the table exists
# (and any SQL error surfaces here) before the training cells read from it.
client.query(create_training_data_table).result()

## K均值聚类

使用训练数据创建K-Means聚类。

### 模型训练

In [None]:
# K-means training statement with hyperparameter tuning: 10 trials searching
# NUM_CLUSTERS in the range 2..10, trained on every column of the training table.
# IF NOT EXISTS means re-running leaves an existing model of this name untouched.
# NOTE(review): model and table are referenced as dataset.name here (no
# project), while the anomaly-detection cells use project-qualified names.
train_kmeans = """CREATE MODEL IF NOT EXISTS `{0}.{1}`
OPTIONS(MODEL_TYPE = 'KMEANS',
NUM_CLUSTERS = HPARAM_RANGE(2, 10),
KMEANS_INIT_METHOD = 'KMEANS++',
DISTANCE_TYPE = 'COSINE',
STANDARDIZE_FEATURES = TRUE,
MAX_ITERATIONS = 10,
EARLY_STOP = TRUE,
NUM_TRIALS = 10
) AS
SELECT * FROM `{0}.{2}`;""".format(
    BQ_DATASET_NAME, KMEANS_MODEL, BQ_TABLE_NAME
)

In [None]:
# Submit the K-means training statement and block until the job finishes.
execute_sql(train_kmeans)

### 模型评估

In [None]:
# Fetch the evaluation metrics of the trained K-means model.
eval_kmeans = """SELECT * FROM ML.EVALUATE(MODEL `{}.{}`);""".format(
    BQ_DATASET_NAME, KMEANS_MODEL
)
# Model-specific, correctly spelled name so this result is not overwritten by
# a later evaluation cell.
kmeans_evaluation = execute_sql(eval_kmeans)
kmeans_evaluation

### 异常值分析

In [None]:
# --- DETECT ANOMALIES --- #
# Score the training table with the K-means model (contamination 0.001) and
# keep only the rows flagged as anomalies, largest normalized_distance first.
kmeans_model_ref = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{KMEANS_MODEL}"
kmeans_input_table_ref = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}"
detect_anomaly = "\n".join(
    [
        f"SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{kmeans_model_ref}`,",
        "STRUCT(0.001 AS contamination),",
        f"TABLE `{kmeans_input_table_ref}`)",
        "WHERE is_anomaly=true",
        "ORDER BY normalized_distance DESC;",
    ]
)

kmeans_outliers = execute_sql(detect_anomaly)

In [None]:
# Rows flagged as anomalies by the K-means model, largest normalized_distance first.
kmeans_outliers

## 自动编码器

### 模型训练

In [None]:
# Autoencoder training statement with hyperparameter tuning: 60 trials, run one
# at a time, over regularisation, learning rate, optimizer, batch size, dropout
# and hidden-layer layout, trained on every column of the training table.
# IF NOT EXISTS means re-running leaves an existing model of this name untouched.
# Fixes relative to the previous text: the stray extra quote before 'RMSPROP'
# (which made the statement unparsable) is removed, and the tuning objective is
# passed as an array of strings instead of a bare identifier.
train_auto_encoder = """
CREATE MODEL IF NOT EXISTS `{0}.{1}`
OPTIONS(
MODEL_TYPE='autoencoder',
L1_REG_ACTIVATION = HPARAM_CANDIDATES([0.001, 0.01, 0.1]),
LEARN_RATE = HPARAM_CANDIDATES([0.001, 0.01, 0.1]),
OPTIMIZER = HPARAM_CANDIDATES(['ADAGRAD', 'ADAM', 'FTRL', 'RMSPROP', 'SGD']),
ACTIVATION_FN='relu',
BATCH_SIZE = HPARAM_CANDIDATES([16, 32, 64]),
DROPOUT = HPARAM_CANDIDATES([0.1, 0.2]),
HIDDEN_UNITS=HPARAM_CANDIDATES([struct([[16, 8, 4, 8, 16]]), struct([[32, 16, 4, 16, 32]])]),
TF_VERSION = '2.8.0',
EARLY_STOP = TRUE,
MIN_REL_PROGRESS = 0.01,
MAX_ITERATIONS=20,
WARM_START = TRUE,
NUM_TRIALS = 60,
MAX_PARALLEL_TRIALS = 1,
HPARAM_TUNING_ALGORITHM =  'VIZIER_DEFAULT',
HPARAM_TUNING_OBJECTIVES = ['MEAN_SQUARED_ERROR']
) AS
SELECT
*
FROM `{0}.{2}`;""".format(
    BQ_DATASET_NAME, AUTO_ENCODER_MODEL, BQ_TABLE_NAME
)

In [None]:
# Submit the autoencoder training statement and block until the job finishes.
execute_sql(train_auto_encoder)

### 模型评估

In [None]:
# Fetch the evaluation metrics of the trained autoencoder model.
eval_auto_encoder = """SELECT * FROM ML.EVALUATE(MODEL `{}.{}`);""".format(
    BQ_DATASET_NAME, AUTO_ENCODER_MODEL
)
# Model-specific, correctly spelled name so this cell does not overwrite the
# result of another model's evaluation.
autoencoder_evaluation = execute_sql(eval_auto_encoder)
autoencoder_evaluation

### 异常值分析

In [None]:
# --- DETECT ANOMALIES --- #
# Score the training table with the autoencoder (contamination 0.001) and keep
# only the rows flagged as anomalies, largest mean_squared_error first.
autoencoder_model_ref = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{AUTO_ENCODER_MODEL}"
autoencoder_input_table_ref = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}"
detect_anomaly_auto_encoder = "\n".join(
    [
        f"SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{autoencoder_model_ref}`,",
        "STRUCT(0.001 AS contamination),",
        f"TABLE `{autoencoder_input_table_ref}`)",
        "WHERE is_anomaly=true order by mean_squared_error desc;",
    ]
)
autoencoder_outliers = execute_sql(detect_anomaly_auto_encoder)

In [None]:
# Rows flagged as anomalies by the autoencoder, largest mean_squared_error first.
autoencoder_outliers

## 共同的异常值

查找两个模型报告的异常值

In [None]:
# Reduce each model's outliers to the same ten feature columns (dropping the
# model-specific output columns) so the two result sets can be joined.
df1, df2 = (
    outliers[
        [
            "day",
            "principal_email",
            "action",
            "resource_type",
            "resource_id",
            "container_type",
            "container_id",
            "channel",
            "ip",
            "counter",
        ]
    ]
    for outliers in (kmeans_outliers, autoencoder_outliers)
)

In [None]:
# Inner-join the two outlier sets on every feature column: only rows reported
# by both models survive.
outlier_join_keys = [
    "day",
    "principal_email",
    "action",
    "resource_type",
    "resource_id",
    "container_type",
    "container_id",
    "channel",
    "ip",
    "counter",
]
common_outliers = df1.merge(df2, on=outlier_join_keys, how="inner")

In [None]:
# Outliers reported by both models (result of the inner join).
common_outliers

In [None]:
# Column dtypes and non-null counts of the shared outliers.
common_outliers.info()

将检测到的异常值上传至BQ表格进行进一步分析。

创建一个名为'common_outliers'的表

In [None]:
# Name of the table (inside BQ_DATASET_NAME) that receives the outliers reported by both models.
OUTLIERS_TABLE = "[your-common-outliers-table]"  # @param {type:"string"}

In [None]:
from google.cloud import bigquery


def create_table(client, table_id, schema):
    """Create the table ``table_id`` with ``schema`` and print its full name.

    An already existing table is not treated as an error (``exists_ok=True``).

    Args:
        client: BigQuery client used for the API request.
        table_id: ID of the table to create.
        schema: List of ``bigquery.SchemaField`` describing the columns.
    """
    created = client.create_table(
        bigquery.Table(table_id, schema=schema), exists_ok=True
    )  # Make an API request
    print(f"Created table {created.project}.{created.dataset_id}.{created.table_id}")


def upload_df_into_bq(client, table_id, df, table_schema=None):
    """Load a DataFrame into a BigQuery table, replacing its current contents.

    Args:
        client: BigQuery client used to run the load job.
        table_id: Destination table ID; the success message prints it verbatim.
        df: DataFrame to upload.
        table_schema: Optional list of ``bigquery.SchemaField`` for the load
            job. When omitted, the notebook-level ``schema`` variable is used,
            which is what this function always did before.
    """
    if table_schema is None:
        # Backward-compatible fallback to the notebook-level variable.
        table_schema = schema
    job_config = bigquery.LoadJobConfig(schema=table_schema)
    # If the table already exists, BigQuery overwrites the data, removes the
    # constraints and uses the schema from the load job.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.autodetect = False
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()  # Wait for the load job to finish.
    # table_id is printed as given: prefixing PROJECT_ID again would repeat the
    # project for a fully qualified ID.
    print("Uploaded dataframe into table {}".format(table_id))


# Column layout of the outliers table, listed as (name, BigQuery type, mode).
schema = [
    bigquery.SchemaField(column_name, column_type, mode=column_mode)
    for column_name, column_type, column_mode in (
        ("day", "DATE", "REQUIRED"),
        ("principal_email", "STRING", "REQUIRED"),
        ("action", "STRING", "REQUIRED"),
        ("resource_type", "STRING", "REQUIRED"),
        ("resource_id", "STRING", "NULLABLE"),
        ("container_type", "STRING", "NULLABLE"),
        ("container_id", "STRING", "NULLABLE"),
        ("channel", "STRING", "NULLABLE"),
        ("ip", "STRING", "REQUIRED"),
        ("counter", "INTEGER", "REQUIRED"),
    )
]
client = bigquery.Client(project=PROJECT_ID)

# Fully qualified destination: project.dataset.table
table_id = f"{PROJECT_ID}.{BQ_DATASET_NAME}.{OUTLIERS_TABLE}"

# Make sure the table exists, then replace its contents with the shared outliers.
create_table(client, table_id, schema)

upload_df_into_bq(client, table_id, common_outliers)

## 清理

要清理此项目中使用的所有Google云资源，您可以删除用于本教程的[Google云项目](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects)。

否则，您可以删除在本教程中创建的各个资源。

In [None]:
# Delete the BigQuery dataset (including the models created & the tables).
# Defaults to the dataset this tutorial wrote to rather than an unrelated
# hard-coded name, because the removal command is recursive and unprompted.
# Change the value only if you really want to delete a different dataset.
dataset_to_be_deleted = BQ_DATASET_NAME

In [None]:
# -r removes everything inside the dataset; -f skips the confirmation prompt.
!bq rm -r -f {PROJECT_ID}:{dataset_to_be_deleted}