# 将本地文件通过Zettapark Put到云器Lakehouse管理的数据湖（Volume）

In [1]:
# !pip install clickzetta_zettapark_python  -i https://pypi.tuna.tsinghua.edu.cn/simple

In [31]:
from clickzetta.zettapark.session import Session
import json,requests
import os
from datetime import datetime

## 创建到云器Lakehouse的会话

In [None]:
# `json` is already imported in the first cell; the import is kept so this cell
# also runs on its own.
import json

# Load the connection parameters from the JSON config file. The encoding is given
# explicitly because JSON text is UTF-8, while open() otherwise falls back to the
# platform's locale encoding and can mis-decode non-ASCII values.
with open('config/config-ingest.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

print("正在连接到云器Lakehouse.....\n")

# Build the Lakehouse session from the loaded parameters.
session = Session.builder.configs(config).create()

print("连接成功！...\n")

正在连接到云器Lakehouse.....

连接成功！...



## 创建云器Lakehouse数据湖Connection和Volume

### 创建数据湖Connection，即到数据湖的连接

In [None]:
# SQL text that creates the storage connection `hz_ingestion_demo` (TYPE oss).
# It is only defined here; a later cell passes it to session.sql().
# ENDPOINT, access_id and access_key hold placeholder values ('your ...') and must
# be replaced with real ones before running; avoid committing real credentials.
# `if not exists` is presumably what lets the statement be re-run without error — confirm.
datalake_connection_sql = """
CREATE STORAGE CONNECTION if not exists hz_ingestion_demo
    TYPE oss
    ENDPOINT = 'your endpoint'
    access_id = 'your access_id'
    access_key = 'your access_key'
    comments = 'oss private endpoint for ingest demo'
"""

In [34]:
# Submit the CREATE STORAGE CONNECTION statement, then print the returned result table.
connection_result = session.sql(datalake_connection_sql)
connection_result.show()

---------------------
|result_message     |
---------------------
|OPERATION SUCCEED  |
---------------------



In [None]:
# Print the result of SHOW CONNECTIONS.
connections = session.sql("show connections")
connections.show()

### 创建Volume，即数据湖中存储文件的位置

In [36]:
# SQL text that creates the external volume `ingest_demo`, located at
# oss://czsampledatahz/ingest_demo and accessed through the storage connection
# `hz_ingestion_demo`. It is only defined here; a later cell passes it to session.sql().
# The DIRECTORY (enable = TRUE) and recursive = TRUE options are passed as written;
# see the Lakehouse documentation for their exact semantics.
datalake_sql = """
CREATE EXTERNAL VOLUME  if not exists ingest_demo
  LOCATION 'oss://czsampledatahz/ingest_demo' 
  USING connection hz_ingestion_demo  -- storage Connection
  DIRECTORY = (
    enable = TRUE
  ) 
  recursive = TRUE
"""

In [37]:
# Submit the CREATE EXTERNAL VOLUME statement, then print the returned result table.
volume_result = session.sql(datalake_sql)
volume_result.show()

---------------------
|result_message     |
---------------------
|OPERATION SUCCEED  |
---------------------



## 同步数据湖Volume的目录到Lakehouse

In [38]:
# SQL text that refreshes the volume `ingest_demo`. It is reused by the cells that
# run it before and after the upload step.
alter_datalake_sql = """
alter volume ingest_demo refresh
"""

In [39]:
# Run the volume refresh statement and print the returned result table.
refresh_result = session.sql(alter_datalake_sql)
refresh_result.show()

---------------------
|result_message     |
---------------------
|OPERATION SUCCEED  |
---------------------



## 查看云器Lakehouse数据湖Volume上的文件，还没有任何文件

In [None]:
# Query the directory listing of volume `ingest_demo` and print it.
# NOTE(review): `results` captures the return value of show(), which prints the
# table — presumably None rather than the rows; confirm before relying on it.
directory_listing = session.sql("select * from directory(volume ingest_demo)")
results = directory_listing.show()


## 将文件PUT到云器Lakehouse数据湖Volume

In [41]:
# Route every entry of data/ to a folder of the volume chosen by its file extension.
# Suffixes are tested in the order .gz, .csv, .json; a name such as "x.csv.gz"
# ends with ".gz" only, so it is uploaded once, under gz/.
upload_targets = (
    (".gz", "volume://ingest_demo/gz/"),
    (".csv", "volume://ingest_demo/csv/"),
    (".json", "volume://ingest_demo/json/"),
)
for entry in os.listdir("data/"):
    for suffix, volume_dir in upload_targets:
        if entry.endswith(suffix):
            session.file.put(os.path.join("data/", entry), volume_dir)

In [42]:
# Alternative: upload all files under a directory with a single put() call.
# NOTE(review): this example reads from "../data/" while the loop cell above reads
# from "data/", and it sends everything to the gz/ folder regardless of file type —
# adjust both before enabling it.
# session.file.put("../data/","volume://ingest_demo/gz/")

## 再次同步数据湖Volume的目录到Lakehouse

In [43]:
# Refresh the volume again so the files just uploaded appear in its directory listing.
refresh_after_put = session.sql(alter_datalake_sql)
refresh_after_put.show()

---------------------
|result_message     |
---------------------
|OPERATION SUCCEED  |
---------------------



## 再次查看云器Lakehouse数据湖Volume上的文件，数据入湖成功了

In [44]:
# Query the directory listing of volume `ingest_demo` again and print it.
# NOTE(review): `results` captures the return value of show(), which prints the
# table — presumably None rather than the rows; confirm before relying on it.
directory_listing_after_put = session.sql("select * from directory(volume ingest_demo)")
results = directory_listing_after_put.show()


----------------------------------------------------------------------------------------------------------------------------------
|relative_path                       |url                                                 |size      |last_modified_time         |
----------------------------------------------------------------------------------------------------------------------------------
|csv/lift_tickets_data.csv           |oss://czsampledatahz/ingest_demo/csv/lift_ticke...  |21101094  |2024-12-30 16:52:51+08:00  |
|gz/lift_tickets_data.csv.gz         |oss://czsampledatahz/ingest_demo/gz/lift_ticket...  |9717050   |2024-12-30 16:52:48+08:00  |
|gz/lift_tickets_data.json.gz        |oss://czsampledatahz/ingest_demo/gz/lift_ticket...  |11146044  |2024-12-30 16:52:47+08:00  |
|gz/parquet_files/part00001.parquet  |oss://czsampledatahz/ingest_demo/gz/parquet_fil...  |12660     |2024-12-28 16:30:17+08:00  |
|gz/parquet_files/part00002.parquet  |oss://czsampledatahz/ingest_demo/gz/parquet_f

## 数据校验，检查文件里的行数

In [45]:
# SQL text that counts the rows of gz/lift_tickets_data.csv.gz on volume `ingest_demo`.
# The file is read as CSV with a header row, ',' as separator and gzip compression;
# only one column (txid string) is declared for the read.
# NOTE(review): the query returns a single aggregate row, so `limit 10` presumably
# has no effect here — confirm.
datalake_data_verify_sql = """
select count() from volume ingest_demo (txid string) using csv
 options(
    'header'='true',
    'sep'=',',
    'compression' = 'gzip'
 ) files('gz/lift_tickets_data.csv.gz')
 limit 10
"""

In [46]:
# Run the row-count check against the uploaded file and print the count.
verify_result = session.sql(datalake_data_verify_sql)
verify_result.show()

-------------
|`count`()  |
-------------
|100000     |
-------------



## 数据湖分析

In [47]:
# SQL text that reads the first 10 rows of gz/lift_tickets_data.csv.gz directly from
# volume `ingest_demo`, declaring three string columns: txid, name, address_state.
# The file is read as CSV with a header row, ',' as separator and gzip compression.
# NOTE(review): the recorded output shows hex-like values under `name` and restaurant
# names under `address_state`, so the declared names may not match the file's real
# columns — confirm against the CSV header.
datalake_data_analytics_sql = """
select * from volume ingest_demo (txid string,name string, address_state string) using csv
 options(
    'header'='true',
    'sep'=',',
    'compression' = 'gzip'
 ) files('gz/lift_tickets_data.csv.gz')
 limit 10
"""

In [48]:
# Run the sample query over the file on the volume and print the returned rows.
analytics_result = session.sql(datalake_data_analytics_sql)
analytics_result.show()

-------------------------------------------------------------------------------------
|txid                                  |name                        |address_state  |
-------------------------------------------------------------------------------------
|80a7a77b-4941-46f3-bf1a-760bb46f12da  |0xbb6eabaf2eb3c3d2ea164eba  |新荣记            |
|976b4512-1b07-43f4-a8e4-1fe86a7e1ee4  |0xa08ab7945cf87fc0b5095dc   |大董烤鸭           |
|4c49f5cc-0bd4-4a7e-8f61-f4a501a0dd24  |0xdf7bd805b890815a4e0a008c  |京雅堂            |
|8579071f-1c8b-4214-9a4d-096e6403bc52  |0x3113aa5ae86c522f3176829e  |新大陆中餐厅         |
|31962471-ad3b-463d-ab36-d1b1ab041a36  |0x28c6168f44e09cacd82ecfe9  |顺峰海鲜酒家         |
|f253d271-092d-4261-8703-a440cc149c39  |0xab306bea9de6a13426361153  |长安壹号           |
|5e52e443-2c03-4ce2-a95d-992d7cb3f54e  |0x52000c48116d3a4667c3b607  |御宝轩            |
|e45f3806-972c-4617-b4ab-f2cbfc449de1  |0x247dd8c03cab559125a63d1b  |店客店来           |
|9abeadfa-ecac-42fb-9dd7-33377e2e5387  |0x9824bf4d4f7e

## 测试将数据湖上文件再拉回到本地

In [16]:
# Fetch one file from the volume into the local tmp/gz/ directory. The call stays
# the last expression of the cell so the returned result list is displayed.
session.file.get(
    "volume://ingest_demo/gz/lift_tickets_data.json.gz",
    "tmp/gz/",
)

[GetResult(file='tmp/gz/lift_tickets_data.json.gz', size=11146044, status='DOWNLOADED', message='')]

In [49]:
# Close the Lakehouse session created at the start of the notebook.
session.close()