# Loading of data needed for examples

This notebook loads the data used by some of the demos into Lakehouse.

It creates tables in the database (workspace) and schema given in the connection parameters, i.e. config.json.

### Install Zettapark
If Zettapark is already installed, skip this step.

In [1]:
# Uncomment the next line to install or upgrade (-U) Zettapark from the Tsinghua PyPI mirror (-i).
#!pip install -U clickzetta-zettapark-python -i https://pypi.tuna.tsinghua.edu.cn/simple

In [2]:
# Zettapark modules
from clickzetta.zettapark.session import Session
import clickzetta.zettapark.types as T
import logging
# Raise the Zettapark logger threshold so only ERROR (and more severe) messages are emitted.
logging.getLogger("clickzetta.zettapark").setLevel(logging.ERROR)

In [3]:
# Get a nicer output from .show(): "white-space: pre" stops <pre> blocks from
# wrapping, so wide tables keep their column alignment.
# HTML and display are imported from the public IPython.display module
# (IPython.core.display is an internal location), and display is imported
# explicitly instead of relying on the kernel injecting it.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Connect to Lakehouse

This example uses the connection parameters in the config.json file to connect to Lakehouse.

In [4]:
import json

# Read the connection parameters from the config file.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

print("Connecting to Lakehouse.....\n")

# Create the session from the loaded parameters
session = Session.builder.configs(config).create()

print("Connected and context as below...\n")

# Context query is currently disabled; uncomment to print the session context.
# print(session.sql("SELECT current_instance_id(), current_workspace(),current_workspace_id(), current_schema(), current_user(),current_user_id(), current_vcluster()").collect())

Connecting to Lakehouse.....

Connected and context as below...



In [5]:
data_path = "./data/" # Where the data files are stored locally (file names are appended directly, so keep the trailing slash)
data_volume_name = "user" # Using the user volume
DATABASE_NAME = config['workspace']# Name of the database (the workspace from config.json)
DATABASE_SCHEMA = config['schema'] # Name of the schema to store data and files in
# Prefix used to fully qualify the tables created below
FULLY_QUALIFIED_NAME = f"{DATABASE_NAME}.{DATABASE_SCHEMA}"

Show the schema the session is currently using. Note that this notebook does not create the schema, so it should already exist.

In [6]:
# Display the schema the session is currently using.
session.get_current_schema()

'CREDIT_SCORING_SCH'

### Titanic data

In [7]:
# Upload the source file to the csv/ folder of the user volume; overwrite=True replaces any existing copy
putResult = session.file.put(f"{data_path}titanic.csv", f"volume:{data_volume_name}://~/csv/", auto_compress=True, overwrite=True)

In [8]:
# List the files currently in the user volume (print up to 100 rows).
session.sql("LIST USER VOLUME").show(100)

+--------------------+--------------------+-------+--------------------+
|       relative_path|                 url|   size|  last_modified_time|
+--------------------+--------------------+-------+--------------------+
|     csv/titanic.csv|oss://cz-lh-sh-pr...| 117728|2025-03-18 19:30:...|
|nyc/greentaxis/pa...|oss://cz-lh-sh-pr...|1673841|2025-03-10 14:56:...|
+--------------------+--------------------+-------+--------------------+



In [9]:
# Column names and types of titanic.csv, listed in file order.
_titanic_fields = [
    ("PCLASS", T.IntegerType()),
    ("SURVIVED", T.StringType()),
    ("NAME", T.StringType()),
    ("SEX", T.StringType()),
    ("AGE", T.FloatType()),
    ("SIBSP", T.FloatType()),
    ("PARCH", T.FloatType()),
    ("TICKET", T.StringType()),
    ("FARE", T.FloatType()),
    ("CABIN", T.StringType()),
    ("EMBARKED", T.StringType()),
    ("BOAT", T.StringType()),
    ("BODY", T.IntegerType()),
    ("HOME_DEST", T.StringType()),
]
titanicSchema = T.StructType(
    [T.StructField(col_name, col_type) for col_name, col_type in _titanic_fields]
)

# Create a reader that applies the schema
dfReader = session.read.schema(titanicSchema)

# CSV parsing options passed to the reader
titanic_csv_options = {
    "field_delimiter": ",",
    "FIELD_OPTIONALLY_ENCLOSED_BY": '"',
    "NULL_IF": "?",
    "PARSE_HEADER": True,
    "SKIP_HEADER": 1,
}

# Read the uploaded file from the volume into a data frame
dfTitanic_volume = dfReader.options(titanic_csv_options).csv(
    f"volume:{data_volume_name}://~/csv/titanic.csv"
)

# Drop any previous copy of the table, then write the data frame to it
titanic_table = f"{FULLY_QUALIFIED_NAME}.titanic"
session.sql(f"DROP TABLE IF EXISTS {titanic_table}").collect()
dfTitanic_volume.write.save_as_table(titanic_table, mode="overwrite")

In [10]:
titanic_table = f"{FULLY_QUALIFIED_NAME}.titanic"

# Remove rows whose pclass is NULL
# NOTE(review): presumably these are header/blank rows that did not parse - confirm.
session.sql(f"delete from {titanic_table} where pclass is NULL").show()

# Preview the remaining rows
session.table(titanic_table).show()

+-----------------+
|   result_message|
+-----------------+
|OPERATION SUCCEED|
+-----------------+

+------+--------+--------------------+------+------+-----+-----+--------+--------+-------+--------+----+----+--------------------+
|pclass|survived|                name|   sex|   age|sibsp|parch|  ticket|    fare|  cabin|embarked|boat|body|           home_dest|
+------+--------+--------------------+------+------+-----+-----+--------+--------+-------+--------+----+----+--------------------+
|     1|       1|Allen, Miss. Elis...|female|    29|    0|    0|   24160|211.3375|     B5|       S|   2|NULL|        St Louis, MO|
|     1|       1|Allison, Master. ...|  male|0.9167|    1|    2|  113781|  151.55|C22 C26|       S|  11|NULL|Montreal, PQ / Ch...|
|     1|       0|Allison, Miss. He...|female|     2|    1|    2|  113781|  151.55|C22 C26|       S|   ?|NULL|Montreal, PQ / Ch...|
|     1|       0|Allison, Mr. Huds...|  male|    30|    1|    2|  113781|  151.55|C22 C26|       S|   ?| 135|Mont

In [11]:
# Row count of the titanic table.
session.table(f"{FULLY_QUALIFIED_NAME}.titanic").count()

1309

### Campaign spend

In [12]:
# Upload the source file to the csv/ folder of the user volume; overwrite=True replaces any existing copy
session.file.put(f"{data_path}campaign_spend.csv", f"volume:{data_volume_name}://~/csv/", auto_compress=True, overwrite=True)

[PutResult(source='data/campaign_spend.csv', target='/csv/campaign_spend.csv', source_size=13684943, target_size=13684943)]

In [13]:
# Column names and types of campaign_spend.csv, listed in file order.
_campaign_fields = [
    ("CAMPAIGN", T.StringType()),
    ("CHANNEL", T.StringType()),
    ("DATE", T.DateType()),
    ("TOTAL_CLICKS", T.DecimalType(38,0)),
    ("TOTAL_COST", T.DecimalType(38,0)),
    ("ADS_SERVED", T.DecimalType(38,0)),
]
campaignSchema = T.StructType(
    [T.StructField(col_name, col_type) for col_name, col_type in _campaign_fields]
)

# Create a reader that applies the schema
dfReader = session.read.schema(campaignSchema)

# CSV parsing options passed to the reader
campaign_csv_options = {"field_delimiter": ",", "SKIP_HEADER": 1}

# Read the uploaded file from the volume into a data frame
dfCampaign_volume = dfReader.options(campaign_csv_options).csv(
    f"volume:{data_volume_name}://~/csv/campaign_spend.csv"
)

# Drop any previous copy of the table, then write the data frame to it
campaign_table = f"{FULLY_QUALIFIED_NAME}.campaign_spend"
session.sql(f"DROP TABLE IF EXISTS {campaign_table}").collect()
dfCampaign_volume.write.save_as_table(campaign_table)

In [14]:
campaign_table = f"{FULLY_QUALIFIED_NAME}.campaign_spend"

# Remove rows whose date is NULL
# NOTE(review): presumably these are header/blank rows that did not parse - confirm.
session.sql(f"delete from {campaign_table} where date is NULL").show()

# Preview the remaining rows
session.table(campaign_table).show()

+-----------------+
|   result_message|
+-----------------+
|OPERATION SUCCEED|
+-----------------+

+--------------------+-------------+----------+------------+----------+----------+
|            campaign|      channel|      date|total_clicks|total_cost|ads_served|
+--------------------+-------------+----------+------------+----------+----------+
|       winter_sports|        video|2012-06-03|         213|      1762|       426|
|sports_across_cul...|        video|2012-06-02|          87|       678|       157|
|  building_community|search_engine|2012-06-03|          66|       471|       134|
|        world_series| social_media|2017-12-28|          72|       591|       149|
|       winter_sports|        email|2018-02-09|         252|      1841|       473|
|        spring_break|        video|2017-11-14|         162|      1155|       304|
|          nba_finals|        email|2017-11-22|          68|       480|       134|
|       winter_sports| social_media|2018-03-10|         227|      179

In [15]:
# Row count of the campaign_spend table.
session.table(f"{FULLY_QUALIFIED_NAME}.campaign_spend").count()

293120

### Bank Marketing files

Upload all snappy.parquet files to the user volume.

In [16]:
# Upload every parquet file in the local bank/ folder to csv/bank/ on the user volume.
# data_path already ends with "/", so no extra separator is inserted before
# "bank" (the previous form produced "./data//bank/*.parquet").
session.file.put(f"{data_path}bank/*.parquet", f"volume:{data_volume_name}://~/csv/bank/", auto_compress=False, overwrite=True)

[PutResult(source='data/bank/data_aug.snappy.parquet', target='/csv/bank/data_aug.snappy.parquet', source_size=62202, target_size=62202),
 PutResult(source='data/bank/data_oct.snappy.parquet', target='/csv/bank/data_oct.snappy.parquet', source_size=16217, target_size=16217),
 PutResult(source='data/bank/data_dec.snappy.parquet', target='/csv/bank/data_dec.snappy.parquet', source_size=8559, target_size=8559),
 PutResult(source='data/bank/data_feb.snappy.parquet', target='/csv/bank/data_feb.snappy.parquet', source_size=36471, target_size=36471),
 PutResult(source='data/bank/data_jan.snappy.parquet', target='/csv/bank/data_jan.snappy.parquet', source_size=22809, target_size=22809),
 PutResult(source='data/bank/data_nov.snappy.parquet', target='/csv/bank/data_nov.snappy.parquet', source_size=51065, target_size=51065),
 PutResult(source='data/bank/data_sep.snappy.parquet', target='/csv/bank/data_sep.snappy.parquet', source_size=14360, target_size=14360),
 PutResult(source='data/bank/data_ma

In [26]:
import re

# Folder (relative to the volume root) that the bank parquet files were uploaded to.
bank_prefix = "csv/bank/"

file_list = session.sql("LIST USER VOLUME").collect()

# Select the bank parquet files explicitly. Taking row 0 of the full listing
# only works while a bank file happens to be listed first; any other file
# ahead of it would yield a pattern pointing at the wrong location.
bank_rows = [
    row for row in file_list
    if row["relative_path"].lstrip("/").startswith(bank_prefix)
    and row["relative_path"].endswith(".parquet")
]
if not bank_rows:
    # Fail here with a clear message instead of leaving file_dir undefined
    # (or stale from an earlier run) for the query cell that uses it.
    raise FileNotFoundError(
        f"No parquet files found under '{bank_prefix}' in the user volume; upload them first."
    )

# Turn the first file's URL into a pattern matching all monthly files:
# only the file name is rewritten, so a "data_" elsewhere in the URL is untouched.
bank_dir, bank_file = bank_rows[0]["url"].rsplit("/", 1)
file_dir = f"{bank_dir}/" + re.sub(r'^data_[a-zA-Z0-9_]+', '.*', bank_file)
print(file_dir)

oss://cz-lh-sh-prod/123/workspaces/qiliang_ws_demo_3539727324115904866/internal_volume/user_2162629/csv/bank/.*.snappy.parquet


In [27]:
# Query the parquet files on the user volume whose location matches the
# file_dir pattern, and preview the result.
bank_preview_sql = f"""
            SELECT * FROM user VOLUME
            USING parquet 
            REGEXP '{file_dir}'
            ;
"""
session.sql(bank_preview_sql).show()

+---+------------+--------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+-------+---+
|age|         job| marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|outcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+-------+---+
| 32|  technician|  single| tertiary|     no|    392|    yes|  no|cellular|  1|  apr|     957|       2|  131|       2|failure| no|
| 26|      admin.|  single|secondary|     no|    274|     no|  no|cellular|  1|  apr|     351|       3|   -1|       0|unknown|yes|
| 36|entrepreneur|  single|secondary|     no|     45|    yes|  no|cellular|  1|  apr|     131|       1|   -1|       0|unknown| no|
| 35|    services|divorced|secondary|     no|   -121|    yes| yes|cellular|  1|  apr|      73|       1|   -1|       0|unknown| no|
| 46|  management|divorced| tertiary|     no|  10469|    yes|  no|cellular|  1|  ap

### Clean up

In [None]:
# Remove the two uploaded CSV files from the volume
for csv_name in ("titanic.csv", "campaign_spend.csv"):
    session.sql(f"remove {data_volume_name} volume FILE 'csv/{csv_name}'").collect()

# Remove the whole folder of bank parquet files
session.sql(f"remove {data_volume_name} volume SUBDIRECTORY 'csv/bank'").collect()

# List what is left in the volume (up to 100 rows)
remaining_files = session.sql(f"list {data_volume_name} volume")
remaining_files.show(100)

In [None]:
# Close the Lakehouse session now that loading and clean-up are done.
session.close()