In [1]:
from clickzetta.zettapark.session import Session

In [2]:
import json

# Read the session parameters from the JSON config file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of falling back to the locale's default encoding.
with open('security/config-uat.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Create the session from the loaded parameters.
session = Session.builder.configs(config).create()

In [4]:
# Column names, in the same order as the fields of each sample row below.
columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]

# Sample rows; note that several fields are empty strings.
data = [
    ("James", "", "Smith", "36636", "M", 60000),
    ("Michael", "Rose", "", "40288", "M", 70000),
    ("Robert", "", "Williams", "42114", "", 400000),
    ("Maria", "Anne", "Jones", "39192", "F", 500000),
    ("Jen", "Mary", "Brown", "", "F", 0),
]

# Build the DataFrame, then print its schema and its rows.
df = session.createDataFrame(schema=columns, data=data)
df.printSchema()
df.show()

root
 |-- `first_name`: StringType() (nullable = False)
 |-- `middle_name`: StringType() (nullable = False)
 |-- `last_name`: StringType() (nullable = False)
 |-- `dob`: StringType() (nullable = False)
 |-- `gender`: StringType() (nullable = False)
 |-- `salary`: IntegerType() (nullable = False)
------------------------------------------------------------------
|first_name  |middle_name  |last_name  |dob    |gender  |salary  |
------------------------------------------------------------------
|James       |             |Smith      |36636  |M       |60000   |
|Michael     |Rose         |           |40288  |M       |70000   |
|Robert      |             |Williams   |42114  |        |400000  |
|Maria       |Anne         |Jones      |39192  |F       |500000  |
|Jen         |Mary         |Brown      |       |F       |0       |
------------------------------------------------------------------



In [5]:
# Using when / otherwise
from clickzetta.zettapark.functions import col, when

# Variant 1: add the derived column with withColumn.
# Mapping: "M" -> "Male", "F" -> "Female", anything else -> "Unknown".
df2 = df.withColumn(
    "new_gender",
    when(col("gender") == "M", "Male")
    .when(col("gender") == "F", "Female")
    .otherwise("Unknown"),
)
df2.show()

# Variant 2: the same mapping expressed through select + alias.
# The DataFrame is stored in df22 first and shown in a separate statement;
# chaining .show() onto the assignment would bind df22 to show()'s return
# value rather than to the DataFrame.
df22 = df.select(
    col("*"),
    when(col("gender") == "M", "Male")
    .when(col("gender") == "F", "Female")
    .otherwise("Unknown")
    .alias("new_gender"),
)
df22.show()



-------------------------------------------------------------------------------
|first_name  |middle_name  |last_name  |dob    |gender  |salary  |new_gender  |
-------------------------------------------------------------------------------
|James       |             |Smith      |36636  |M       |60000   |Male        |
|Michael     |Rose         |           |40288  |M       |70000   |Male        |
|Robert      |             |Williams   |42114  |        |400000  |Unknown     |
|Maria       |Anne         |Jones      |39192  |F       |500000  |Female      |
|Jen         |Mary         |Brown      |       |F       |0       |Female      |
-------------------------------------------------------------------------------

-------------------------------------------------------------------------------
|first_name  |middle_name  |last_name  |dob    |gender  |salary  |new_gender  |
-------------------------------------------------------------------------------
|James       |             |Smith      

In [6]:
# Using a SQL CASE WHEN expression with withColumn
from clickzetta.zettapark.functions import expr

# The adjacent string literals are joined into one SQL expression string.
df3 = df.withColumn(
    "new_gender",
    expr(
        "case when gender = 'M' then 'Male' "
        "when gender = 'F' then 'Female' "
        "else 'Unknown' end"
    ),
)
df3.show()



-------------------------------------------------------------------------------
|first_name  |middle_name  |last_name  |dob    |gender  |salary  |new_gender  |
-------------------------------------------------------------------------------
|James       |             |Smith      |36636  |M       |60000   |Male        |
|Michael     |Rose         |           |40288  |M       |70000   |Male        |
|Robert      |             |Williams   |42114  |        |400000  |Unknown     |
|Maria       |Anne         |Jones      |39192  |F       |500000  |Female      |
|Jen         |Mary         |Brown      |       |F       |0       |Female      |
-------------------------------------------------------------------------------



In [7]:
# Using a SQL CASE WHEN expression inside select()
# All existing columns are kept via col("*"); the CASE result is aliased
# as "new_gender".
df4 = df.select(
    col("*"),
    expr(
        "case when gender = 'M' then 'Male' "
        "when gender = 'F' then 'Female' "
        "else 'Unknown' end"
    ).alias("new_gender"),
)
df4.show()

# Sample rows as (id, code, amt); amt is given as a string.
data2 = [
    (66, "a", "4"),
    (67, "a", "0"),
    (70, "b", "4"),
    (71, "d", "4"),
]

# Create the DataFrame and add new_column in a single chain:
#   code is "a" or "d"           -> "A"
#   code is "b" and amt is "4"   -> "B"
#   anything else                -> "A1"
df5 = session.createDataFrame(schema=["id", "code", "amt"], data=data2).withColumn(
    "new_column",
    when((col("code") == "a") | (col("code") == "d"), "A")
    .when((col("code") == "b") & (col("amt") == "4"), "B")
    .otherwise("A1"),
)

# Show the result
df5.printSchema()
df5.show()

-------------------------------------------------------------------------------
|first_name  |middle_name  |last_name  |dob    |gender  |salary  |new_gender  |
-------------------------------------------------------------------------------
|James       |             |Smith      |36636  |M       |60000   |Male        |
|Michael     |Rose         |           |40288  |M       |70000   |Male        |
|Robert      |             |Williams   |42114  |        |400000  |Unknown     |
|Maria       |Anne         |Jones      |39192  |F       |500000  |Female      |
|Jen         |Mary         |Brown      |       |F       |0       |Female      |
-------------------------------------------------------------------------------

root
 |-- `id`: IntegerType() (nullable = False)
 |-- `code`: StringType() (nullable = False)
 |-- `amt`: StringType() (nullable = False)
 |-- `new_column`: StringType() (nullable = False)
--------------------------------
|id  |code  |amt  |new_column  |
------------------------

In [8]:
# Close the session that was created earlier in the notebook.
session.close()