In [1]:
from clickzetta.zettapark.session import Session

In [2]:
import json
# Load the connection parameters from the JSON configuration file.
with open('security/config-uat.json', 'r') as config_file:
    config = json.loads(config_file.read())

# Build the session from the loaded parameters.
session = (
    Session.builder
    .configs(config)
    .create()
)

In [4]:
# Column names, listed in the same order as the fields of each record below.
columns = ["firstname", "lastname", "gender", "salary"]

# Three sample records used throughout the notebook.
data = [
    ('James', 'Smith', 'M', 30),
    ('Anna', 'Rose', 'F', 41),
    ('Robert', 'Williams', 'M', 62),
]

# Build the DataFrame from the local records and display it.
df = session.createDataFrame(data=data, schema=columns)
df.show()


------------------------------------------
|firstname  |lastname  |gender  |salary  |
------------------------------------------
|James      |Smith     |M       |30      |
|Anna       |Rose      |F       |41      |
|Robert     |Williams  |M       |62      |
------------------------------------------



In [5]:
# from clickzetta.zettapark.functions import concat_ws,col,lit
# df.select(concat_ws(",",df.firstname,df.lastname).alias("name"), \
#           df.gender,lit(df.salary*2).alias("new_salary")).show()

# print(df.collect())
# rdd=df.rdd.map(lambda x: 
#     (x[0]+","+x[1],x[2],x[3]*2)
#     )  
# df2=rdd.toDF(["name","gender","new_salary"]   )
# df2.show()


In [6]:
from clickzetta.zettapark.functions import concat_ws, col, lit

# Express the projection with DataFrame column expressions:
#   name       -> firstname and lastname joined by a comma
#   gender     -> passed through unchanged
#   new_salary -> salary multiplied by 2
projected_columns = [
    concat_ws(lit(","), df.firstname, df.lastname).alias("name"),
    df.gender,
    (df.salary * 2).alias("new_salary"),
]
df2 = df.select(*projected_columns)

# Display the resulting DataFrame.
df2.show()

# Collect the rows locally and print them as a list.
print(df2.collect())


-----------------------------------------
|name             |gender  |new_salary  |
-----------------------------------------
|James,Smith      |M       |60          |
|Anna,Rose        |F       |82          |
|Robert,Williams  |M       |124         |
-----------------------------------------

[Row(name='James,Smith', gender='M', new_salary=60), Row(name='Anna,Rose', gender='F', new_salary=82), Row(name='Robert,Williams', gender='M', new_salary=124)]


In [7]:
# #Referring Column Names
# rdd2=df.rdd.map(lambda x: 
#     (x["firstname"]+","+x["lastname"],x["gender"],x["salary"]*2)
#     ) 


# #Referring Column Names
# rdd2=df.rdd.map(lambda x: 
#     (x.firstname+","+x.lastname,x.gender,x.salary*2)
#     ) 


# def func1(x):
#     firstName=x.firstname
#     lastName=x.lastname
#     name=firstName+","+lastName
#     gender=x.gender.lower()
#     salary=x.salary*2
#     return (name,gender,salary)

# rdd2=df.rdd.map(lambda x: func1(x))

# #Foreach example
# def f(x): print(x)
# df.rdd.foreach(f)

# df.rdd.foreach(lambda x: 
#     print("Data ==>"+x["firstname"]+","+x["lastname"]+","+x["gender"]+","+str(x["salary"]*2))
#     ) 
    
# #Iterate collected data
# dataCollect = df.collect()
# for row in dataCollect:
#     print(row['firstname'] + "," +row['lastname'])
    
# #Convert to Pandas and Iterate

# dataCollect=df.rdd.toLocalIterator()
# for row in dataCollect:
#     print(row['firstname'] + "," +row['lastname'])

# import pandas as pd
# pandasDF = df.toPandas()
# for index, row in pandasDF.iterrows():
#     print(row['firstname'], row['gender'])

In [8]:
# 将RDD代码改为DF方式

In [10]:
# `lit` is imported here because this cell calls it below; without it the cell
# only runs when an earlier cell has already brought `lit` into the kernel.
from clickzetta.zettapark.functions import concat_ws, col, lit, lower

# Variant 1: refer to columns by name through col(), without using an RDD.
df2 = df.select(
    concat_ws(lit(","), col("firstname"), col("lastname")).alias("name"),
    col("gender"),
    (col("salary") * 2).alias("new_salary")
)
df2.show()

# Variant 2: the same projection, referring to columns through df[...] indexing
# (DataFrame API instead of RDD). This rebinds df2 to the second result.
df2 = df.select(
    concat_ws(lit(","), df["firstname"], df["lastname"]).alias("name"),
    df["gender"],
    (df["salary"] * 2).alias("new_salary")
)
df2.show()

# Per-row callback for the foreach-style example that avoids the RDD API.
def f(row):
    """Print a single row to standard output."""
    print(row)

# Fetch all rows with collect() once; every loop below walks this local list.
dataCollect = df.collect()

# Pass 1: hand each collected row to the callback f.
for row in dataCollect:
    f(row)

# Pass 2: print a comma-separated summary per row, with the salary doubled.
for row in dataCollect:
    fields = (
        row["firstname"],
        row["lastname"],
        row["gender"],
        str(row["salary"] * 2),
    )
    print("Data ==> " + ",".join(fields))

# Pass 3: iterate the collected data without an RDD, printing "firstname,lastname".
for row in dataCollect:
    print(",".join((row['firstname'], row['lastname'])))


-----------------------------------------
|name             |gender  |new_salary  |
-----------------------------------------
|James,Smith      |M       |60          |
|Anna,Rose        |F       |82          |
|Robert,Williams  |M       |124         |
-----------------------------------------

-----------------------------------------
|name             |gender  |new_salary  |
-----------------------------------------
|James,Smith      |M       |60          |
|Anna,Rose        |F       |82          |
|Robert,Williams  |M       |124         |
-----------------------------------------

Row(firstname='James', lastname='Smith', gender='M', salary=30)
Row(firstname='Anna', lastname='Rose', gender='F', salary=41)
Row(firstname='Robert', lastname='Williams', gender='M', salary=62)
Data ==> James,Smith,M,60
Data ==> Anna,Rose,F,82
Data ==> Robert,Williams,M,124
James,Smith
Anna,Rose
Robert,Williams


In [12]:
import pandas as pd
from clickzetta.zettapark.functions import col

# Collect the rows and build a pandas DataFrame from them, reusing the
# source DataFrame's column names.
data = df.collect()
pandasDF = pd.DataFrame(data, columns=df.columns)

# Walk the two columns of interest in lockstep and print one line per row.
for first_name, gender in zip(pandasDF['firstname'], pandasDF['gender']):
    print(first_name, gender)


James M
Anna F
Robert M


In [13]:
# Close the session that was created from the config file earlier in the notebook.
session.close()