In [1]:
from clickzetta.zettapark.session import Session
from clickzetta.zettapark.functions import col,sum,avg,max

In [2]:
import json
# Read the connection parameters from the JSON config file.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('security/config-uat.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Create the session from the loaded configuration.
session = Session.builder.configs(config).create()

In [3]:
# In-memory sample rows, one tuple per employee. The field order matches the
# column names listed in `schema` when the DataFrame is created:
# (employee_name, department, state, salary, age, bonus)
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NV", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "DE", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "NV", 80000, 25, 18000),
    ("Kumar", "Marketing", "NJ", 91000, 50, 21000),
]


In [4]:
# Build a DataFrame from the in-memory rows, naming the columns via `schema`,
# then print its schema and contents.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
df = session.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show()

# Total salary per state using the grouped-data shortcut.
df.groupBy("state").sum("salary").show()

# Same aggregation through agg(), with an explicit alias for the result column.
# NOTE: `sum` here is the zettapark column function imported in the first cell,
# not Python's builtin.
dfGroup = df.groupBy("state").agg(sum("salary").alias("sum_salary"))
dfGroup.show()

# Keep only the states whose total salary is above 100000.
dfFilter = dfGroup.filter(dfGroup.sum_salary > 100000)
dfFilter.show()

root
 |-- `employee_name`: StringType() (nullable = False)
 |-- `department`: StringType() (nullable = False)
 |-- `state`: StringType() (nullable = False)
 |-- `salary`: IntegerType() (nullable = False)
 |-- `age`: IntegerType() (nullable = False)
 |-- `bonus`: IntegerType() (nullable = False)
-------------------------------------------------------------
|employee_name  |department  |state  |salary  |age  |bonus  |
-------------------------------------------------------------
|James          |Sales       |NY     |90000   |34   |10000  |
|Michael        |Sales       |NV     |86000   |56   |20000  |
|Robert         |Sales       |CA     |81000   |30   |23000  |
|Maria          |Finance     |CA     |90000   |24   |23000  |
|Raman          |Finance     |DE     |99000   |40   |24000  |
|Scott          |Finance     |NY     |83000   |36   |19000  |
|Jen            |Finance     |NY     |79000   |53   |15000  |
|Jeff           |Marketing   |NV     |80000   |25   |18000  |
|Kumar          |Marke

In [5]:
from clickzetta.zettapark.functions import asc, desc

# Sort the filtered totals by column name only (no explicit direction given).
dfFilter.sort("sum_salary").show()

# Sort the same frame with an explicit descending order.
dfFilter.sort(desc("sum_salary")).show()

# The whole pipeline as a single chain: group, aggregate, filter, sort, show.
(
    df.groupBy("state")
    .agg(sum("salary").alias("sum_salary"))
    .filter(col("sum_salary") > 100000)
    .sort(desc("sum_salary"))
    .show()
)

# The same query in SQL, run against a temporary view over `df`.
df.createOrReplaceTempView("EMP")
session.sql(
    "select state, sum(salary) as sum_salary from EMP "
    "group by state having sum_salary > 100000 "
    "order by sum_salary desc"
).show()



----------------------
|state  |sum_salary  |
----------------------
|NV     |166000      |
|CA     |171000      |
|NY     |252000      |
----------------------

----------------------
|state  |sum_salary  |
----------------------
|NY     |252000      |
|CA     |171000      |
|NV     |166000      |
----------------------

----------------------
|state  |sum_salary  |
----------------------
|NY     |252000      |
|CA     |171000      |
|NV     |166000      |
----------------------

----------------------
|state  |sum_salary  |
----------------------
|NY     |252000      |
|CA     |171000      |
|NV     |166000      |
----------------------



In [6]:
# Close the session opened near the top of the notebook, now that every
# query above has been run and shown.
session.close()