In [1]:
from clickzetta.zettapark.session import Session

In [2]:
import json
# Read the connection parameters from the JSON config file.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open('security/config-uat.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Create the session from the loaded configuration.
session = Session.builder.configs(config).create()

In [4]:
# Sample rows, one tuple per employee: (employee_name, department, salary).
# Note that the ("James", "Sales", 3000) row appears twice.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

# Column names, in the same order as the fields of each tuple above.
columns = ["employee_name", "department", "salary"]

# Build the DataFrame from the in-memory rows, then print its schema and rows.
df = session.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show()



root
 |-- `employee_name`: StringType() (nullable = False)
 |-- `department`: StringType() (nullable = False)
 |-- `salary`: IntegerType() (nullable = False)
---------------------------------------
|employee_name  |department  |salary  |
---------------------------------------
|James          |Sales       |3000    |
|Michael        |Sales       |4600    |
|Robert         |Sales       |4100    |
|Maria          |Finance     |3000    |
|James          |Sales       |3000    |
|Scott          |Finance     |3300    |
|Jen            |Finance     |3900    |
|Jeff           |Marketing   |3000    |
|Kumar          |Marketing   |2000    |
|Saif           |Sales       |4100    |
---------------------------------------



In [5]:
from clickzetta.zettapark.window import Window
from clickzetta.zettapark.functions import (
    cume_dist,
    dense_rank,
    lag,
    lead,
    ntile,
    percent_rank,
    rank,
    row_number,
)

# Window over each department, with rows ordered by salary.
windowSpec = Window.partitionBy("department").orderBy("salary")

# (new column name, window function) pairs; one table is shown per pair,
# in exactly this order.
window_demos = [
    ("row_number", row_number()),
    ("rank", rank()),
    ("dense_rank", dense_rank()),
    ("percent_rank", percent_rank()),
    ("ntile", ntile(2)),
    ("cume_dist", cume_dist()),
    ("lag", lag("salary", 2)),
    ("lead", lead("salary", 2)),
]
for column_name, window_fn in window_demos:
    df.withColumn(column_name, window_fn.over(windowSpec)).show()

# Partition-only window (no ordering), used for the per-department aggregates.
windowSpecAgg = Window.partitionBy("department")


----------------------------------------------------
|employee_name  |department  |salary  |row_number  |
----------------------------------------------------
|Maria          |Finance     |3000    |1           |
|Scott          |Finance     |3300    |2           |
|Jen            |Finance     |3900    |3           |
|Kumar          |Marketing   |2000    |1           |
|Jeff           |Marketing   |3000    |2           |
|James          |Sales       |3000    |1           |
|James          |Sales       |3000    |2           |
|Robert         |Sales       |4100    |3           |
|Saif           |Sales       |4100    |4           |
|Michael        |Sales       |4600    |5           |
----------------------------------------------------

----------------------------------------------
|employee_name  |department  |salary  |rank  |
----------------------------------------------
|Maria          |Finance     |3000    |1     |
|Scott          |Finance     |3300    |2     |
|Jen            |Finan

In [6]:
# Import the aggregate functions under aliases so they do not shadow Python's
# built-in sum/min/max in the notebook namespace.
from clickzetta.zettapark.functions import (
    col,
    avg,
    row_number,
    sum as sum_,
    min as min_,
    max as max_,
)

# Attach the per-department avg/sum/min/max of salary to every row, number the
# rows within each department, and keep only row 1 so that each department
# appears once in the displayed result.
(
    df.withColumn("row", row_number().over(windowSpec))
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
    .withColumn("sum", sum_(col("salary")).over(windowSpecAgg))
    .withColumn("min", min_(col("salary")).over(windowSpecAgg))
    .withColumn("max", max_(col("salary")).over(windowSpecAgg))
    .where(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)

---------------------------------------------
|department  |avg     |sum    |min   |max   |
---------------------------------------------
|Finance     |3400.0  |10200  |3000  |3900  |
|Marketing   |2500.0  |5000   |2000  |3000  |
|Sales       |3760.0  |18800  |3000  |4600  |
---------------------------------------------



In [7]:
# Close the session that was created near the top of the notebook.
session.close()