In [1]:
import clickzetta.zettapark
from clickzetta.zettapark.session import Session
from clickzetta.zettapark.functions import split, col, lit

In [2]:
import json

# Read the connection parameters from the JSON config file. The encoding is
# passed explicitly so the file is decoded as UTF-8 regardless of the
# platform's locale default.
with open('security/config-uat.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Create the session from the loaded configuration.
session = Session.builder.configs(config).create()

In [4]:
# Sample rows in the order (firstname, middlename, lastname, dob);
# dob is a 'YYYY-MM-DD' string and missing name parts are empty strings.
data = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]

columns = ["firstname", "middlename", "lastname", "dob"]

# Build the DataFrame, then print its schema and contents.
df = session.createDataFrame(data, columns)
df.printSchema()
df.show()

root
 |-- `firstname`: StringType() (nullable = False)
 |-- `middlename`: StringType() (nullable = False)
 |-- `lastname`: StringType() (nullable = False)
 |-- `dob`: StringType() (nullable = False)
--------------------------------------------------
|firstname  |middlename  |lastname  |dob         |
--------------------------------------------------
|James      |            |Smith     |1991-04-01  |
|Michael    |Rose        |          |2000-05-19  |
|Robert     |            |Williams  |1978-09-05  |
|Maria      |Anne        |Jones     |1967-12-01  |
|Jen        |Mary        |Brown     |1980-02-17  |
--------------------------------------------------



In [5]:
# Add year / month / day columns by splitting `dob` on '-' and taking the
# element at index 0, 1 and 2. The split expression is written inline for
# each derived column.
df1 = (
    df
    .withColumn('year', split(df['dob'], lit('-')).getItem(0))
    .withColumn('month', split(df['dob'], lit('-')).getItem(1))
    .withColumn('day', split(df['dob'], lit('-')).getItem(2))
)
df1.printSchema()
df1.show()

root
 |-- `firstname`: StringType() (nullable = False)
 |-- `middlename`: StringType() (nullable = False)
 |-- `lastname`: StringType() (nullable = False)
 |-- `dob`: StringType() (nullable = False)
 |-- `year`: StringType() (nullable = True)
 |-- `month`: StringType() (nullable = True)
 |-- `day`: StringType() (nullable = True)
-----------------------------------------------------------------------
|firstname  |middlename  |lastname  |dob         |year  |month  |day  |
-----------------------------------------------------------------------
|James      |            |Smith     |1991-04-01  |1991  |04     |01   |
|Michael    |Rose        |          |2000-05-19  |2000  |05     |19   |
|Robert     |            |Williams  |1978-09-05  |1978  |09     |05   |
|Maria      |Anne        |Jones     |1967-12-01  |1967  |12     |01   |
|Jen        |Mary        |Brown     |1980-02-17  |1980  |02     |17   |
-----------------------------------------------------------------------



In [6]:
# Alternative: build the split expression once and reuse it for every
# derived column.
split_col = split(df['dob'], lit('-'))
df2 = (
    df
    .withColumn('year', split_col.getItem(0))
    .withColumn('month', split_col.getItem(1))
    .withColumn('day', split_col.getItem(2))
)
df2.show()

# Same result with a single select(): keep the original columns and alias
# each element of the split expression.
df3 = df.select(
    "firstname",
    "middlename",
    "lastname",
    "dob",
    split_col.getItem(0).alias('year'),
    split_col.getItem(1).alias('month'),
    split_col.getItem(2).alias('day'),
)
df3.show()

-----------------------------------------------------------------------
|firstname  |middlename  |lastname  |dob         |year  |month  |day  |
-----------------------------------------------------------------------
|James      |            |Smith     |1991-04-01  |1991  |04     |01   |
|Michael    |Rose        |          |2000-05-19  |2000  |05     |19   |
|Robert     |            |Williams  |1978-09-05  |1978  |09     |05   |
|Maria      |Anne        |Jones     |1967-12-01  |1967  |12     |01   |
|Jen        |Mary        |Brown     |1980-02-17  |1980  |02     |17   |
-----------------------------------------------------------------------

-----------------------------------------------------------------------
|firstname  |middlename  |lastname  |dob         |year  |month  |day  |
-----------------------------------------------------------------------
|James      |            |Smith     |1991-04-01  |1991  |04     |01   |
|Michael    |Rose        |          |2000-05-19  |2000  |05    

In [7]:
# Disabled experiment, kept for reference as comments rather than as an unused
# string literal (the regex backslashes were invalid escape sequences inside a
# non-raw string).
# NOTE(review): it also calls `regexp_replace`, which is not among the names
# imported from clickzetta.zettapark.functions in this notebook -- confirm
# before re-enabling.
#
# df4=session.createDataFrame([("20-13-2012-monday",)], ['date',])
#
# df4.select(split(df4.date,'^([\d]+-[\d]+-[\d])').alias('date'),
#     regexp_replace(split(df4.date,'^([\d]+-[\d]+-[\d]+)').getItem(1),'-','').alias('day')).show()

# split() with a regex pattern: break the string at every 'A' or 'B'.
df4 = session.createDataFrame([('oneAtwoBthree',)], ['str',])
df4.select(split(df4['str'], lit('[AB]')).alias('str')).show()


---------------------------
|str                      |
---------------------------
|['one', 'two', 'three']  |
---------------------------



In [8]:
# Close the session that was created earlier in this notebook.
session.close()