In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [3]:
# Reuse the active session if there is one; otherwise start one named "cheatsheet".
spark = (
    SparkSession.builder
    .appName("cheatsheet")
    .getOrCreate()
)

In [4]:
spark

In [5]:
# Loading Data

## From an RDD

# Every value is a Python string, so each column is inferred as a string type.
columns = ["Id", "Name", "Salary", "DeptId"]
data = [
    ("1", "Joe", "100000", "1"),
    ("2", "Jen", "110000", "1"),
]

# Distribute the local rows as an RDD first, then attach the column names.
people_rdd = spark.sparkContext.parallelize(data)
df = people_rdd.toDF(columns)
df.show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  1| Joe|100000|     1|
|  2| Jen|110000|     1|
+---+----+------+------+



In [11]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)
 |-- DeptId: string (nullable = true)



In [12]:
df.schema

StructType(List(StructField(Id,StringType,true),StructField(Name,StringType,true),StructField(Salary,StringType,true),StructField(DeptId,StringType,true)))

In [13]:
df.count()

2

In [6]:
## createDataFrame

# Build the frame directly from local rows; a Python None becomes a SQL null.
columns = ["Id", "Name", "Salary", "DeptId"]
data = [
    ("10", "Ana", "200000", "2"),
    ("12", "Bob", "210000", None),
]

df2 = spark.createDataFrame(data, columns)
df2.show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
| 10| Ana|200000|     2|
| 12| Bob|210000|  null|
+---+----+------+------+



In [39]:
# A DDL-formatted string declares the column types explicitly instead of inferring them.
schema = "Id INT, Dept STRING"
data = [(1, "Dev"), (2, "HR")]

dept_df = spark.createDataFrame(data, schema)
dept_df.show()

+---+----+
| Id|Dept|
+---+----+
|  1| Dev|
|  2|  HR|
+---+----+



In [45]:
# Convert the string key columns to integers. The cast columns are placed after
# the untouched ones, so Id and DeptId end up as the last two columns.
int_columns = ["Id", "DeptId"]
kept_columns = [name for name in df.columns if name not in int_columns]
cast_columns = [F.col(name).cast(IntegerType()).alias(name) for name in int_columns]
df = df.select(kept_columns + cast_columns)

df.show()

+----+------+---+------+
|Name|Salary| Id|DeptId|
+----+------+---+------+
| Joe|100000|  1|     1|
| Jen|110000|  2|     1|
+----+------+---+------+



In [47]:
# Same conversion for the second frame: keep the other columns as they are and
# append integer versions of Id and DeptId at the end.
int_columns = ["Id", "DeptId"]
kept_columns = [name for name in df2.columns if name not in int_columns]
cast_columns = [F.col(name).cast(IntegerType()).alias(name) for name in int_columns]
df2 = df2.select(kept_columns + cast_columns)

df2.show()

+----+------+---+------+
|Name|Salary| Id|DeptId|
+----+------+---+------+
| Ana|200000| 10|     2|
| Bob|210000| 12|  null|
+----+------+---+------+



In [46]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Salary: string (nullable = true)
 |-- Id: integer (nullable = true)
 |-- DeptId: integer (nullable = true)



In [42]:
df.show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  1| Joe|100000|     1|
|  2| Jen|110000|     1|
+---+----+------+------+



In [48]:
dept_df.show()

+---+----+
| Id|Dept|
+---+----+
|  1| Dev|
|  2|  HR|
+---+----+



In [52]:
# Inner join: only people whose DeptId has a matching department row are kept.
dept_match = df.DeptId == dept_df.Id
with_dept = df.join(dept_df, on=dept_match, how="inner")
with_dept.select(df.Id, df.Name, df.Salary, dept_df.Dept).show()

+---+----+------+----+
| Id|Name|Salary|Dept|
+---+----+------+----+
|  1| Joe|100000| Dev|
|  2| Jen|110000| Dev|
+---+----+------+----+



In [55]:
# Stack both person frames, then look up each person's department name.
# unionByName matches columns by name; plain union() matches them by position
# and would silently mix up values if the two frames had different column orders.
people = df.unionByName(df2)

# Reference the columns of the unioned frame itself rather than those of `df`,
# so the join condition and projection do not rely on the union reusing df's
# column identities.
person_df = (
    people
    .join(dept_df, people.DeptId == dept_df.Id, how="left")  # left join keeps people without a department
    .select(people.Id, people.Name, people.Salary, dept_df.Dept)
    .orderBy(F.col("Name"))
)

person_df.show()

+---+----+------+----+
| Id|Name|Salary|Dept|
+---+----+------+----+
| 10| Ana|200000|  HR|
| 12| Bob|210000|null|
|  2| Jen|110000| Dev|
|  1| Joe|100000| Dev|
+---+----+------+----+



In [15]:
# F.count(column) ignores nulls, so this shows the non-null count of each column.
non_null_counts = [F.count(name).alias(name) for name in df2.columns]
df2.agg(*non_null_counts).show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  2|   2|     2|     1|
+---+----+------+------+



In [16]:
def my_count(df):
    """Return a one-row DataFrame with the non-null count of every column in ``df``.

    Each output column carries the same name as the input column it counts.
    """
    per_column = [F.count(name).alias(name) for name in df.columns]
    return df.agg(*per_column)

In [17]:
my_count(df).show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  2|   2|     2|     2|
+---+----+------+------+



In [18]:
my_count(df2).show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  2|   2|     2|     1|
+---+----+------+------+



In [20]:
def raise_salary(sal: float, pct: float = 0.2) -> float:
    """Return ``sal`` increased by the fraction ``pct``.

    Args:
        sal: Current salary. ``None`` is tolerated because Spark passes SQL
            NULLs to Python UDFs as ``None``.
        pct: Raise expressed as a fraction (0.2 means +20%).

    Returns:
        ``sal * (1 + pct)``, or ``None`` when ``sal`` is ``None``.
    """
    if sal is None:
        # Propagate NULL instead of failing the whole job with a TypeError.
        return None
    return sal * (1 + pct)

In [21]:
raise_salary(100)

120.0

In [22]:
# Wrap the plain Python function so it can be applied to DataFrame columns.
fn_raise_salary = F.udf(f=raise_salary, returnType=FloatType())

In [29]:
# Salary is stored as a string, so cast it before handing it to the UDF.
salary_as_float = F.col("Salary").cast(FloatType())
df_1 = df.withColumn("NewSalary", fn_raise_salary(salary_as_float))

In [30]:
df_1.show()

+---+----+------+------+---------+
| Id|Name|Salary|DeptId|NewSalary|
+---+----+------+------+---------+
|  1| Joe|100000|     1| 120000.0|
|  2| Jen|110000|     1| 132000.0|
+---+----+------+------+---------+



In [33]:
# Register the same Python function under a SQL-callable name so that
# spark.sql() queries can invoke it as fn_raise_salary(...).
spark.udf.register("fn_raise_salary", raise_salary, FloatType())

<function __main__.raise_salary(sal: float, pct: float = 0.2) -> float>

In [34]:
df.createOrReplaceTempView("person")

In [35]:
spark.sql("select * from person").show()

+---+----+------+------+
| Id|Name|Salary|DeptId|
+---+----+------+------+
|  1| Joe|100000|     1|
|  2| Jen|110000|     1|
+---+----+------+------+



In [38]:
# Salary is a string column in the view, so the query casts it to FLOAT before calling the UDF.
raise_query = "select p.*, fn_raise_salary(CAST(p.Salary as FLOAT)) as NewSalary from person p"
spark.sql(raise_query).show()

+---+----+------+------+---------+
| Id|Name|Salary|DeptId|NewSalary|
+---+----+------+------+---------+
|  1| Joe|100000|     1| 120000.0|
|  2| Jen|110000|     1| 132000.0|
+---+----+------+------+---------+



In [18]:
## Save data

# coalesce(1) collapses the frame to one partition so a single part file is written;
# "overwrite" replaces whatever already exists at the target path.
json_writer = df.coalesce(1).write.mode("overwrite").format("json")
json_writer.save("person.json")

In [19]:
!pwd

/home/wengong/projects/py4kids/lesson-17-pyspark/databrick


In [20]:
!ls

apache-spark-resources.xlsx	      pyspark-tmp.ipynb
LearningApacheSpark-CheatSheet.ipynb  spark-cert.README.md
person.json			      spark-summit-2015
pyspark-cheat-sheet.ipynb


In [21]:
!ls person.json/

part-00000-90f0a19a-b2e6-440d-aebe-3d7cf18b6904-c000.json  _SUCCESS


In [22]:
!cat person.json/part-00000-90f0a19a-b2e6-440d-aebe-3d7cf18b6904-c000.json

{"Id":"1","Name":"Joe","Salary":"100000","DeptId":"1"}
{"Id":"2","Name":"Jen","Salary":"110000","DeptId":"1"}


In [23]:
# "append" adds a new part file next to the existing ones instead of replacing them.
json_appender = df2.coalesce(1).write.mode("append").format("json")
json_appender.save("person.json")

In [24]:
!ls person.json/

part-00000-263cf6a3-38e8-4c5a-9cc8-8cdee188463e-c000.json  _SUCCESS
part-00000-90f0a19a-b2e6-440d-aebe-3d7cf18b6904-c000.json


In [26]:
!cat person.json/part-00000-90f0a19a-b2e6-440d-aebe-3d7cf18b6904-c000.json

{"Id":"1","Name":"Joe","Salary":"100000","DeptId":"1"}
{"Id":"2","Name":"Jen","Salary":"110000","DeptId":"1"}


In [25]:
!cat person.json/part-00000-263cf6a3-38e8-4c5a-9cc8-8cdee188463e-c000.json

{"Id":"10","Name":"Ana","Salary":"200000","DeptId":"2"}
{"Id":"12","Name":"Bob","Salary":"210000"}


In [27]:
## Read json

# Pointing the reader at the output directory loads every part file inside it.
json_reader = spark.read
df3 = json_reader.json("person.json")

In [28]:
df3.show()

+------+---+----+------+
|DeptId| Id|Name|Salary|
+------+---+----+------+
|     1|  1| Joe|100000|
|     1|  2| Jen|110000|
|     2| 10| Ana|200000|
|  null| 12| Bob|210000|
+------+---+----+------+



In [7]:
# Write a single CSV part file with a header row, replacing any previous output.
csv_writer = df.coalesce(1).write.mode("overwrite")
csv_writer.csv("person-csv", header=True)

In [8]:
!ls

apache-spark-resources.xlsx	      pyspark-cheat-sheet.ipynb
LearningApacheSpark-CheatSheet.ipynb  pyspark-tmp.ipynb
person-csv			      spark-cert.README.md
person.csv			      spark-summit-2015
person.json


In [9]:
!ls person-csv/

part-00000-6b313fba-f2ce-4e0a-b861-02f4f935483f-c000.csv  _SUCCESS


In [10]:
!cat person-csv/part-00000-6b313fba-f2ce-4e0a-b861-02f4f935483f-c000.csv

Id,Name,Salary,DeptId
1,Joe,100000,1
2,Jen,110000,1


In [35]:
# Shut the Spark session down now that the notebook is finished with it.
spark.stop()