### [Spark Developer Certification - Comprehensive Study Guide](https://github.com/mdrakiburrahman/databricks-certification)

~/spark/databrick-cert/databricks-certification/Comprehensive_study_guide_for_Spark_Developer_Certification.html

In [2]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import *
from pyspark import StorageLevel
import sys

In [3]:
from IPython.display import display

In [5]:
# Reuse the already-running SparkSession if one exists; otherwise start one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [8]:
# Bare expression: lets the notebook render the session object as this cell's output.
spark

In [6]:
# Build a DataFrame from a plain Python list, using an atomic integer type as the schema.
list_df = spark.createDataFrame(
    data=[1, 2, 3, 4],
    schema=IntegerType(),
)

In [7]:
# Print the rows as a text table; the single column comes out named `value`.
list_df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
+-----+



In [23]:
# Example data for the join demos: employees and the departments they belong to.

# Employee rows carry two fields: name and gender.
Employee = Row("name", "gender")
employee1 = Employee("Bob", "M")
employee2 = Employee("Sam", "M")
employee3 = Employee("Jane", "F")  # NOTE: defined here but not loaded into employeeDF below

# Department rows carry two fields: name (of the employee) and department.
Department = Row("name", "department")
department1 = Department("Bob", "Accounts")
department2 = Department("Alice", "Sales")  # there is no employee row named Alice
department3 = Department("Sam", "HR")

# Turn the rows into DataFrames; the schema is inferred from the Row fields.
employeeDF = spark.createDataFrame(data=[employee1, employee2])
departmentDF = spark.createDataFrame(
    data=[department1, department2, department3],
)

In [30]:
# Join condition shared by all join demos below: employee name equals department-side name.
joinExpression = employeeDF.name == departmentDF.name

In [31]:
# Inner join: keep only the rows whose name appears on both sides.
# Joining on an expression keeps the `name` column from each side in the result.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="inner")
    .show()
)

+----+------+----+----------+
|name|gender|name|department|
+----+------+----+----------+
| Bob|     M| Bob|  Accounts|
| Sam|     M| Sam|        HR|
+----+------+----+----------+



In [32]:
# Left outer join: keep every employee row, matched or not.
# With this sample data every row in employeeDF has a department, so the
# result is the same as the inner join.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="left_outer")
    .show()
)

+----+------+----+----------+
|name|gender|name|department|
+----+------+----+----------+
| Bob|     M| Bob|  Accounts|
| Sam|     M| Sam|        HR|
+----+------+----+----------+



In [33]:
# Left semi join: keep the employee rows that have a match, returning
# only the left-hand (employee) columns.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="left_semi")
    .show()
)

+----+------+
|name|gender|
+----+------+
| Bob|     M|
| Sam|     M|
+----+------+



In [34]:
# Left anti join: keep the employee rows that have NO match on the right.
# The result is empty here because every row in employeeDF has a department.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="left_anti")
    .show()
)

+----+------+
|name|gender|
+----+------+
+----+------+



In [35]:
# Right outer join: keep every department row; the employee-side columns
# are null where no employee matches (the Alice / Sales row).
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="right_outer")
    .show()
)

+----+------+-----+----------+
|name|gender| name|department|
+----+------+-----+----------+
| Bob|     M|  Bob|  Accounts|
| Sam|     M|  Sam|        HR|
|null|  null|Alice|     Sales|
+----+------+-----+----------+



In [36]:
# "cross" join WITH a join condition: the condition still filters the pairs,
# so the output matches the inner join rather than the full Cartesian product.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="cross")
    .show()
)

+----+------+----+----------+
|name|gender|name|department|
+----+------+----+----------+
| Bob|     M| Bob|  Accounts|
| Sam|     M| Sam|        HR|
+----+------+----+----------+



In [37]:
# Full outer join: keep unmatched rows from both sides, padding with nulls.
# Only the department side has an unmatched row (Alice / Sales) in this data.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="outer")
    .show()
)

+----+------+-----+----------+
|name|gender| name|department|
+----+------+-----+----------+
| Bob|     M|  Bob|  Accounts|
| Sam|     M|  Sam|        HR|
|null|  null|Alice|     Sales|
+----+------+-----+----------+



In [39]:
# how="full" yields the same rows as how="outer": a full outer join.
(
    employeeDF
    .join(other=departmentDF, on=joinExpression, how="full")
    .show()
)

+----+------+-----+----------+
|name|gender| name|department|
+----+------+-----+----------+
| Bob|     M|  Bob|  Accounts|
| Sam|     M|  Sam|        HR|
|null|  null|Alice|     Sales|
+----+------+-----+----------+



In [40]:
# Explicit schema: a nullable string column followed by a nullable integer column.
schema = (
    StructType()
    .add("letter", StringType(), True)
    .add("position", IntegerType(), True)
)

# One tuple per row, with values in schema field order.
data = [("A", 1), ("B", 2), ("C", 3)]
df = spark.createDataFrame(data, schema)
df.show()

+------+--------+
|letter|position|
+------+--------+
|     A|       1|
|     B|       2|
|     C|       3|
+------+--------+



In [41]:
# Example data with nesting: each department row is paired with a list of employee rows.

# Departments: an id string and a display name.
Department = Row("id", "name")
department1 = Department("123456", "Computer Science")
department2 = Department("789012", "Mechanical Engineering")
department3 = Department("345678", "Theater and Drama")
department4 = Department("901234", "Indoor Recreation")
department5 = Department("000000", "All Students")

# Employees: some first/last names are deliberately None.
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000)
employee2 = Employee("xiangrui", "meng", "no-reply@stanford.edu", 120000)
employee3 = Employee("matei", None, "no-reply@waterloo.edu", 140000)
employee4 = Employee(None, "wendell", "no-reply@berkeley.edu", 160000)
employee5 = Employee("michael", "jackson", "no-reply@neverla.nd", 80000)

# Each of these rows nests a Department row and a list of Employee rows.
DepartmentWithEmployees = Row("department", "employees")
departmentWithEmployees1 = DepartmentWithEmployees(department1, [employee1, employee2])
departmentWithEmployees2 = DepartmentWithEmployees(department2, [employee3, employee4])
departmentWithEmployees3 = DepartmentWithEmployees(department3, [employee5, employee4])
departmentWithEmployees4 = DepartmentWithEmployees(department4, [employee2, employee3])
departmentWithEmployees5 = DepartmentWithEmployees(
    department5,
    [employee1, employee2, employee3, employee4, employee5],
)

# Spot-check: a flat row, another flat row, and a field reached through the nesting.
print(department1)
print(employee2)
print(departmentWithEmployees1.employees[0].email)

Row(id='123456', name='Computer Science')
Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)
no-reply@berkeley.edu


In [43]:
# Collect the nested rows and load them into a single DataFrame.
departmentsWithEmployeesSeq1 = [
    departmentWithEmployees1,
    departmentWithEmployees2,
    departmentWithEmployees3,
    departmentWithEmployees4,
    departmentWithEmployees5,
]
df1 = spark.createDataFrame(data=departmentsWithEmployeesSeq1)

# truncate=False prints full cell contents instead of cutting long values short.
df1.show(truncate=False)

+--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|department                      |employees                                                                                                                                                                                                                                 |
+--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[123456, Computer Science]      |[[michael, armbrust, no-reply@berkeley.edu, 100000], [xiangrui, meng, no-reply@stanford.edu, 120000]]                                                       

In [45]:
# Vertical layout prints one "column | value" line per field under a
# RECORD header, which is easier to read for very wide rows.
df1.show(
    truncate=False,
    vertical=True,
)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 department | [123456, Computer Science]                                                                                                                                                                                                                 
 employees  | [[michael, armbrust, no-reply@berkeley.edu, 100000], [xiangrui, meng, no-reply@stanford.edu, 120000]]                                                                                                                                      
-RECORD 1------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [46]:
# Odd numbers from 1 up to (but not including) 8, in a single column named "number".
# NOTE: this rebinds `df`, replacing the letter/position DataFrame built earlier.
df = spark.range(start=1, end=8, step=2).toDF("number")
df.show()

+------+
|number|
+------+
|     1|
|     3|
|     5|
|     7|
+------+

