# PySpark Environment Setup

In [1]:
import os
import sys

# Settings kept from the original notebook.
# NOTE(review): these two look like options for opening JupyterLab through the
# `pyspark` launcher script — confirm they are still needed when the notebook
# is already running inside Jupyter.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'

# Point PYSPARK_PYTHON at the interpreter running this kernel instead of a bare
# 'python', which depends on whatever is first on PATH (and may not exist).
# Falls back to 'python' only if the interpreter path is unavailable.
os.environ['PYSPARK_PYTHON'] = sys.executable or 'python'

# Getting Started: A First SparkSession Program

In [2]:
# SparkSession is the entry point used throughout this notebook
from pyspark.sql import SparkSession

# Get the existing session, or create one that runs locally
# under the application name "Sparksession"
spark = (
    SparkSession.builder
    .appName("Sparksession")
    .master("local")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object as this cell's output
spark

# Creating a DataFrame

In [4]:
# Create an empty RDD through the session's SparkContext
rdd = spark.sparkContext.emptyRDD()
# Printing an RDD shows a description of the RDD object, not its elements
print(rdd)

EmptyRDD[0] at emptyRDD at NativeMethodAccessorImpl.java:0


In [5]:
from pyspark.sql.types import StructType, StructField, StringType

# Schema of our choosing: three nullable string columns holding the parts of a name
schema = StructType([
    StructField(label, StringType(), True)
    for label in ('First name', 'Middle name', 'Last name')
])

# Build a DataFrame from the empty RDD, giving it the schema above
df = spark.createDataFrame(rdd, schema=schema)

In [6]:
# Print the DataFrame's schema as an indented tree
df.printSchema()

root
 |-- First name: string (nullable = true)
 |-- Middle name: string (nullable = true)
 |-- Last name: string (nullable = true)



### Converting an existing RDD to Dataframe

In [7]:
# toDF(schema) turns the RDD into a DataFrame with the given schema
df1 = rdd.toDF(schema)
# Printing a DataFrame shows only its column names and types, not its rows
print(df1)

DataFrame[First name: string, Middle name: string, Last name: string]


### Creating an empty DataFrame

In [8]:
# No RDD needed: an empty list of rows plus a schema is enough
df2 = spark.createDataFrame(data=[], schema=schema)
print(df2)

DataFrame[First name: string, Middle name: string, Last name: string]


### Creating Empty DataFrame without any schema

In [9]:
# An empty StructType means the DataFrame has no columns at all
df3 = spark.createDataFrame(data=[], schema=StructType([]))
print(df3)

DataFrame[]


# Converting Pyspark RDD to DataFrame

### 1. Create a PySpark RDD

In [10]:
# Distribute a small list of (sector, id) tuples as an RDD
dept = [
    ("Finance", 300),
    ("Marketing", 450),
    ("Promotions", 250),
]
rdd = spark.sparkContext.parallelize(dept)

### 2. Convert Pyspark RDD to DataFrame

#### a. Using toDF()

In [11]:
# Without column names: the columns come out as _1, _2, ... and the types are inferred
df = rdd.toDF()
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [12]:
# Supply explicit column names instead of the positional defaults
columns = ["Sector", "Id"]
df = rdd.toDF(schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- Sector: string (nullable = true)
 |-- Id: long (nullable = true)

+----------+---+
|Sector    |Id |
+----------+---+
|Finance   |300|
|Marketing |450|
|Promotions|250|
+----------+---+



#### b. createDataFrame() function

In [13]:
# Same conversion, this time through the session: RDD in, column names as schema
df1 = spark.createDataFrame(data=rdd, schema=columns)
df1.show()

+----------+---+
|    Sector| Id|
+----------+---+
|   Finance|300|
| Marketing|450|
|Promotions|250|
+----------+---+



# Converting a Spark DataFrame to Pandas

In [14]:
# toPandas() returns the contents of the Spark DataFrame as a pandas DataFrame
df_pandas = df.toPandas()
print(df_pandas)

       Sector   Id
0     Finance  300
1   Marketing  450
2  Promotions  250


# Show()

In [15]:
# Sample rows of (sequence number, quote); the quotes vary in length on purpose
columns = ["Seqno", "Quote"]
data = [
    ("1", "Be the change that you wish to see in the world"),
    ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
    ("3", "The purpose of our lives is to be happy."),
    ("4", "Be cool."),
]
df = spark.createDataFrame(data, schema=columns)

In [16]:
# With no arguments, show() cuts every value longer than 20 characters (ending it with "...")
df.show()

+-----+--------------------+
|Seqno|               Quote|
+-----+--------------------+
|    1|Be the change tha...|
|    2|Everyone thinks o...|
|    3|The purpose of ou...|
|    4|            Be cool.|
+-----+--------------------+



In [17]:
# A number passed to show() limits how many rows are displayed
df.show(3)   # here n = 3, the number of rows to display

+-----+--------------------+
|Seqno|               Quote|
+-----+--------------------+
|    1|Be the change tha...|
|    2|Everyone thinks o...|
|    3|The purpose of ou...|
+-----+--------------------+
only showing top 3 rows



In [18]:
# When truncate is set to a number (here 40), values longer than that many characters are cut to that length
df.show(truncate = 40)

+-----+----------------------------------------+
|Seqno|                                   Quote|
+-----+----------------------------------------+
|    1|Be the change that you wish to see in...|
|    2|Everyone thinks of changing the world...|
|    3|The purpose of our lives is to be happy.|
|    4|                                Be cool.|
+-----+----------------------------------------+



In [19]:
# With truncate = False the column values are displayed in full (here for the first 3 rows)
df.show(3, truncate = False)

+-----+-----------------------------------------------------------------------------+
|Seqno|Quote                                                                        |
+-----+-----------------------------------------------------------------------------+
|1    |Be the change that you wish to see in the world                              |
|2    |Everyone thinks of changing the world, but no one thinks of changing himself.|
|3    |The purpose of our lives is to be happy.                                     |
+-----+-----------------------------------------------------------------------------+
only showing top 3 rows



In [20]:
# vertical = True prints each row as its own record block, one "column | value" line per column
df.show(n = 3, truncate = False, vertical = True)

-RECORD 0------------------------------------------------------------------------------
 Seqno | 1                                                                             
 Quote | Be the change that you wish to see in the world                               
-RECORD 1------------------------------------------------------------------------------
 Seqno | 2                                                                             
 Quote | Everyone thinks of changing the world, but no one thinks of changing himself. 
-RECORD 2------------------------------------------------------------------------------
 Seqno | 3                                                                             
 Quote | The purpose of our lives is to be happy.                                      
only showing top 3 rows



# StructType and StructField

## Define columns with schema

In [21]:
# Types needed for the student schema
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, FloatType

# Schema: one nullable field per (column name, type) pair
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('Student Id', IntegerType()),
        ('First Name', StringType()),
        ('Last Name', StringType()),
        ('Attendance', FloatType()),
    )
])

# Rows to insert: (student id, first name, last name, attendance)
data = [
    (1, "Santhosh", "Sharma", 89.07),
    (2, "Mahesh", "", 73.89),
    (3, "Yateesh", "Chandra", 92.676),
    (4, "Kranthi", "Chanpathi", 87.273),
    (5, "Samdhani", "", 84.30),
]

In [22]:
# Build the DataFrame from the rows and the schema
df = spark.createDataFrame(data=data, schema=schema)

# Inspect the resulting column names, types and nullability
df.printSchema()

root
 |-- Student Id: integer (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Attendance: float (nullable = true)



In [23]:
# Display the rows of the DataFrame
df.show()

+----------+----------+---------+----------+
|Student Id|First Name|Last Name|Attendance|
+----------+----------+---------+----------+
|         1|  Santhosh|   Sharma|     89.07|
|         2|    Mahesh|         |     73.89|
|         3|   Yateesh|  Chandra|    92.676|
|         4|   Kranthi|Chanpathi|    87.273|
|         5|  Samdhani|         |      84.3|
+----------+----------+---------+----------+



## Nesting the Schema

In [24]:
# Rows whose second element is itself a (first name, last name) tuple
structure_data = [
    (1, ("Dhilli", ""), 23.40),
    (2, ("Rolex", "Watson"), 85.34),
    (3, ("Amar", ""), 12.43),
    (4, ("Leo", "Das"), 98.23),
    (5, ("Vikram", "Iyer"), 2.09),
]

# Matching schema: the "name" field is a struct with two string fields of its own
structured_schema = StructType([
    StructField(name="Criminal Id", dataType=IntegerType(), nullable=True),
    StructField(
        name="name",
        dataType=StructType([
            StructField(name="first Name", dataType=StringType(), nullable=True),
            StructField(name="last Name", dataType=StringType(), nullable=True),
        ]),
    ),
    StructField(name="Criminal Percent", dataType=FloatType(), nullable=True),
])

In [25]:
# Build a DataFrame from the nested rows and the nested schema
lcu_df = spark.createDataFrame(structure_data, structured_schema)

In [26]:
# The "name" column appears as a struct with its own child fields
lcu_df.printSchema()

root
 |-- Criminal Id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first Name: string (nullable = true)
 |    |-- last Name: string (nullable = true)
 |-- Criminal Percent: float (nullable = true)



In [27]:
# Struct values are rendered inside braces, e.g. {Rolex, Watson}
lcu_df.show(truncate = False)

+-----------+---------------+----------------+
|Criminal Id|name           |Criminal Percent|
+-----------+---------------+----------------+
|1          |{Dhilli, }     |23.4            |
|2          |{Rolex, Watson}|85.34           |
|3          |{Amar, }       |12.43           |
|4          |{Leo, Das}     |98.23           |
|5          |{Vikram, Iyer} |2.09            |
+-----------+---------------+----------------+



## Updating the structure of the dataframe

In [28]:
from pyspark.sql.functions import col, when, struct

# Add a struct column "Other Info" that bundles the existing columns under new
# field names, plus a derived "Eligibility" field.
updated_df = df.withColumn(
    "Other Info",
    struct(
        col("Student Id").alias("id"),
        col("First Name").alias("first_name"),
        # Use the schema's exact spelling ("Last Name") so the lookup does not
        # depend on case-insensitive column resolution.
        col("Last Name").alias("last_name"),
        col("Attendance").alias("percent"),
        # Attendance is cast to an integer before it is compared with 75
        when(col("Attendance").cast(IntegerType()) < 75, "Not Eligible")
        .otherwise("Eligible")
        .alias("Eligibility"),
    ),
)
# The original columns are kept alongside the struct. To leave only the struct,
# chain .drop("Student Id", "First Name", "Last Name", "Attendance") onto the call above.

updated_df.printSchema()
updated_df.show(truncate = False)

root
 |-- Student Id: integer (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Attendance: float (nullable = true)
 |-- Other Info: struct (nullable = false)
 |    |-- id: integer (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |    |-- percent: float (nullable = true)
 |    |-- Eligibility: string (nullable = false)

+----------+----------+---------+----------+-----------------------------------------+
|Student Id|First Name|Last Name|Attendance|Other Info                               |
+----------+----------+---------+----------+-----------------------------------------+
|1         |Santhosh  |Sharma   |89.07     |{1, Santhosh, Sharma, 89.07, Eligible}   |
|2         |Mahesh    |         |73.89     |{2, Mahesh, , 73.89, Not Eligible}       |
|3         |Yateesh   |Chandra  |92.676    |{3, Yateesh, Chandra, 92.676, Eligible}  |
|4         |Kranthi   |Chanpathi|87.2

## Adding the new columns to the DataFrame

In [29]:
# Add a flat "Eligibility" column: "No" when the integer-cast attendance
# is below 75, "Yes" otherwise
updated_df_with_eligibility = df.withColumn(
    "Eligibility",
    when(col("Attendance").cast(IntegerType()) < 75, "No").otherwise("Yes"),
)


In [30]:
# Eligibility shows up as a new non-nullable string column
updated_df_with_eligibility.printSchema()

root
 |-- Student Id: integer (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Attendance: float (nullable = true)
 |-- Eligibility: string (nullable = false)



In [31]:
# Show every row in full, including the new Eligibility column
updated_df_with_eligibility.show(truncate = False)

+----------+----------+---------+----------+-----------+
|Student Id|First Name|Last Name|Attendance|Eligibility|
+----------+----------+---------+----------+-----------+
|1         |Santhosh  |Sharma   |89.07     |Yes        |
|2         |Mahesh    |         |73.89     |No         |
|3         |Yateesh   |Chandra  |92.676    |Yes        |
|4         |Kranthi   |Chanpathi|87.273    |Yes        |
|5         |Samdhani  |         |84.3      |Yes        |
+----------+----------+---------+----------+-----------+



## Using SQL Array and Map Types

In [32]:
# ArrayType and MapType describe list-valued and dictionary-valued columns
from pyspark.sql.types import ArrayType, MapType

# Hobbies is an array of strings; Properties maps string keys to string values
arrayAndMapSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("First Name", StringType()),
        ("Last Name", StringType()),
        ("Attendance", FloatType()),
        ("Hobbies", ArrayType(StringType())),
        ("Properties", MapType(StringType(), StringType())),
    )
])

In [33]:
# When there are many columns, schema.json() gives the whole schema as a single JSON string
print(df.schema.json())

{"fields":[{"metadata":{},"name":"Student Id","nullable":true,"type":"integer"},{"metadata":{},"name":"First Name","nullable":true,"type":"string"},{"metadata":{},"name":"Last Name","nullable":true,"type":"string"},{"metadata":{},"name":"Attendance","nullable":true,"type":"float"}],"type":"struct"}


In [34]:
# simpleString() gives the compact struct<name:type,...> form of the schema
print(df.schema.simpleString())

struct<Student Id:int,First Name:string,Last Name:string,Attendance:float>


# Renaming Columns

In [35]:
# Rename three columns to snake_case; the result replaces df
df = (
    df
    .withColumnRenamed("First Name", "first_name")
    .withColumnRenamed("Last Name", "last_name")
    .withColumnRenamed("Student Id", "student_id")
)

In [36]:
# Each row: ((first, middle, last) name tuple, date of birth, gender, salary)
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

# "name" is a nested struct of three strings; dob and gender are strings, salary is an integer
schema = StructType([
    StructField('name', StructType([
        StructField(part, StringType(), True)
        for part in ('firstname', 'middlename', 'lastname')
    ])),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

data_df = spark.createDataFrame(dataDF, schema)
data_df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [37]:
# toDF(*names) gives a DataFrame with the top-level columns renamed in order;
# the fields nested inside the struct keep their names
newColumns = ["newCol1","newCol2","newCol3","newCol4"]
data_df.toDF(*newColumns).printSchema()

root
 |-- newCol1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- newCol2: string (nullable = true)
 |-- newCol3: string (nullable = true)
 |-- newCol4: integer (nullable = true)



In [38]:
# data_df itself still has its original column names
data_df.columns

['name', 'dob', 'gender', 'salary']

# Column Objects

In [39]:
# lit() wraps a Python literal in a Column object
from pyspark.sql.functions import lit
colObj  = lit("sparkbyexamples.com")

In [40]:
# Print the nested schema of lcu_df again for reference
lcu_df.printSchema()

root
 |-- Criminal Id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first Name: string (nullable = true)
 |    |-- last Name: string (nullable = true)
 |-- Criminal Percent: float (nullable = true)



In [41]:
# Refer to a column as an attribute of the DataFrame (dot notation)
df.select(df.Attendance).show()

+----------+
|Attendance|
+----------+
|     89.07|
|     73.89|
|    92.676|
|    87.273|
|      84.3|
+----------+



In [42]:
# Refer to columns by name with df["column_name"]
df.select(df["first_name"], df["last_name"]).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|  Santhosh|   Sharma|
|    Mahesh|         |
|   Yateesh|  Chandra|
|   Kranthi|Chanpathi|
|  Samdhani|         |
+----------+---------+



In [43]:
# Refer to columns with the col() function, which takes only the column name
from pyspark.sql.functions import col
df.select(col("student_id"),col("first_name")).show()

+----------+----------+
|student_id|first_name|
+----------+----------+
|         1|  Santhosh|
|         2|    Mahesh|
|         3|   Yateesh|
|         4|   Kranthi|
|         5|  Samdhani|
+----------+----------+



In [44]:
# Reach a field nested inside the "name" struct with a dotted path.
# NOTE(review): the schema spells this field "first Name"; the lower-case lookup
# presumably relies on case-insensitive column resolution — confirm.
lcu_df.select(lcu_df["name.first name"]).show()

+----------+
|first name|
+----------+
|    Dhilli|
|     Rolex|
|      Amar|
|       Leo|
|    Vikram|
+----------+



## Creating a DataFrame using Row Function

In [45]:
from pyspark.sql import Row

# Each record has a name and a nested Row of properties (hair, eye)
data = [
    Row(name="James", prop=Row(hair="black", eye="brown")),
    Row(name="Rahul", prop=Row(hair="blue", eye="reddish")),
]

# No schema given: it is taken from the Row objects
df_prop = spark.createDataFrame(data)

In [46]:
# "prop.*" expands the struct into one column per field; show(1) displays only the first row
df_prop.select(col("prop.*")).show(1)

+-----+-----+
| hair|  eye|
+-----+-----+
|black|brown|
+-----+-----+
only showing top 1 row



## Arithmetic and Comparison Operations

In [47]:
# Three integer columns to experiment with
data = [
    (150, 23, 8),
    (180, 43, 5),
    (129, 85, 1),
]
ndf = spark.createDataFrame(data, schema=["col1", "col2", "col3"])

In [48]:
# Show the first row only
ndf.show(1)

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 150|  23|   8|
+----+----+----+
only showing top 1 row



In [49]:
# Arithmetic between columns. alias() names the result column;
# without it the expression text becomes the header.
ndf.select((ndf["col1"] + ndf["col2"]).alias("Sum")).show()
ndf.select(ndf["col1"] - ndf["col2"]).show()
ndf.select(ndf["col1"] * ndf["col3"]).show()
ndf.select(ndf["col1"] / ndf["col3"]).show()
ndf.select(ndf["col1"] % ndf["col3"]).show()
print("----------------------------------------------------------------------")
# Comparisons between columns produce true/false columns
ndf.select(ndf["col1"] < ndf["col3"]).show()
ndf.select(ndf["col1"] > ndf["col3"]).show()
ndf.select(ndf["col1"] <= ndf["col3"]).show()
ndf.select(ndf["col1"] >= ndf["col3"]).show()
ndf.select(ndf["col1"] == ndf["col3"]).show()

+---+
|Sum|
+---+
|173|
|223|
|214|
+---+

+-------------+
|(col1 - col2)|
+-------------+
|          127|
|          137|
|           44|
+-------------+

+-------------+
|(col1 * col3)|
+-------------+
|         1200|
|          900|
|          129|
+-------------+

+-------------+
|(col1 / col3)|
+-------------+
|        18.75|
|         36.0|
|        129.0|
+-------------+

+-------------+
|(col1 % col3)|
+-------------+
|            6|
|            0|
|            0|
+-------------+

----------------------------------------------------------------------
+-------------+
|(col1 < col3)|
+-------------+
|        false|
|        false|
|        false|
+-------------+

+-------------+
|(col1 > col3)|
+-------------+
|         true|
|         true|
|         true|
+-------------+

+--------------+
|(col1 <= col3)|
+--------------+
|         false|
|         false|
|         false|
+--------------+

+--------------+
|(col1 >= col3)|
+--------------+
|          true|
|          true|
|  

## PySpark Column Functions

In [50]:
# substr(start position, length of the substring to return); position 1 is the first character
df.select(col("last_name").substr(1,2)).show()

# startswith checks whether the value begins with the given string
df.select(col("last_name").startswith('C')).show()

# endswith checks whether the value ends with the given string
df.select(col("last_name").endswith('a')).show()

+--------------------------+
|substring(last_name, 1, 2)|
+--------------------------+
|                        Sh|
|                          |
|                        Ch|
|                        Ch|
|                          |
+--------------------------+

+------------------------+
|startswith(last_name, C)|
+------------------------+
|                   false|
|                   false|
|                    true|
|                    true|
|                   false|
+------------------------+

+----------------------+
|endswith(last_name, a)|
+----------------------+
|                  true|
|                 false|
|                  true|
|                 false|
|                 false|
+----------------------+

