In [1]:
pip install pyspark



In [3]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Spark_Session_1")
    .getOrCreate()
)

# Create dataframes with schema:

It is best to specify the name and data type of each column. Otherwise, Spark has to infer each column's data type from the data and then assign it, which makes Spark less efficient.



In [11]:
# Two sample (name, age) records.
data = [
    ("Tom", 99),
    ("Jerry", 99),
]

# Declare column names and types up front with a DDL-style schema string.
dataDF = spark.createDataFrame(data, schema="name string, age int")

dataDF.show()

+-----+---+
| name|age|
+-----+---+
|  Tom| 99|
|Jerry| 99|
+-----+---+



# StructType & StructField

The StructType and StructField classes can be used to specify the schema of a DataFrame.

These classes are also helpful for creating more complicated columns.


In [7]:
from pyspark.sql.types import *


In [15]:
# Explicit schema: three columns, each declared non-nullable.
schema = StructType(
    [
        StructField("title", StringType(), nullable=False),
        StructField("author", StringType(), nullable=False),
        StructField("pages", IntegerType(), nullable=False),
    ]
)
# Alternative: the same column names and types written as a DDL string:
# schema = "title string, author string, pages int"

# Sample (title, author, pages) records matching the schema above.
data = [
    ("first book", "1st author", 500),
    ("second book", "2nd author", 700),
]

dataDF = spark.createDataFrame(data, schema=schema)

dataDF.show()


+-----------+----------+-----+
|      title|    author|pages|
+-----------+----------+-----+
| first book|1st author|  500|
|second book|2nd author|  700|
+-----------+----------+-----+



See each column's name and data type

In [16]:
# Print the schema as a tree: one line per column with its type and nullability.
dataDF.printSchema()

root
 |-- title: string (nullable = false)
 |-- author: string (nullable = false)
 |-- pages: integer (nullable = false)



Show the data schema

In [17]:
# Evaluate the schema attribute; the notebook echoes the StructType it holds.
dataDF.schema

StructType([StructField('title', StringType(), False), StructField('author', StringType(), False), StructField('pages', IntegerType(), False)])

# Columns

There are different ways to access a column

1. List all columns

2. Use the format: dataframe.column_name

3. Use the column in a mathematical or logical expression

In [18]:
# A notebook cell only echoes its final bare expression, so the intermediate
# results are printed explicitly to make every method's result visible.

# Method 1: list all column names.
print(dataDF.columns)

# Method 2: attribute-style access to a single column (a Column object).
print(dataDF.title)

# Method 3: use a column inside an arithmetic expression.
dataDF.select(dataDF.pages * 2).show()

+-----------+
|(pages * 2)|
+-----------+
|       1000|
|       1400|
+-----------+



# Rows

In [None]:
from pyspark.sql import Row

In [19]:
# Sample records: each Row carries a name and an age as named fields.
row_data = [
    Row(name="Ben", age=99),
    Row(name="Jerry", age=99),
]

# Build the DataFrame directly from the Row objects (no explicit schema is passed).
dataDF = spark.createDataFrame(row_data)

dataDF.show()

+-----+---+
| name|age|
+-----+---+
|  Ben| 99|
|Jerry| 99|
+-----+---+



# Projection and Filter

In [21]:
# select(): project the name column plus a derived column (age incremented by one).
dataDF.select(
    dataDF.name,
    (dataDF.age + 1).alias("age + 1"),
).show()

+-----+-------+
| name|age + 1|
+-----+-------+
|  Ben|    100|
|Jerry|    100|
+-----+-------+



In [22]:
# filter(): keep only the rows whose age is greater than 90
# (where() is an alias of filter()).
dataDF.filter(dataDF.age > 90).show()

+-----+---+
| name|age|
+-----+---+
|  Ben| 99|
|Jerry| 99|
+-----+---+

