In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

# We will cover
* PySpark DataFrame
* Reading a dataset
* Checking the data types of the columns (schema)
* Selecting columns and indexing
* Summarizing data with `describe()`, similar to pandas
* Adding columns
* Dropping columns

In [2]:
# Build the session for this notebook under the app name 'Dataframe';
# getOrCreate() hands back an existing session if there is one.
session_builder = SparkSession.builder.appName('Dataframe')
spark = session_builder.getOrCreate()

In [3]:
# Echo the session object as the cell's last expression so the notebook displays it.
spark

In [8]:
# Load the CSV, treating its first line as the column names.
# No schema is requested here, so column types are left to the reader's default.
header_reader = spark.read.option('header', 'true')
df_pyspark = header_reader.csv('data/test1.csv')

In [9]:
# Print the DataFrame contents as a text table.
df_pyspark.show()

+---------+---+----------+
|     name|age|experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [10]:
# Check the schema: the file was read without schema inference,
# so every column (including age and experience) is reported as a string.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- experience: string (nullable = true)



In [11]:
# Re-read the file with schema inference so numeric columns get numeric types.
# Both reader settings are passed as csv() keywords rather than mixing
# .option(...) with a keyword argument, so the configuration is in one place.
df_pyspark = spark.read.csv('data/test1.csv', header=True, inferSchema=True)

In [12]:
# With inferSchema enabled, age and experience are now typed as integers.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [13]:
# Confirm the object is a Spark DataFrame (pyspark.sql), not a pandas one.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# Column names, returned as a plain Python list of strings.
df_pyspark.columns

['name', 'age', 'experience']

In [15]:
# head(n) gives back a plain Python list of Row objects, not a DataFrame
first_rows = df_pyspark.head(3)
first_rows

[Row(name='Krish', age=31, experience=10),
 Row(name='Sudhanshu', age=30, experience=8),
 Row(name='Sunny', age=29, experience=4)]

In [17]:
# select() returns a new DataFrame, so the result can be shown directly.
# The schema spells the column 'name'; 'Name' still resolves here, and the
# displayed header echoes the spelling used in the call.
name_only = df_pyspark.select('Name')
name_only.show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
+---------+



In [18]:
# Pick several columns at once by passing each name as its own argument
subset_columns = df_pyspark.select('Name', 'experience')
subset_columns.show()

+---------+----------+
|     Name|experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
+---------+----------+



In [19]:
# Get the data type of each column as a list of (column name, type name) tuples
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [20]:
# Summary statistics, analogous to pandas' describe().
# Unlike pandas, string columns are included as well: they get count/min/max,
# while mean and stddev come back as NULL for them.
summary_stats = df_pyspark.describe()
summary_stats.show()

+-------+-----+----+-----------------+
|summary| name| age|       experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



In [23]:
# Add a derived column; withColumn() returns a new DataFrame, bound here to new_df.
# The source column is referenced by its exact schema name ('experience') so the
# lookup does not depend on Spark resolving column names case-insensitively.
new_df = df_pyspark.withColumn('Experience After 2 years', df_pyspark['experience'] + 2)

In [24]:
# Display the result, which now carries the extra 'Experience After 2 years' column.
new_df.show()

+---------+---+----------+------------------------+
|     name|age|experience|Experience After 2 years|
+---------+---+----------+------------------------+
|    Krish| 31|        10|                      12|
|Sudhanshu| 30|         8|                      10|
|    Sunny| 29|         4|                       6|
+---------+---+----------+------------------------+



In [25]:
# Remove the derived column again; drop() returns a new DataFrame,
# which is re-bound to new_df.
derived_column = 'Experience After 2 years'
new_df = new_df.drop(derived_column)

In [26]:
# Display the result after the drop: only name, age and experience remain.
new_df.show()

+---------+---+----------+
|     name|age|experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [None]:
# rename the columns
df_pyspark.withColumnRenamed('Name', 'New Name').show(