# In this lesson we will cover
- PySpark DataFrame
- Reading the dataset
- Checking the data types of the columns (schema)
- Selecting columns and indexing
- Using describe(), similar to pandas
- Adding columns
- Dropping columns
- Renaming columns

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): the variable is named like the `pyspark` package itself; a later
# `import pyspark` would rebind it, so a name such as `spark` would be safer.
pyspark = (
    SparkSession.builder
    .appName("DataFrames")
    .getOrCreate()
)

In [3]:
# Bare expression as the last line of the cell: the notebook displays the session object.
pyspark

In [4]:
# Read the dataset: treat the first row as the header and let Spark infer
# the column types instead of loading every column as a string.
df = (
    pyspark.read
    .option('header', 'true')
    .csv('./Pyspark-With-Python/test1.csv', inferSchema=True)
)

In [5]:
# Check the schema: prints each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# Load the CSV with the reader options passed as keyword arguments to csv():
# the first row supplies the column names and the column types are inferred.
df_spark = pyspark.read.csv(
    './Pyspark-With-Python/test1.csv',
    header='true',
    inferSchema=True,
)
# Print the loaded rows as a text table.
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [7]:
# Print the schema of df_spark: column names, data types and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [9]:
# Confirm that the reader produced a Spark DataFrame (not a pandas DataFrame).
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [11]:
# head(n) returns the first n rows as a Python list of Row objects, not as a DataFrame.
df_spark.head(5)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000),
 Row(Name='Paul', age=24, Experience=3, Salary=20000),
 Row(Name='Harsha', age=21, Experience=1, Salary=15000)]

In [12]:
# Print the DataFrame's rows as a formatted text table.
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [13]:
# Project the DataFrame down to two columns, then fetch the first five rows
# as a list of Row objects.
df_spark.select("Name", "Salary").head(5)

[Row(Name='Krish', Salary=30000),
 Row(Name='Sudhanshu', Salary=25000),
 Row(Name='Sunny', Salary=20000),
 Row(Name='Paul', Salary=20000),
 Row(Name='Harsha', Salary=15000)]

In [14]:
# Indexing by column name yields a Column object (a reference to the column),
# not the column's values; use select(...) with show()/head() to see the data.
df_spark["Name"]

Column<'Name'>

In [15]:
# dtypes lists every column as a (column name, type name) tuple.
df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [17]:
# Summary statistics (count, mean, stddev, min, max) per column, similar to
# pandas' describe(); describe() returns a DataFrame, so show() is needed to print it.
df_spark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [19]:
# Add a derived column holding each person's experience two years from now.
# withColumn returns a new DataFrame, so the result is rebound to df_spark.
df_spark = df_spark.withColumn(
    "Experience after 2 year",
    df_spark["Experience"] + 2,
)

In [20]:
# Print the table again to verify the newly added column.
df_spark.show()

+---------+---+----------+------+-----------------------+
|     Name|age|Experience|Salary|Experience after 2 year|
+---------+---+----------+------+-----------------------+
|    Krish| 31|        10| 30000|                     12|
|Sudhanshu| 30|         8| 25000|                     10|
|    Sunny| 29|         4| 20000|                      6|
|     Paul| 24|         3| 20000|                      5|
|   Harsha| 21|         1| 15000|                      3|
|  Shubham| 23|         2| 18000|                      4|
+---------+---+----------+------+-----------------------+



In [21]:
# Drop the derived column again; the result is assigned back to df_spark.
df_spark = df_spark.drop("Experience after 2 year")

In [22]:
# Print the table to confirm the column was removed.
df_spark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [23]:
# Rename column 'Name' to 'New Name'.
# Keep the assignment and the display separate: show() only prints the table and
# returns None, so chaining it onto the assignment would overwrite df_spark with
# None and lose the DataFrame for any later cell.
df_spark = df_spark.withColumnRenamed('Name', 'New Name')
df_spark.show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

