# Here we are going to cover following things

* Reading dataset in csv file
* Checking the schema of dataset(data type for each columns)
* Selecting columns
* Get basic statistics of dataset
* Adding columns
* Dropping columns
* Renaming columns

In [2]:
import pyspark
from pyspark.sql import SparkSession 

In [5]:
spark_session = SparkSession.builder.appName("pyspark_dataframe_part_01").getOrCreate()
spark_session

In [65]:
# there are multiple ways to read csv files
# inferSchema is used to check field data types(If it is false then all data types consider as string)
# method 01
pyspark_df1 = (spark_session
               .read.csv("file:///mnt/92D26AE0D26AC7D5/Python/pyspark/ararental.csv",
                         header=True,
                         inferSchema=True))
pyspark_df1

DataFrame[Vendor_ID: int, Vendor_Name: string, Address: string, City: string, Phone Number: string, Fax: string]

In [66]:
# method 02
pyspark_df2 = (spark_session
    .read.option("header", True)
    .csv("file:///mnt/92D26AE0D26AC7D5/Python/pyspark/ararental.csv", inferSchema=True))
pyspark_df2

DataFrame[Vendor_ID: int, Vendor_Name: string, Address: string, City: string, Phone Number: string, Fax: string]

In [67]:
# we can also use printSchema method to print data type for each field
pyspark_df1.printSchema()

root
 |-- Vendor_ID: integer (nullable = true)
 |-- Vendor_Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Phone Number: string (nullable = true)
 |-- Fax: string (nullable = true)



In [68]:
# show() method is used to print dataframe data
pyspark_df1.show()

+---------+--------------------+--------------------+--------------+--------------+--------------+
|Vendor_ID|         Vendor_Name|             Address|          City|  Phone Number|           Fax|
+---------+--------------------+--------------------+--------------+--------------+--------------+
|   185227|American Rental S...|     100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|
|    16054|        Attema Sales|       117 E 13th St|         Pella|952  -913-2229|641  -628-4983|
|   195852|         B & S Sales|       218 Maquan St|        Hanson|781  -760-4590|          null|
|    35600|    Beal Enterprises|         PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|
|      761|          Billy Goat|1803 SW Jefferson St|   Lees Summit|816  -524-9666|816  -524-6983|
+---------+--------------------+--------------------+--------------+--------------+--------------+



In [69]:
# pyspark dataframe has 'columns' that we can get its' columns as a list
pyspark_df1.columns

['Vendor_ID', 'Vendor_Name', 'Address', 'City', 'Phone Number', 'Fax']

In [70]:
# head() method gives list of records
pyspark_df1.head(3)

[Row(Vendor_ID=185227, Vendor_Name='American Rental Sales Consultants', Address='100 Grantley Ct', City='Sandy Springs', Phone Number='714  -580-4291', Fax='800  -714-7422'),
 Row(Vendor_ID=16054, Vendor_Name='Attema Sales', Address='117 E 13th St', City='Pella', Phone Number='952  -913-2229', Fax='641  -628-4983'),
 Row(Vendor_ID=195852, Vendor_Name='B & S Sales', Address='218 Maquan St', City='Hanson', Phone Number='781  -760-4590', Fax=None)]

In [71]:
# we can use select() method to select specific column or multiple columns
selected_columns = pyspark_df1.select("Vendor_ID", "Vendor_Name")
selected_columns

DataFrame[Vendor_ID: int, Vendor_Name: string]

In [72]:
# here we can see, selected column also is a pyspark dataframe
type(selected_columns)

pyspark.sql.dataframe.DataFrame

In [73]:
# with combining select() with show() we can print data in selected column
pyspark_df1.select("Vendor_ID", "Vendor_Name").show()

+---------+--------------------+
|Vendor_ID|         Vendor_Name|
+---------+--------------------+
|   185227|American Rental S...|
|    16054|        Attema Sales|
|   195852|         B & S Sales|
|    35600|    Beal Enterprises|
|      761|          Billy Goat|
+---------+--------------------+



In [74]:
# dtypes can be used to get data type for each field like in schema
pyspark_df1.dtypes

[('Vendor_ID', 'int'),
 ('Vendor_Name', 'string'),
 ('Address', 'string'),
 ('City', 'string'),
 ('Phone Number', 'string'),
 ('Fax', 'string')]

In [75]:
# with describe we can get statistical information for column
pyspark_df1.describe().show()

+-------+---------------+--------------------+---------------+--------------+--------------+--------------+
|summary|      Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|
+-------+---------------+--------------------+---------------+--------------+--------------+--------------+
|  count|              5|                   5|              5|             5|             5|             4|
|   mean|        86698.8|                null|           null|          null|          null|          null|
| stddev|95667.794819887|                null|           null|          null|          null|          null|
|    min|            761|American Rental S...|100 Grantley Ct|        Hanson|626  -367-9157|641  -628-4983|
|    max|         195852|          Billy Goat|    PO Box 3374|South Pasadena|952  -913-2229|818  -276-8409|
+-------+---------------+--------------------+---------------+--------------+--------------+--------------+



In [104]:
# with column can be used to add new column to the existing dataframe
updated_pyspark_df = pyspark_df1.withColumn("New_Vendor_Name", pyspark_df1["Vendor_Name"]+"Up")
updated_pyspark_df.show()

+---------+--------------------+--------------------+--------------+--------------+--------------+---------------+
|Vendor_ID|         Vendor_Name|             Address|          City|  Phone Number|           Fax|New_Vendor_Name|
+---------+--------------------+--------------------+--------------+--------------+--------------+---------------+
|   185227|American Rental S...|     100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|           null|
|    16054|        Attema Sales|       117 E 13th St|         Pella|952  -913-2229|641  -628-4983|           null|
|   195852|         B & S Sales|       218 Maquan St|        Hanson|781  -760-4590|          null|           null|
|    35600|    Beal Enterprises|         PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|           null|
|      761|          Billy Goat|1803 SW Jefferson St|   Lees Summit|816  -524-9666|816  -524-6983|           null|
+---------+--------------------+--------------------+--------------+------------

In [105]:
# drop() method is used to drop given column
droped_pyspark_df = updated_pyspark_df.drop("New_Vendor_Name")
droped_pyspark_df.show()

+---------+--------------------+--------------------+--------------+--------------+--------------+
|Vendor_ID|         Vendor_Name|             Address|          City|  Phone Number|           Fax|
+---------+--------------------+--------------------+--------------+--------------+--------------+
|   185227|American Rental S...|     100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|
|    16054|        Attema Sales|       117 E 13th St|         Pella|952  -913-2229|641  -628-4983|
|   195852|         B & S Sales|       218 Maquan St|        Hanson|781  -760-4590|          null|
|    35600|    Beal Enterprises|         PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|
|      761|          Billy Goat|1803 SW Jefferson St|   Lees Summit|816  -524-9666|816  -524-6983|
+---------+--------------------+--------------------+--------------+--------------+--------------+



In [109]:
# withColumnRenamed is used to rename given column
renamed_pyspark_df = droped_pyspark_df.withColumnRenamed("Phone Number", "Phone_Number")
renamed_pyspark_df.show()

+---------+--------------------+--------------------+--------------+--------------+--------------+
|Vendor_ID|         Vendor_Name|             Address|          City|  Phone_Number|           Fax|
+---------+--------------------+--------------------+--------------+--------------+--------------+
|   185227|American Rental S...|     100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|
|    16054|        Attema Sales|       117 E 13th St|         Pella|952  -913-2229|641  -628-4983|
|   195852|         B & S Sales|       218 Maquan St|        Hanson|781  -760-4590|          null|
|    35600|    Beal Enterprises|         PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|
|      761|          Billy Goat|1803 SW Jefferson St|   Lees Summit|816  -524-9666|816  -524-6983|
+---------+--------------------+--------------------+--------------+--------------+--------------+

