PySpark DataFrame

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession for this notebook (or reuse one that already exists),
# registered under the application name 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [11]:
# Read the CSV, using the first row as the header.
# inferSchema=True asks Spark to infer each column's datatype from the data.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Info_1.csv', inferSchema=True)
)
df_pyspark

DataFrame[Name: string, Type: string, Qty: int]

In [12]:
# Print the schema as a tree: column name, datatype and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Qty: integer (nullable = true)



In [14]:
# Another way of reading the data: pass the reader options
# directly to csv() as keyword arguments.
df_pyspark = spark.read.csv(
    'Info_1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+---------+---+
|  Name|     Type|Qty|
+------+---------+---+
| Apple|    Fruit|  2|
| Onion|Vegetable|  3|
|Carrot|Vegetable|  1|
+------+---------+---+



In [15]:
# Print the schema of the re-read DataFrame (column name, datatype, nullability).
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Qty: integer (nullable = true)



In [16]:
# Check the Python type of the loaded object.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
# List the column names of the DataFrame.
df_pyspark.columns

['Name', 'Type', 'Qty']

In [26]:
# Selecting columns with select() (slicing won't work in PySpark).
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would only add a stray "None" line to the output.
df_pyspark.select('Name').show()

# select() gives back a DataFrame, which the printed type confirms.
print(type(df_pyspark.select('Name')))

# Multiple columns: pass a list of column names.
df_pyspark.select(['Name', 'Qty']).show()

+------+
|  Name|
+------+
| Apple|
| Onion|
|Carrot|
+------+

None
<class 'pyspark.sql.dataframe.DataFrame'>
+------+---+
|  Name|Qty|
+------+---+
| Apple|  2|
| Onion|  3|
|Carrot|  1|
+------+---+

None


In [27]:
# Column names paired with their datatypes, as a list of (name, type) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('Type', 'string'), ('Qty', 'int')]

In [34]:
# Summary statistics per column (count, mean, stddev, min, max).
# The result of describe() is displayed by calling show() on it.
df_pyspark.describe().show()

+-------+-----+---------+---+
|summary| Name|     Type|Qty|
+-------+-----+---------+---+
|  count|    3|        3|  3|
|   mean| null|     null|2.0|
| stddev| null|     null|1.0|
|    min|Apple|    Fruit|  1|
|    max|Onion|Vegetable|  3|
+-------+-----+---------+---+



In [37]:
# Add a column to the DataFrame: withColumn(name, expression) returns a
# DataFrame that includes the new column, so the result is assigned back.
df_pyspark = df_pyspark.withColumn(
    'Qty after adding 2',
    df_pyspark.Qty + 2,
)
df_pyspark.show()

+------+---------+---+------------------+
|  Name|     Type|Qty|Qty after adding 2|
+------+---------+---+------------------+
| Apple|    Fruit|  2|                 4|
| Onion|Vegetable|  3|                 5|
|Carrot|Vegetable|  1|                 3|
+------+---------+---+------------------+



In [41]:
# Drop a column: the result of drop() is assigned back to df_pyspark,
# removing the 'Qty after adding 2' column added earlier.
df_pyspark = df_pyspark.drop('Qty after adding 2')
df_pyspark.show()

+------+---------+---+
|  Name|     Type|Qty|
+------+---------+---+
| Apple|    Fruit|  2|
| Onion|Vegetable|  3|
|Carrot|Vegetable|  1|
+------+---------+---+



In [44]:
# Rename a column: withColumnRenamed(existing_name, new_name).
# NOTE: from here on df_pyspark has 'New Name' instead of 'Name', so earlier
# cells that select 'Name' need the data to be re-read before they are re-run.
df_pyspark = df_pyspark.withColumnRenamed(
    'Name',
    'New Name',
)
df_pyspark.show()

+--------+---------+---+
|New Name|     Type|Qty|
+--------+---------+---+
|   Apple|    Fruit|  2|
|   Onion|Vegetable|  3|
|  Carrot|Vegetable|  1|
+--------+---------+---+

