#### This tutorial covers:
- PySpark DataFrames
- Reading a dataset
- Checking the datatypes of columns
- Selecting columns and indexing
- Summary statistics with `describe()` (similar to pandas)
- Adding and dropping columns
- Renaming columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook under the app name 'Dataframe'
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Show the SparkSession object as this cell's output
spark

In [22]:
# Read the dataset, configuring the reader through chained options:
#   header      -> take the column names from the first line of the file
#   inferSchema -> let PySpark infer the column types instead of reading everything
#                  as strings (here the 2nd and 3rd columns are inferred as integers)
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test_data_tutorial2.csv')
)

In [8]:
# Print the schema as a tree: one line per column with its name, datatype and nullability
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)



In [9]:
# The same read expressed as a single csv() call: header handling and schema
# inference are passed as keyword arguments instead of separate reader options
df_pyspark = spark.read.csv(
    'test_data_tutorial2.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-----+---+---+
| name|age|exp|
+-----+---+---+
|  bob| 23|  7|
|saget| 26|  9|
|james| 21|  5|
+-----+---+---+



In [10]:
# Confirm the schema of the DataFrame loaded with the single csv() call
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- exp: integer (nullable = true)



In [11]:
# Inspect the Python class of the object returned by spark.read.csv (a PySpark DataFrame)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# Print the list of column names, then the first two rows (a list of Row objects),
# each on its own line
print(df_pyspark.columns, df_pyspark.head(2), sep='\n')

['name', 'age', 'exp']
[Row(name='bob', age=23, exp=7), Row(name='saget', age=26, exp=9)]


In [17]:
# Select a subset of columns; select() yields a DataFrame, so .show() can print its rows
df_pyspark.select(['name', 'exp']).show()

# Indexing by column name does not display any data: the expression is valid,
# but it evaluates to a Column object, so the cell output is only its repr
df_pyspark['name']

+-----+---+
| name|exp|
+-----+---+
|  bob|  7|
|saget|  9|
|james|  5|
+-----+---+



Column<'name'>

In [18]:
# List every column as a (name, datatype) pair
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('exp', 'int')]

In [19]:
# Summary statistics per column (count, mean, stddev, min, max), printed as a table;
# mean and stddev are NULL for the string column
df_pyspark.describe().show()

+-------+-----+------------------+---+
|summary| name|               age|exp|
+-------+-----+------------------+---+
|  count|    3|                 3|  3|
|   mean| NULL|23.333333333333332|7.0|
| stddev| NULL| 2.516611478423583|2.0|
|    min|  bob|                21|  5|
|    max|saget|                26|  9|
+-------+-----+------------------+---+



In [23]:
# Add a column: 'exp after two years' is computed from the existing 'exp' column plus 2
df_pyspark = df_pyspark.withColumn(
    'exp after two years',
    df_pyspark['exp'] + 2,
)
df_pyspark.show()

+-----+---+---+-------------------+
| name|age|exp|exp after two years|
+-----+---+---+-------------------+
|  bob| 23|  7|                  9|
|saget| 26|  9|                 11|
|james| 21|  5|                  7|
+-----+---+---+-------------------+



In [24]:
# Drop the 'exp after two years' column; the result of drop() is reassigned to df_pyspark
df_pyspark=df_pyspark.drop('exp after two years')
df_pyspark.show()

+-----+---+---+
| name|age|exp|
+-----+---+---+
|  bob| 23|  7|
|saget| 26|  9|
|james| 21|  5|
+-----+---+---+



In [25]:
# Rename 'name' to 'Name' for display only: the renamed result is not assigned,
# so df_pyspark itself keeps the original column name
df_pyspark.withColumnRenamed('name', 'Name').show()

+-----+---+---+
| Name|age|exp|
+-----+---+---+
|  bob| 23|  7|
|saget| 26|  9|
|james| 21|  5|
+-----+---+---+

