# Pyspark Dataframe Part 1 : Set Pyspark and Dataframe
We will look into 
- pyspark dataframe
- read the dataset
- check data type of column (schema)
- selecting column and indexing
- summary statistics with `describe()` (similar to pandas)
- add columns
- drop columns


#### Set spark instances

In [3]:
import pyspark 
from pyspark.sql import SparkSession
# Build the Spark session used by every cell below, registered under the
# application name 'test01'. getOrCreate() is expected to hand back an
# already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName('test01')
    .getOrCreate()
)

In [7]:
# Bare expression as the last line of the cell: the notebook displays the
# SparkSession object so the session can be inspected.
spark

#### Set dataset and print schema

In [6]:
# Load customer.csv, using its first line as the column names.
# Column types are not inferred in this read, so every column comes back
# as a string (see the schema printed in the following cell).
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('customer.csv')
)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Zikri| 24|         1|
|Zakhwan| 25|         1|
|   Amir| 27|         2|
|  Ammar| 30|         6|
|  Haziq| 24|         2|
|  Irfan| 25|         1|
|  Fahmi| 27|         5|
+-------+---+----------+



In [9]:
# Check the schema.
# Every column was read as a string; Age and Experience hold whole numbers
# and should be integers instead.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [None]:
# read the dataset again, this time letting Spark infer the column types (inferSchema)

In [10]:
# Header handling is still set through option(); inferSchema=True is passed
# to csv() so that column types are derived from the data rather than left
# as strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('customer.csv', inferSchema=True)
)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Zikri| 24|         1|
|Zakhwan| 25|         1|
|   Amir| 27|         2|
|  Ammar| 30|         6|
|  Haziq| 24|         2|
|  Irfan| 25|         1|
|  Fahmi| 27|         5|
+-------+---+----------+



In [11]:
# With inferSchema the column types now match the data:
#   Name       -> string
#   Age        -> integer
#   Experience -> integer
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [16]:
# Alternative spelling of the same read: header and inferSchema are passed
# straight to csv() as keyword arguments, with no separate option() call.
df_pyspark = spark.read.csv(
    'customer.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Zikri| 24|         1|
|Zakhwan| 25|         1|
|   Amir| 27|         2|
|  Ammar| 30|         6|
|  Haziq| 24|         2|
|  Irfan| 25|         1|
|  Fahmi| 27|         5|
+-------+---+----------+



In [17]:
# Confirm the keyword-argument read produced the same schema:
# Name is a string, Age and Experience are integers.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



#### Basic operation of dataframe

In [18]:
# Check what kind of object the CSV reader returned.
# df_pyspark is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame).

type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [19]:
# List the DataFrame's column names (returned as a plain Python list of strings).

df_pyspark.columns

['Name', 'Age', 'Experience']

In [21]:
# First 3 rows of the DataFrame.
# head(n) returns a list of Row objects rather than printing a table like show().

df_pyspark.head(3)

[Row(Name='Zikri', Age=24, Experience=1),
 Row(Name='Zakhwan', Age=25, Experience=1),
 Row(Name='Amir', Age=27, Experience=2)]

In [22]:
# Last 3 rows of the DataFrame, returned as a list of Row objects.

df_pyspark.tail(3)

[Row(Name='Haziq', Age=24, Experience=2),
 Row(Name='Irfan', Age=25, Experience=1),
 Row(Name='Fahmi', Age=27, Experience=5)]

In [31]:
# Select one column at a time and print it as its own single-column table.

for column_name in ('Name', 'Age'):
    df_pyspark.select(column_name).show()

+-------+
|   Name|
+-------+
|  Zikri|
|Zakhwan|
|   Amir|
|  Ammar|
|  Haziq|
|  Irfan|
|  Fahmi|
+-------+

+---+
|Age|
+---+
| 24|
| 25|
| 27|
| 30|
| 24|
| 25|
| 27|
+---+



In [29]:
# Select several columns in one call by naming each as its own argument.

df_pyspark.select('Name', 'Age').show()

+-------+---+
|   Name|Age|
+-------+---+
|  Zikri| 24|
|Zakhwan| 25|
|   Amir| 27|
|  Ammar| 30|
|  Haziq| 24|
|  Irfan| 25|
|  Fahmi| 27|
+-------+---+



In [33]:
# Data type of every column, as a list of (column name, type name) pairs.

df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [36]:
# Summary statistics per column: count, mean, stddev, min and max.
# For the string column Name, mean and stddev come out as NULL and min/max
# are the alphabetically first and last names.

summary_stats = df_pyspark.describe()
summary_stats.show()

+-------+-----+-----------------+------------------+
|summary| Name|              Age|        Experience|
+-------+-----+-----------------+------------------+
|  count|    7|                7|                 7|
|   mean| NULL|             26.0|2.5714285714285716|
| stddev| NULL|2.160246899469287|2.0701966780270626|
|    min| Amir|               24|                 1|
|    max|Zikri|               30|                 6|
+-------+-----+-----------------+------------------+



In [42]:
# Add a derived column holding each customer's Experience plus 2.
# The result of withColumn() is assigned back to df_pyspark, so later cells
# see the extra column.

experience_plus_two = df_pyspark['Experience'] + 2
df_pyspark = df_pyspark.withColumn('Experience After 2 Years', experience_plus_two)
df_pyspark.show()

+-------+---+----------+------------------------+
|   Name|Age|Experience|Experience After 2 Years|
+-------+---+----------+------------------------+
|  Zikri| 24|         1|                       3|
|Zakhwan| 25|         1|                       3|
|   Amir| 27|         2|                       4|
|  Ammar| 30|         6|                       8|
|  Haziq| 24|         2|                       4|
|  Irfan| 25|         1|                       3|
|  Fahmi| 27|         5|                       7|
+-------+---+----------+------------------------+



In [43]:
# Remove the derived column again, returning to the three original columns.

derived_column = 'Experience After 2 Years'
df_pyspark = df_pyspark.drop(derived_column)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Zikri| 24|         1|
|Zakhwan| 25|         1|
|   Amir| 27|         2|
|  Ammar| 30|         6|
|  Haziq| 24|         2|
|  Irfan| 25|         1|
|  Fahmi| 27|         5|
+-------+---+----------+



In [45]:
# Rename a column of the DataFrame: 'Name' becomes 'Customer Name'.
df_pyspark = df_pyspark.withColumnRenamed(
    'Name',
    'Customer Name',
)
df_pyspark.show()

+-------------+---+----------+
|Customer Name|Age|Experience|
+-------------+---+----------+
|        Zikri| 24|         1|
|      Zakhwan| 25|         1|
|         Amir| 27|         2|
|        Ammar| 30|         6|
|        Haziq| 24|         2|
|        Irfan| 25|         1|
|        Fahmi| 27|         5|
+-------------+---+----------+

