# PySpark DataFrame Part 0: Overview of PySpark

In [1]:
import pyspark 
import pandas as pd

In [2]:
# Load the customer CSV with pandas; the bare `df` on the last line makes the notebook render the frame
df = pd.read_csv('customer.csv')
df

Unnamed: 0,Name,Age
0,Zikri,24
1,Zakhwan,25
2,Amir,27
3,Ammar,24
4,Haziq,24
5,Irfan,25
6,Fahmi,25


#### Start a Spark session

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain a SparkSession through the builder API.
# 'test01' is the application name registered for this session;
# getOrCreate() returns the already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('test01')
    .getOrCreate()
)

#### Spark version and session description

In [5]:
# Evaluating the session object as the last expression makes the notebook display its summary
spark

#### Read the dataset with Spark

In [5]:
# Read the CSV with Spark's default reader options (no header option is set here,
# so the first line of the file is kept as an ordinary data row)
df_pyspark = spark.read.csv('customer.csv')

In [6]:
# there are two columns with auto-generated names, _c0 and _c1, both typed as string
df_pyspark

DataFrame[_c0: string, _c1: string]

In [7]:
# show() prints the rows as a text table;
# note that the header line (Name, Age) appears as the first data row
df_pyspark.show()

+-------+---+
|    _c0|_c1|
+-------+---+
|   Name|Age|
|  Zikri| 24|
|Zakhwan| 25|
|   Amir| 27|
|  Ammar| 24|
|  Haziq| 24|
|  Irfan| 25|
|  Fahmi| 25|
+-------+---+



In [8]:
# Enable the 'header' option so the file's first line supplies the column
# names (Name, Age) instead of being read as data.
(
    spark.read
    .option('header', 'true')
    .csv('customer.csv')
)

DataFrame[Name: string, Age: string]

In [9]:
# Header-aware read of the CSV, printed straight away as a text table.
(
    spark.read
    .option('header', 'true')
    .csv('customer.csv')
    .show()
)

+-------+---+
|   Name|Age|
+-------+---+
|  Zikri| 24|
|Zakhwan| 25|
|   Amir| 27|
|  Ammar| 24|
|  Haziq| 24|
|  Irfan| 25|
|  Fahmi| 25|
+-------+---+



In [10]:
# Keep the header-aware DataFrame under a name so the following cells can reuse it.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('customer.csv')
)

In [11]:
# the variable is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame), not a pandas DataFrame
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# print the header-aware DataFrame; Name and Age are now column names rather than a data row
df_pyspark.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Zikri| 24|
|Zakhwan| 25|
|   Amir| 27|
|  Ammar| 24|
|  Haziq| 24|
|  Irfan| 25|
|  Fahmi| 25|
+-------+---+



In [15]:
# head() is available as in pandas, but called without an argument
# it returns the first record as a single Row object, not a DataFrame
df_pyspark.head()

Row(Name='Zikri', Age='24')

In [16]:
# printSchema() prints each column's name, data type and nullability
# both Name and Age come back as string (only the header option was set when reading)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)

