In [31]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

In [32]:
# Obtain the Spark session for this notebook, registered under the
# application name "Read CSV".
spark = (
    SparkSession.builder
    .appName("Read CSV")
    .getOrCreate()
)

In [33]:
# Relative locations of the two employee CSV files used in this notebook.
path1, path2 = "Data/Employee1.csv", "Data/Employee2.csv"

In [34]:
# Plain read with no options: the header line is not recognised, so it
# comes back as an ordinary data row and the columns get the generated
# names _c0, _c1, ...
df = spark.read.csv(path1)
df.show()

+---+-------+------+------+
|_c0|    _c1|   _c2|   _c3|
+---+-------+------+------+
| id|   name|gender|salary|
|  1| maheer|  male|  1000|
|  2|pradeep|  male|  2000|
+---+-------+------+------+



In [35]:
# Show the column names and types of the option-less read; every column
# is reported as a nullable string.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [36]:
# header=True takes the column names from the first line of the file,
# so that line no longer shows up as a data row.
df2 = spark.read.csv(path1, header=True)
df2.show()

+---+-------+------+------+
| id|   name|gender|salary|
+---+-------+------+------+
|  1| maheer|  male|  1000|
|  2|pradeep|  male|  2000|
+---+-------+------+------+



In [37]:
# The columns now carry the header names, but all of them are still
# typed as string (including id and salary).
df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)



In [38]:
# Same read as the header=True call, spelled with the generic
# format / option / load reader API instead of the csv() shortcut.
df3 = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(path1)
)
df3.show()

+---+-------+------+------+
| id|   name|gender|salary|
+---+-------+------+------+
|  1| maheer|  male|  1000|
|  2|pradeep|  male|  2000|
+---+-------+------+------+



In [40]:
from pyspark.sql.types import *

In [42]:
# Declare the column types explicitly so that id and salary are loaded
# as integers instead of strings.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
    .add("gender", StringType())
    .add("salary", IntegerType())
)

# header=True is still required so the first line of the file is used
# as the header rather than read as a data row.
df4 = spark.read.csv(path1, schema=schema, header=True)
df4.show()

+---+-------+------+------+
| id|   name|gender|salary|
+---+-------+------+------+
|  1| maheer|  male|  1000|
|  2|pradeep|  male|  2000|
+---+-------+------+------+



In [43]:
# Confirm the explicit schema was applied: id and salary are reported as
# integer, name and gender as string.
df4.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

