## Create the folder and View the files in folder

In [0]:

dbutils.fs.mkdirs("dbfs:/FileStore/baby_names")

Out[9]: True

In [0]:
# dbutils.fs.mv("dbfs:/FileStore/baby_names1.csv", "dbfs:/FileStore/baby_names")
# dbutils.fs.mv("dbfs:/FileStore/baby_names2.csv", "dbfs:/FileStore/baby_names")
# dbutils.fs.mv("dbfs:/FileStore/baby_names3.csv", "dbfs:/FileStore/baby_names")

Out[12]: True

In [0]:
# View the files in folder

dbutils.fs.ls("dbfs:/FileStore/baby_names")

Out[14]: [FileInfo(path='dbfs:/FileStore/baby_names/baby_names1.csv', name='baby_names1.csv', size=1159851, modificationTime=1723283473000),
 FileInfo(path='dbfs:/FileStore/baby_names/baby_names2.csv', name='baby_names2.csv', size=1881206, modificationTime=1723283489000),
 FileInfo(path='dbfs:/FileStore/baby_names/baby_names3.csv', name='baby_names3.csv', size=2291890, modificationTime=1723283490000)]

## Read csv file -- option

In [0]:
df = spark.read.format("csv").option("header" , False).option("sep", ",").option("inferSchema", True).load("dbfs:/FileStore/baby_names/baby_names1.csv")

df.display(10)
# print(df.count())
 

_c0,_c1,_c2,_c3,_c4
year,first_name,country,sex,count
2017,Name94,South Korea,Female,4727
2021,Name489,India,Male,4473
2016,Name409,United States,Male,5155
2015,Name765,Argentina,Male,904
2017,Name846,South Korea,Female,1711
2010,Name161,United States,Male,1938
2011,Name843,Mexico,Male,4503
2015,Name741,Australia,Male,2027
2012,Name980,Argentina,Female,103


## Another Way reading csv file --"Options"

In [0]:


df = spark.read.format("csv").options(header="True",sep=",",inferSchema="True").load("dbfs:/FileStore/baby_names/baby_names1.csv")

df.display(10)
# print(df.count())

year,first_name,country,sex,count
2017,Name94,South Korea,Female,4727
2021,Name489,India,Male,4473
2016,Name409,United States,Male,5155
2015,Name765,Argentina,Male,904
2017,Name846,South Korea,Female,1711
2010,Name161,United States,Male,1938
2011,Name843,Mexico,Male,4503
2015,Name741,Australia,Male,2027
2012,Name980,Argentina,Female,103
2017,Name270,Mexico,Male,1345


## Read multiple csv files -- use [<file1 , file2>]

In [0]:
"""
for multiple file format use square bracket and give files in that  [<file1 , file2>]

"""
df1 = spark.read.format("csv") \
      .option("header" , True) \
      .option("inferSchame" , True) \
      .option("sep" , ",") \
      .load(["dbfs:/FileStore/baby_names/baby_names1.csv" , "dbfs:/FileStore/baby_names/baby_names2.csv"])

In [0]:
print(df.count())
print(df1.count())

33670
88205


## Read files in folder -- give folder name in load

In [0]:
df2 = spark.read.format("csv") \
      .option("header" , True) \
      .option("inferSchame" , True) \
      .option("sep" , ",") \
      .load(["dbfs:/FileStore/baby_names/"])

print(df2.count())

154645


In [0]:
df2.printSchema()

root
 |-- year: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- count: string (nullable = true)



## Create your own schema

In [0]:
from pyspark.sql.types import StructField , StructType , IntegerType , StringType

Defined_Schama = StructType([
          StructField("year" ,IntegerType() , True ),
          StructField("first_name" ,StringType() , True ),
          StructField("country" ,StringType() , True ),
          StructField("sex" ,StringType() , True ),
          StructField("count" ,IntegerType() , True )
])

In [0]:
df4 = spark.read.format("csv").option("header" , True).option("sep" , ',').schema(Defined_Schama).load("dbfs:/FileStore/baby_names/")

In [0]:
df4.printSchema()

root
 |-- year: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- count: integer (nullable = true)




InferSchema vs. Explicitly Giving Schema

When loading data into a Spark DataFrame, you have two primary options to define the schema:

InferSchema
Automatically determines the schema: Spark analyzes the data to infer the data types of each column.
Convenient for initial exploration: Useful when you're unsure about the data structure.
Performance overhead: Spark needs to scan the entire dataset twice: once for schema inference and once for data loading.

Explicitly Giving Schema
Manually define the schema: You provide the column names and data types.
Better performance: Avoids the extra pass over the data for schema inference, leading to faster loading.
Data quality control: Ensures data consistency and accuracy by enforcing the defined schema.
Requires prior knowledge: You need to know the data structure beforehand.

When to Use Which

Use InferSchema:
When you're uncertain about the data structure.
For small datasets where performance is not critical.
During exploratory data analysis.

Use Explicit Schema:
When you know the data structure beforehand.
For large datasets where performance is crucial.
In production environments to ensure data quality and consistency.



## Schame altername - String schema

In [0]:
schema_alternate =  "year STRING,first_name STRING,country STRING,sex STRING,count INTEGER"

In [0]:
df5 = spark.read.format("csv").option("header" , True).option("sep" , ',').schema(schema_alternate).load("dbfs:/FileStore/baby_names/")

df5.printSchema()

root
 |-- year: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- count: integer (nullable = true)

