In [0]:
%sql
-- Catalog that owns every object this notebook creates; skipped when it already exists.
CREATE CATALOG IF NOT EXISTS telecom_catalog_assign

In [0]:
%sql
-- Schema for raw, unprocessed landing data inside the telecom catalog.
CREATE SCHEMA IF NOT EXISTS telecom_catalog_assign.landing_zone

In [0]:
%sql
-- Volume that stores the raw landing files under /Volumes/telecom_catalog_assign/landing_zone/landing_vol.
CREATE VOLUME IF NOT EXISTS telecom_catalog_assign.landing_zone.landing_vol

In [0]:
# Landing folder for the raw customer file.
dbutils.fs.mkdirs(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/"
)

In [0]:
# Landing folder for the raw usage file.
dbutils.fs.mkdirs(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/"
)

In [0]:
# Landing folder for the raw tower log files.
dbutils.fs.mkdirs(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/"
)

In [0]:
# In-memory sample datasets; a later cell writes each one to the landing volume.
# No line carries leading or trailing whitespace: Spark's CSV reader keeps such
# padding by default, so a padded last field (e.g. "20 ") would not cast to an
# integer and string values would carry the extra space.

# Customer records, comma-separated, with NO header row.
# Record 105 has an empty name and record 106 a non-numeric age ("abc"); the
# header / inferSchema notes further down rely on these imperfections.
customer_csv = '''101,Arun,31,Chennai,PREPAID
102,Meera,45,Bangalore,POSTPAID
103,Irfan,29,Hyderabad,PREPAID
104,Raj,52,Mumbai,POSTPAID
105,,27,Delhi,PREPAID
106,Sneha,abc,Pune,PREPAID'''

# Per-customer usage, tab-separated (the \t escapes become real tabs), with a header row.
usage_tsv = '''customer_id\tvoice_mins\tdata_mb\tsms_count
101\t320\t1500\t20
102\t120\t4000\t5
103\t540\t600\t52
104\t45\t200\t2
105\t0\t0\t0'''

# Tower events for region 1, pipe-separated, with a header row.
tower_logs_region1 = '''event_id|customer_id|tower_id|signal_strength|timestamp
5001|101|TWR01|-80|2025-01-10 10:21:54
5004|104|TWR05|-75|2025-01-10 11:01:12'''

# Tower events for region 2; same layout as region 1.
tower_logs_region2 = '''event_id|customer_id|tower_id|signal_strength|timestamp
5002|102|TWR02|-80|2025-01-10 10:21:54
5003|103|TWR03|-75|2025-01-10 11:01:12'''

In [0]:
# Write each in-memory sample to its landing folder, replacing any earlier copy.
# Note: the usage file is tab-separated even though it is saved with a .csv extension.
sample_files = [
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv", customer_csv),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv", usage_tsv),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/tower_logs_region1.csv", tower_logs_region1),
    ("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/tower_logs_region2.csv", tower_logs_region2),
]
for target_path, payload in sample_files:
    dbutils.fs.put(target_path, payload, overwrite=True)

In [0]:
%sh
# List the files currently present in the tower landing folder.
ls "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/"

In [0]:
# Read every tower log under the landing volume: walk sub-folders recursively and
# keep only files whose name matches tower*.csv. Fields are pipe-delimited, the
# first line of each file is a header, and column types are inferred.
df1 = spark.read.csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/",
    sep="|",
    header=True,
    inferSchema=True,
    recursiveFileLookup=True,
    pathGlobFilter="tower*.csv",
)
df1.show()
df1.printSchema()

In [0]:
# Same tower read as above, but from several root folders at once (path accepts a list).
# NOTE(review): /Volumes/lakehouse/default/volume1/ is not created anywhere in the
# visible cells - confirm it exists before a fresh "Run All".
tower_roots = [
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/",
    "/Volumes/lakehouse/default/volume1/",
]
df2 = spark.read.csv(
    path=tower_roots,
    sep="|",
    header=True,
    inferSchema=True,
    recursiveFileLookup=True,
    pathGlobFilter="tower*.csv",
)
df2.show()

In [0]:
# Same tower read expressed with the generic format(...).option(...).load(...) API.
# The option key is spelled recursiveFileLookup, matching the documented name and the
# keyword used by the other tower reads in this notebook.
df3 = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", "|")
    .option("inferSchema", True)
    .option("pathGlobFilter", "tower*.csv")
    .option("recursiveFileLookup", True)
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/")
)
df3.show()

In [0]:
# Customer file read with header=True and inferSchema=True.
# The file has no header row, so this shows the first record being used as column names.
df4 = spark.read.csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv",
    inferSchema=True,
    header=True,
)
df4.show()

In [0]:
# Customer file read again with both header and inferSchema switched off,
# for comparison with the previous cell.
df4 = spark.read.csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv",
    inferSchema=False,
    header=False,
)
df4.show()

In [0]:
# Customer file read through the generic reader, setting one option per .option() call.
df5 = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv")
)
df5.show()

In [0]:
# Customer file read through the generic reader, passing all options in a single .options() call.
customer_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df5 = customer_reader.load(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv"
)
df5.show()

**Write a note on What changed when we use header or inferSchema with true/false?**<br>
- When header=True is specified, Spark treats the first line of the file as the header. Because the customer_csv file has no header row, the first data record (customer 101) is used as the column names and no longer appears in the data. When inferSchema=True is enabled, Spark infers column data types by scanning the full dataset. Because the age column includes a non-numeric value ("abc"), it is inferred as String type.
- When header=False is specified, Spark assigns default column names (_c0, _c1, _c2, _c3, _c4) and treats the first line of the file as a data row. When inferSchema=False is specified, no type inference takes place and every column is read as String type.

**How schema inference handled “abc” in age?**<br>

The age column is inferred as a string data type because one of the records has a non-numeric value ("abc").

In [0]:
# Name the customer columns with toDF(): read the headerless file as plain strings,
# then replace the default _c0.._c4 names positionally.
customer_columns = ["id", "name", "age", "city", "type"]
customer_unnamed = spark.read.csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/customer/customer_csv.csv",
    inferSchema=False,
    header=False,
)
df6 = customer_unnamed.toDF(*customer_columns)
df6.show()


In [0]:
# Apply column names and data types to the usage data with a DDL schema string.
# The DDL uses plain single spaces (no embedded tab characters) between name and type.
schema1 = "customer_id integer, voice_mins integer, data_mb integer, sms_count integer"
df7 = spark.read.schema(schema1).csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/usage/usage_tsv.csv",
    header=True,  # skip the file's header line; column names come from schema1
    sep="\t",
    # Spark keeps trailing whitespace by default; a padded value such as "20 " then
    # fails the integer cast and is read as null. Trim it before casting.
    ignoreTrailingWhiteSpace=True,
)
df7.show()
df7.printSchema()

In [0]:
# Apply column names and data types to the region-1 tower log with an explicit StructType
# (IntegerType / StringType / TimestampType fields).
from pyspark.sql.types import (
    StructType,
    IntegerType,
    StringType,
    TimestampType,
    StructField,
)

# Fields are listed in file order; the third argument is the nullable flag.
Struct_str = (
    StructType()
    .add("id", IntegerType(), True)
    .add("customer_id", IntegerType(), False)
    .add("tower_id", StringType(), False)
    .add("signal_strength", IntegerType(), True)
    .add("timestamp", TimestampType(), True)
)
df8 = spark.read.schema(Struct_str).csv(
    "/Volumes/telecom_catalog_assign/landing_zone/landing_vol/tower/tower_logs_region1.csv",
    header=True,
    sep="|",
)
df8.show()