**Working with JSON file**

In [0]:
flight_raw_df = spark.read\
    .format("json")\
    .load("/Volumes/dev/multi_datasets/spark_data/flight-time.json")
display(flight_raw_df.limit(10))

ARR_TIME,CANCELLED,CRS_ARR_TIME,CRS_DEP_TIME,DEP_TIME,DEST,DEST_CITY_NAME,DISTANCE,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,TAXI_IN,WHEELS_ON
1348,0,1400,1115,1113,ATL,"Atlanta, GA",946,1/1/2000,DL,1451,BOS,"Boston, MA",5,1343
1543,0,1559,1315,1311,ATL,"Atlanta, GA",946,1/1/2000,DL,1479,BOS,"Boston, MA",7,1536
1651,0,1721,1415,1414,ATL,"Atlanta, GA",946,1/1/2000,DL,1857,BOS,"Boston, MA",9,1642
2005,0,2013,1715,1720,ATL,"Atlanta, GA",946,1/1/2000,DL,1997,BOS,"Boston, MA",10,1955
2240,0,2300,2015,2010,ATL,"Atlanta, GA",946,1/1/2000,DL,2065,BOS,"Boston, MA",10,2230
1003,0,955,650,649,ATL,"Atlanta, GA",946,1/1/2000,US,2619,BOS,"Boston, MA",7,956
1717,0,1738,1440,1446,ATL,"Atlanta, GA",946,1/1/2000,US,2621,BOS,"Boston, MA",4,1713
2006,0,2008,1740,1744,ATL,"Atlanta, GA",449,1/1/2000,DL,346,BTR,"Baton Rouge, LA",9,1957
1601,0,1622,1345,1345,ATL,"Atlanta, GA",449,1/1/2000,DL,412,BTR,"Baton Rouge, LA",9,1552
1448,0,1455,1245,1245,ATL,"Atlanta, GA",712,1/1/2000,DL,299,BUF,"Buffalo, NY",5,1443


In [0]:
flight_raw_df.printSchema()

root
 |-- ARR_TIME: long (nullable = true)
 |-- CANCELLED: long (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- DEP_TIME: long (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DISTANCE: long (nullable = true)
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- TAXI_IN: long (nullable = true)
 |-- WHEELS_ON: long (nullable = true)



**Automatic inferring schema is often incorrect. As in this case FL_DATE column is string but in real it is date, Cancelled column is boolean but here it is read as long. SO now we will enforce our own schema**

In [0]:
flight_schema = """
    FL_DATE date,
    OP_CARRIER string,
    OP_CARRIER_FL_NUM long,
    ORIGIN string,
    ORIGIN_CITY_NAME string,
    DEST string,
    DEST_CITY_NAME string,
    DEP_TIME long,
    ARR_TIME long,
    CRS_DEP_TIME long,
    CRS_ARR_TIME long,
    DISTANCE long,
    CANCELLED boolean,
    TAXI_IN long,
    WHEELS_ON long
"""

In [0]:
flight_raw_df1 = spark.read\
    .format("json")\
    .schema(flight_schema)\
    .load("/Volumes/dev/multi_datasets/spark_data/flight-time.json")
display(flight_raw_df1.limit(10))
flight_raw_df1.printSchema()

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,ARR_TIME,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,CANCELLED,TAXI_IN,WHEELS_ON
,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1113,1348,1115,1400,946,,5,1343
,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1311,1543,1315,1559,946,,7,1536
,DL,1857,BOS,"Boston, MA",ATL,"Atlanta, GA",1414,1651,1415,1721,946,,9,1642
,DL,1997,BOS,"Boston, MA",ATL,"Atlanta, GA",1720,2005,1715,2013,946,,10,1955
,DL,2065,BOS,"Boston, MA",ATL,"Atlanta, GA",2010,2240,2015,2300,946,,10,2230
,US,2619,BOS,"Boston, MA",ATL,"Atlanta, GA",649,1003,650,955,946,,7,956
,US,2621,BOS,"Boston, MA",ATL,"Atlanta, GA",1446,1717,1440,1738,946,,4,1713
,DL,346,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1744,2006,1740,2008,449,,9,1957
,DL,412,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1345,1601,1345,1622,449,,9,1552
,DL,299,BUF,"Buffalo, NY",ATL,"Atlanta, GA",1245,1448,1245,1455,712,,5,1443


root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: long (nullable = true)
 |-- ARR_TIME: long (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- DISTANCE: long (nullable = true)
 |-- CANCELLED: boolean (nullable = true)
 |-- TAXI_IN: long (nullable = true)
 |-- WHEELS_ON: long (nullable = true)



**The two columns FL_DATE and CANCELLED column are showing null values now as the dataframe reader could not load the data correctly. Hence the date, timestamp and the boolean values are often loaded incorrectly or they become null.**

**Json file has Flight date in MM/DD/YYYY format but default date format is YYYY-MM-DD format. JSON file has boolean as 0 & 1 but default value is TRUE & FALSE**

**So we will set a error message for this using dataframe reader mode option- FAILFAST**

In [0]:
flight_raw_df1 = spark.read\
    .format("json")\
    .schema(flight_schema)\
    .option("mode", "FAILFAST")\
    .load("/Volumes/dev/multi_datasets/spark_data/flight-time.json")
display(flight_raw_df1.limit(10))
flight_raw_df1.printSchema()

[0;31m---------------------------------------------------------------------------[0m
[0;31mSparkException[0m                            Traceback (most recent call last)
File [0;32m<command-4855986274339412>, line 6[0m
[1;32m      1[0m flight_raw_df1 [38;5;241m=[39m spark[38;5;241m.[39mread\
[1;32m      2[0m     [38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mjson[39m[38;5;124m"[39m)\
[1;32m      3[0m     [38;5;241m.[39mschema(flight_schema)\
[1;32m      4[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mmode[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mFAILFAST[39m[38;5;124m"[39m)\
[1;32m      5[0m     [38;5;241m.[39mload([38;5;124m"[39m[38;5;124m/Volumes/dev/multi_datasets/spark_data/flight-time.json[39m[38;5;124m"[39m)
[0;32m----> 6[0m display(flight_raw_df1[38;5;241m.[39mlimit([38;5;241m10[39m))
[1;32m      7[0m flight_raw_df1[38;5;241m.[39mprintSchema()

File [0;32m/databricks/python_shell/lib/dbruntime/display.py:

**To process the data we will set option as PERMISSIVE and proceed**

In [0]:
flight_raw_df1 = spark.read\
    .format("json")\
    .schema(flight_schema)\
    .option("mode", "PERMISSIVE")\
    .load("/Volumes/dev/multi_datasets/spark_data/flight-time.json")
display(flight_raw_df1.limit(10))
flight_raw_df1.printSchema()

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,ARR_TIME,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,CANCELLED,TAXI_IN,WHEELS_ON
,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1113,1348,1115,1400,946,,5,1343
,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1311,1543,1315,1559,946,,7,1536
,DL,1857,BOS,"Boston, MA",ATL,"Atlanta, GA",1414,1651,1415,1721,946,,9,1642
,DL,1997,BOS,"Boston, MA",ATL,"Atlanta, GA",1720,2005,1715,2013,946,,10,1955
,DL,2065,BOS,"Boston, MA",ATL,"Atlanta, GA",2010,2240,2015,2300,946,,10,2230
,US,2619,BOS,"Boston, MA",ATL,"Atlanta, GA",649,1003,650,955,946,,7,956
,US,2621,BOS,"Boston, MA",ATL,"Atlanta, GA",1446,1717,1440,1738,946,,4,1713
,DL,346,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1744,2006,1740,2008,449,,9,1957
,DL,412,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1345,1601,1345,1622,449,,9,1552
,DL,299,BUF,"Buffalo, NY",ATL,"Atlanta, GA",1245,1448,1245,1455,712,,5,1443


root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: long (nullable = true)
 |-- ARR_TIME: long (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- DISTANCE: long (nullable = true)
 |-- CANCELLED: boolean (nullable = true)
 |-- TAXI_IN: long (nullable = true)
 |-- WHEELS_ON: long (nullable = true)



**How to solve this issue--- First read FL_DATE and CANCELLED as string and then will change it later after reading**

In [0]:
flight_schema = """
    FL_DATE string,
    OP_CARRIER string,
    OP_CARRIER_FL_NUM long,
    ORIGIN string,
    ORIGIN_CITY_NAME string,
    DEST string,
    DEST_CITY_NAME string,
    DEP_TIME long,
    ARR_TIME long,
    CRS_DEP_TIME long,
    CRS_ARR_TIME long,
    DISTANCE long,
    CANCELLED string,
    TAXI_IN long,
    WHEELS_ON long
"""

In [0]:
flight_raw_df2 = spark.read\
    .format("json")\
    .schema(flight_schema)\
    .option("mode", "PERMISSIVE")\
    .load("/Volumes/dev/multi_datasets/spark_data/flight-time.json")
display(flight_raw_df2.limit(10))
flight_raw_df2.printSchema()

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,ARR_TIME,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,CANCELLED,TAXI_IN,WHEELS_ON
1/1/2000,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1113,1348,1115,1400,946,0,5,1343
1/1/2000,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1311,1543,1315,1559,946,0,7,1536
1/1/2000,DL,1857,BOS,"Boston, MA",ATL,"Atlanta, GA",1414,1651,1415,1721,946,0,9,1642
1/1/2000,DL,1997,BOS,"Boston, MA",ATL,"Atlanta, GA",1720,2005,1715,2013,946,0,10,1955
1/1/2000,DL,2065,BOS,"Boston, MA",ATL,"Atlanta, GA",2010,2240,2015,2300,946,0,10,2230
1/1/2000,US,2619,BOS,"Boston, MA",ATL,"Atlanta, GA",649,1003,650,955,946,0,7,956
1/1/2000,US,2621,BOS,"Boston, MA",ATL,"Atlanta, GA",1446,1717,1440,1738,946,0,4,1713
1/1/2000,DL,346,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1744,2006,1740,2008,449,0,9,1957
1/1/2000,DL,412,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1345,1601,1345,1622,449,0,9,1552
1/1/2000,DL,299,BUF,"Buffalo, NY",ATL,"Atlanta, GA",1245,1448,1245,1455,712,0,5,1443


root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: long (nullable = true)
 |-- ARR_TIME: long (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- DISTANCE: long (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- TAXI_IN: long (nullable = true)
 |-- WHEELS_ON: long (nullable = true)



In [0]:
from pyspark.sql.functions import to_date, expr
flight_raw_df3 = flight_raw_df2\
                    .withColumn("FL_DATE", to_date("FL_DATE", "M/d/y"))\
                    .withColumn("CANCELLED", expr("if(CANCELLED == '1', true, false)"))

In [0]:
display(flight_raw_df3.limit(10))
flight_raw_df3.printSchema()

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,ARR_TIME,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,CANCELLED,TAXI_IN,WHEELS_ON
2000-01-01,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1113,1348,1115,1400,946,False,5,1343
2000-01-01,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1311,1543,1315,1559,946,False,7,1536
2000-01-01,DL,1857,BOS,"Boston, MA",ATL,"Atlanta, GA",1414,1651,1415,1721,946,False,9,1642
2000-01-01,DL,1997,BOS,"Boston, MA",ATL,"Atlanta, GA",1720,2005,1715,2013,946,False,10,1955
2000-01-01,DL,2065,BOS,"Boston, MA",ATL,"Atlanta, GA",2010,2240,2015,2300,946,False,10,2230
2000-01-01,US,2619,BOS,"Boston, MA",ATL,"Atlanta, GA",649,1003,650,955,946,False,7,956
2000-01-01,US,2621,BOS,"Boston, MA",ATL,"Atlanta, GA",1446,1717,1440,1738,946,False,4,1713
2000-01-01,DL,346,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1744,2006,1740,2008,449,False,9,1957
2000-01-01,DL,412,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1345,1601,1345,1622,449,False,9,1552
2000-01-01,DL,299,BUF,"Buffalo, NY",ATL,"Atlanta, GA",1245,1448,1245,1455,712,False,5,1443


root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: long (nullable = true)
 |-- ARR_TIME: long (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- DISTANCE: long (nullable = true)
 |-- CANCELLED: boolean (nullable = false)
 |-- TAXI_IN: long (nullable = true)
 |-- WHEELS_ON: long (nullable = true)



**Congrats we solved the issue**

**Now the FLDATE is date and loaded correctly also CANCELLED is boolean with TRUE & FALSE value**

In [0]:
flight_raw_df3.write\
    .format("delta")\
    .mode("overwrite")\
    .saveAsTable("flight_raw")

In [0]:
%sql
select * from flight_raw limit 10

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,ARR_TIME,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE,CANCELLED,TAXI_IN,WHEELS_ON
2000-01-01,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1113,1348,1115,1400,946,False,5,1343
2000-01-01,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1311,1543,1315,1559,946,False,7,1536
2000-01-01,DL,1857,BOS,"Boston, MA",ATL,"Atlanta, GA",1414,1651,1415,1721,946,False,9,1642
2000-01-01,DL,1997,BOS,"Boston, MA",ATL,"Atlanta, GA",1720,2005,1715,2013,946,False,10,1955
2000-01-01,DL,2065,BOS,"Boston, MA",ATL,"Atlanta, GA",2010,2240,2015,2300,946,False,10,2230
2000-01-01,US,2619,BOS,"Boston, MA",ATL,"Atlanta, GA",649,1003,650,955,946,False,7,956
2000-01-01,US,2621,BOS,"Boston, MA",ATL,"Atlanta, GA",1446,1717,1440,1738,946,False,4,1713
2000-01-01,DL,346,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1744,2006,1740,2008,449,False,9,1957
2000-01-01,DL,412,BTR,"Baton Rouge, LA",ATL,"Atlanta, GA",1345,1601,1345,1622,449,False,9,1552
2000-01-01,DL,299,BUF,"Buffalo, NY",ATL,"Atlanta, GA",1245,1448,1245,1455,712,False,5,1443


In [0]:
spark.catalog.listTables()

[Table(name='flight_raw', catalog='workspace', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False)]

In [0]:
%sql
describe extended flight_raw

col_name,data_type,comment
FL_DATE,date,
OP_CARRIER,string,
OP_CARRIER_FL_NUM,bigint,
ORIGIN,string,
ORIGIN_CITY_NAME,string,
DEST,string,
DEST_CITY_NAME,string,
DEP_TIME,bigint,
ARR_TIME,bigint,
CRS_DEP_TIME,bigint,


**Distinct destination city**

In [0]:
dist_city = flight_raw_df3.where("DEST_CITY_NAME is not null")\
                         .select(expr("DEST_CITY_NAME as distinct_destination_city"))\
                         .distinct()
display(dist_city.limit(10))

distinct_destination_city
"Newark, NJ"
"Fort Wayne, IN"
"Dayton, OH"
"Columbia, SC"
"Boston, MA"
"Denver, CO"
"Helena, MT"
"Chattanooga, TN"
"Harrisburg, PA"
"San Antonio, TX"


**What were the most common destinations**

In [0]:
common_dest = flight_raw_df3.select("DEST_CITY_NAME")\
                            .where("DEST_CITY_NAME is not null")\
                            .groupBy("DEST_CITY_NAME")\
                            .count()\
                            .orderBy("count", ascending=False)\
                            .limit(10)
display(common_dest)

DEST_CITY_NAME,count
"Chicago, IL",18409
"Atlanta, GA",14492
"Dallas/Fort Worth, TX",13246
"Los Angeles, CA",11151
"Houston, TX",10092
"Phoenix, AZ",9914
"St. Louis, MO",9180
"Washington, DC",8520
"Detroit, MI",8171
"New York, NY",7803


**MAGIC COMMANDS**

In [0]:
%sh
ls /Volumes


core_bronze_dev
core_silver_dev
demo_catalog
dev
samples
system
workspace


In [0]:
%python
spark


<pyspark.sql.connect.session.SparkSession at 0xffde23392570>

In [0]:
%sql
SHOW CATALOGS;


catalog
core_bronze_dev
core_silver_dev
demo_catalog
dev
samples
system
workspace


**detailed list of files**

In [0]:
%sh
ls -lh


total 124K
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 14_days-AI_Challenge
-rwxrwxrwx 1 root root 2.1K Aug 27 06:01 ch2_Assignment-Solutions-1.ipynb
-rwxrwxrwx 1 root root 4.1K Aug 27 08:37 ch2_Assignment-Solutions-2.ipynb
-rwxrwxrwx 1 root root 6.9K Aug 27 08:38 ch2_Assignment-Solutions-3.ipynb
-rwxrwxrwx 1 root root  16K Aug 27 09:48 ch2_Assignment-Solutions-4-5.ipynb
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 data
-rwxrwxrwx 1 root root  23K Jan 11 13:33 Day_2_challenge.ipynb
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 Drafts
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 Mastering PySpark
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 my_pipeline
-rwxrwxrwx 1 root root  18K Aug 29 09:35 mypracticenotebook.ipynb
-rwxrwxrwx 1 root root 2.4K Dec  8 18:44 New Query 2025-12-08 11:37pm.dbquery.ipynb
-rwxrwxrwx 1 root root 4.3K Dec 16 09:07 PII_notebook.ipynb
drwxrwxrwx 2 root root 4.0K Jan 11 13:33 Preethi
-rwxrwxrwx 1 root root  832 Jan 11 13:17 Python Practice.ipynb
-rwxrwxrwx 1 root root 7.9K Oct 11 16:

**change directory**

In [0]:
%sh
cd /Volumes/workspace/ecommerce/ecommerce_data


bash: line 1: cd: /Volumes/workspace/ecommerce/ecommerce_data: No such file or directory


In [0]:
%sh
mkdir test_folder    # creating directory


In [0]:
%sh
mkdir -p data/raw/2024    # creating nested directory


In [0]:
%sh
rm -r test_folder    # deleting directory recursively


In [0]:
%sh
head -5 /Volumes/dev/multi_datasets/spark_data/firedept.csv


"Call Number","Unit ID","Incident Number","Call Type","Call Date","Watch Date","Received DtTm","Entry DtTm","Dispatch DtTm","Response DtTm","On Scene DtTm","Transport DtTm","Hospital DtTm","Call Final Disposition","Available DtTm","Address","City","Zipcode of Incident","Battalion","Station Area","Box","Original Priority","Priority","Final Priority","ALS Unit","Call Type Group","Number of Alarms","Unit Type","Unit sequence in call dispatch","Fire Prevention District","Supervisor District","Neighborhooods - Analysis Boundaries","RowID","case_location","data_as_of","data_loaded_at"
"160943727","53","16037460","Medical Incident","04/03/2016","04/03/2016","2016 Apr 03 11:15:12 PM","2016 Apr 03 11:18:05 PM","2016 Apr 03 11:18:33 PM","2016 Apr 03 11:18:45 PM","2016 Apr 03 11:35:10 PM","2016 Apr 03 11:46:08 PM","2016 Apr 04 12:11:46 AM","Code 2 Transport","2016 Apr 04 12:47:29 AM","POLK ST/CEDAR ST","San Francisco","94109","B04","03","3121","2","2","2","true","Non Life-threatening","1","MEDIC"

In [0]:
%sh
grep "Call Number" /Volumes/dev/multi_datasets/spark_data/firedept.csv



"Call Number","Unit ID","Incident Number","Call Type","Call Date","Watch Date","Received DtTm","Entry DtTm","Dispatch DtTm","Response DtTm","On Scene DtTm","Transport DtTm","Hospital DtTm","Call Final Disposition","Available DtTm","Address","City","Zipcode of Incident","Battalion","Station Area","Box","Original Priority","Priority","Final Priority","ALS Unit","Call Type Group","Number of Alarms","Unit Type","Unit sequence in call dispatch","Fire Prevention District","Supervisor District","Neighborhooods - Analysis Boundaries","RowID","case_location","data_as_of","data_loaded_at"


In [0]:
%sh
cp /Volumes/dev/multi_datasets/spark_data/firedept.csv \
   /Volumes/core_bronze_dev/my_bronze_schema/my_volume/
