<h2>Extract information from CSV files stored in Databricks and write it to the Delta file system.</h2>

Picks files up from the Databricks file system storage and writes it out to Delta file locations.

<h3>Payments</h3>

In [None]:
# Extract Payment data from csv file by creating a DataFrame named payment_df

payments_df = (
    spark.read.format("csv")
    .option("inferSchema", "true") #automatically infer the data types of each column
    .option("header", "false") # specifies whether the CSV file has a header row or not. 
    .option("sep", ",") #comma as the delimiter
    .load("dbfs:/FileStore/tables/payments.csv") #loads the CSV file into a DataFrame with the filepath specified
    .toDF("payment_id", "date", "amount", "rider_id") #renames the columns of the DataFrame
)

In [None]:
# Display the Schema of DataFrame created (first 10 rows)
display(payments_df.limit(10))
payments_df.printSchema


payment_id,date,amount,rider_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000
6,2019-10-01,9.0,1000
7,2019-11-01,9.0,1000
8,2019-12-01,9.0,1000
9,2020-01-01,9.0,1000
10,2020-02-01,9.0,1000


In [None]:
# Write data into the Delta location
payments_df.write.format("delta") # write the DataFrame payments_df into Delta Lake format.
                .mode("overwrite") 
                .save("/delta/payments")

<h3>Riders</h3>

In [None]:
# Extract Rider data from csv file by creating a DataFrame named rider_df
riders_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "false")
    .option("sep", ",")
    .load("dbfs:/FileStore/tables/riders.csv")
    .toDF(
        "rider_id",
        "first_name",
        "last_name",
        "address",
        "birthday",
        "account_start_date",
        "account_end_date",
        "is_member",
    )
)

In [None]:
# Display the Schema of DataFrame created (first 10 rows)

display(riders_df.limit(10))
riders_df.printSchema

rider_id,first_name,last_name,address,birthday,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,1974-08-27,2020-03-24,,False
1006,Alicia,Taylor,1137 Angela Locks,2004-01-30,2020-11-27,2021-12-01,True
1007,Benjamin,Fernandez,979 Phillips Ways,1988-01-11,2016-12-11,,False
1008,John,Crawford,7691 Evans Court,1987-02-21,2021-03-28,2021-07-01,True
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,1981-02-07,2020-06-12,2021-11-01,True


Out[16]: <bound method DataFrame.printSchema of DataFrame[rider_id: int, first_name: string, last_name: string, address: string, birthday: date, account_start_date: date, account_end_date: date, is_member: boolean]>

In [None]:
# Write data to delta location
riders_df.write.format("delta")
                .mode("overwrite")
                .save("/delta/riders")

<h3>Stations</h3>

In [None]:
# Extract Extraction data from csv file by creating a DataFrame named extraction_df
stations_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "false")
    .option("sep", ",")
    .load("dbfs:/FileStore/tables/stations.csv")
    .toDF("station_id", "name", "latitude", "longitude")
)

In [None]:
# Display the Schema of DataFrame created (first 10 rows)
display(stations_df.limit(10))
stations_df.printSchema

station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.012701,-87.66605799999999
KA1503000012,Clark St & Lake St,41.88579466666667,-87.63110066666668
637,Wood St & Chicago Ave,41.895634,-87.672069
13216,State St & 33rd St,41.8347335,-87.6258275
18003,Fairbanks St & Superior St,41.89580766666667,-87.62025316666669
KP1705001026,LaSalle Dr & Huron St,41.894877,-87.632326
13253,Lincoln Ave & Waveland Ave,41.948797,-87.675278
KA1503000044,Rush St & Hubbard St,41.890173,-87.62618499999999
KA1504000140,Winchester Ave & Elston Ave,41.92403733333333,-87.67641483333334
TA1305000032,Clinton St & Madison St,41.882242,-87.64106600000001


Out[19]: <bound method DataFrame.printSchema of DataFrame[station_id: string, name: string, latitude: double, longitude: double]>

In [None]:
# Write data to delta location
stations_df.write.format("delta").mode("overwrite").save("/delta/stations")

<h3>Trips</h3>

In [None]:
# Extract trips data from csv file by creating a DataFrame named trips_df
trips_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "false")
    .option("sep", ",")
    .load("dbfs:/FileStore/tables/trips.csv")
    .toDF(
        "trip_id",
        "rideable_type",
        "start_at",
        "ended_at",
        "start_station_id",
        "end_station_id",
        "rider_id",
    )
)

In [None]:
# Display the Schema of DataFrame created (first 10 rows)
display(trips_df.limit(10))
trips_df.printSchema

trip_id,rideable_type,start_at,ended_at,start_station_id,end_station_id,rider_id
89E7AA6C29227EFF,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660,71934
0FEFDE2603568365,classic_bike,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854
E6159D746B2DBB91,electric_bike,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870
B32D3199F1C2E75B,classic_bike,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974
83E463F23575F4BF,electric_bike,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608
BDAA7E3494E8D545,electric_bike,2021-02-24T15:43:33.000+0000,2021-02-24T15:49:05.000+0000,18003,KP1705001026,36267
A772742351171257,classic_bike,2021-02-01T17:47:42.000+0000,2021-02-01T17:48:33.000+0000,KP1705001026,KP1705001026,50104
295476889D9B79F8,classic_bike,2021-02-11T18:33:53.000+0000,2021-02-11T18:35:09.000+0000,18003,18003,19618
362087194BA4CC9A,classic_bike,2021-02-27T15:13:39.000+0000,2021-02-27T15:36:36.000+0000,KP1705001026,KP1705001026,16732
21630F715038CCB0,classic_bike,2021-02-20T08:59:42.000+0000,2021-02-20T09:17:04.000+0000,KP1705001026,KP1705001026,57068


Out[22]: <bound method DataFrame.printSchema of DataFrame[trip_id: string, rideable_type: string, start_at: timestamp, ended_at: timestamp, start_station_id: string, end_station_id: string, rider_id: int]>

In [None]:
# Write data to delta location
trips_df.write.format("delta").mode("overwrite").save("/delta/trips")