In [0]:
# URL processing (import the submodule explicitly: a bare `import urllib`
# does not guarantee that `urllib.parse` is loaded)
import urllib
import urllib.parse

# pyspark functions
from pyspark.sql.functions import *

# Purpose of this cell: read the AWS credentials CSV from DBFS, look up the
# keys for one user, and mount the S3 bucket under /mnt so later cells can
# read from it. The cell is safe to re-run: an existing mount is left alone.
# NOTE(review): the keys live in a plain CSV on DBFS; consider moving them to
# a Databricks secret scope -- confirm what is available in this workspace.

# Specify file type to be csv
file_type = "csv"
# Indicates file has first row as the header
first_row_is_header = "true"
# Indicates file has comma as the delimiter
delimiter = ","
# Read the CSV file to spark dataframe
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

# User whose credentials are looked up in the CSV
CREDENTIALS_USER = "databricks-user"
# Fetch both key columns with a single collect() instead of one Spark job per key
credential_rows = (
    aws_keys_df
    .where(col("User name") == CREDENTIALS_USER)
    .select("Access key ID", "Secret access key")
    .collect()
)
# Fail with a clear message rather than an IndexError from collect()[0]
if not credential_rows:
    raise ValueError(
        "No credentials found for user '{0}' in the credentials CSV".format(CREDENTIALS_USER)
    )

# Get the AWS access key and secret key from the collected row
ACCESS_KEY = credential_rows[0]["Access key ID"]
SECRET_KEY = credential_rows[0]["Secret access key"]
# URL-encode the secret key; safe="" also encodes "/" so the key can be
# embedded in the source URL below
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# AWS S3 bucket name
AWS_S3_BUCKET = "user-0a8597384a69-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/0a8597384a69-bucket"
# Source url (contains the secret key -- never print or display it)
# NOTE(review): Databricks examples use the s3a:// scheme; s3n is kept here to
# preserve existing behavior -- confirm before switching.
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)

# Mount the drive only if the mount point is not already in use, because
# dbutils.fs.mount raises when the directory is already mounted, which would
# break re-running this cell.
already_mounted = any(
    mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()
)
if already_mounted:
    print("{0} is already mounted; skipping mount.".format(MOUNT_NAME))
else:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)


In [0]:
# Purpose of this cell: load the pin, geo and user topic data from the mounted
# S3 bucket into Spark dataframes and display them.

# File location and type
# "{}" is replaced with the topic suffix ("pin", "geo", "user"); the asterisk (*)
# matches every file with a .json extension in that topic's partition=0 folder
file_location = "/mnt/0a8597384a69-bucket/topics/0a8597384a69.{}/partition=0/*.json"
file_type = "json"
# Ask Spark to infer the schema
# NOTE(review): the JSON reader infers the schema by default and "inferSchema"
# appears to be a CSV-only option -- the option is kept as-is; confirm before
# removing it.
infer_schema = "true"


def read_topic_json(topic):
    """Read every JSON file of one topic from the mounted S3 bucket.

    Args:
        topic: Topic suffix substituted into ``file_location``
            (for example "pin", "geo" or "user").

    Returns:
        The Spark dataframe loaded from the matching JSON files.
    """
    return (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .load(file_location.format(topic))
    )


# Read in JSONs from mounted S3 bucket, one dataframe per topic
df_pin = read_topic_json("pin")
df_geo = read_topic_json("geo")
df_user = read_topic_json("user")
# Display Spark dataframes to check their content
display(df_pin)
display(df_geo)
display(df_user)

age,date_joined,first_name,index,last_name
21,2015-11-10T09:27:42,Andrea,8731,Alexander
24,2016-03-31T20:56:39,Austin,8887,Rodriguez
36,2015-12-20T16:38:13,Michelle,4315,Prince
32,2017-10-10T20:09:33,Christian,10625,Lang
22,2016-02-11T20:46:04,Jennifer,9672,Hudson
32,2016-04-02T03:51:23,Brittany,1313,Jones
34,2016-12-22T00:02:02,Thomas,10794,Turner
20,2016-01-07T19:49:22,David,2959,Griffith
26,2015-12-20T10:28:00,Brendan,9875,Joseph
21,2016-01-03T15:42:12,Annette,2074,Forbes


index,latitude,longitude,timestamp
10794,-89.5236,-154.567,2022-01-01T02:26:50
10625,-84.4944,-81.0613,2018-07-13T11:51:15
7528,-89.9787,-173.293,2020-08-28T03:52:47
2863,-5.34445,-177.924,2020-04-27T13:34:16
5494,-82.6768,-129.202,2021-07-21T02:02:35
5069,-63.0063,-157.474,2021-03-20T09:32:44
2923,-84.6302,-164.507,2019-09-08T22:53:09
3089,-89.9787,-173.293,2018-02-28T05:31:29
6063,-89.1797,-174.015,2021-07-20T09:02:47
3454,-0.375174,49.8106,2021-07-25T02:20:29


age,date_joined,first_name,index,last_name
21,2015-11-10T09:27:42,Andrea,8731,Alexander
24,2016-03-31T20:56:39,Austin,8887,Rodriguez
36,2015-12-20T16:38:13,Michelle,4315,Prince
32,2017-10-10T20:09:33,Christian,10625,Lang
22,2016-02-11T20:46:04,Jennifer,9672,Hudson
32,2016-04-02T03:51:23,Brittany,1313,Jones
34,2016-12-22T00:02:02,Thomas,10794,Turner
20,2016-01-07T19:49:22,David,2959,Griffith
26,2015-12-20T10:28:00,Brendan,9875,Joseph
21,2016-01-03T15:42:12,Annette,2074,Forbes
