In [0]:
import os
from urllib.parse import quote_plus

from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [0]:
# Create (or reuse) the notebook's Spark session.
# `spark.jars.packages` is a Spark launch-time property: it is only resolved while
# the session is being started, so the hadoop-aws connector needed for S3 access is
# requested here on the builder. When a session already exists (for example on a
# managed cluster), `getOrCreate()` returns it and the connector has to be
# installed on that cluster instead.
spark = (
    SparkSession.builder
    .appName("day6")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1")
    .getOrCreate()
)

# Add configuration for accessing S3

In [0]:
# AWS credentials are taken from the environment so that they are never saved in
# the notebook. Both default to an empty string when the variable is unset.
aws_access_key = os.environ.get('AWS_ACCESS_KEY_ID', '')
aws_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', '')

hadoop_conf = spark._jsc.hadoopConfiguration()

# Keys written directly to the Hadoop configuration carry no `spark.hadoop.` prefix;
# that prefix is only stripped when a property is supplied through SparkConf.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# `spark.jars.packages` used to be set here, where it has no effect: it is a Spark
# launch-time property, not a Hadoop one. The hadoop-aws jar has to be on the
# classpath before this cell runs (e.g. via `--packages` or the session builder).

# Only override the connector's credentials when both values were supplied.
if aws_access_key and aws_secret_key:
    hadoop_conf.set("fs.s3a.access.key", aws_access_key)
    hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)

# Data Pre-processing:
### For simplicity, read files from S3 and join them to create an aggregate for this example.

In [0]:
# Explicit schema for the business-registration file: five nullable string columns.
schema = StructType([
    StructField(column, StringType(), True)
    for column in ("zip", "business", "street", "city", "state")
])

In [0]:
# Load the tab-separated business registry using the explicit `schema`.
# The `s3a://` scheme routes the read through the S3A connector that the
# `fs.s3a.*` settings configure; stock Apache Hadoop 3.x ships no filesystem
# for a bare `s3://` scheme.
business = spark.read.schema(schema).csv(
    "s3a://usfca-msan694/SF_business/filtered_registered_business_sf.tsv",
    sep="\t",
)

In [0]:
# Preview the business table (first 20 rows, long cell values truncated).
business.show(n=20, truncate=True)

In [0]:
# Two nullable string columns: `zip` and `supervisor`.
# A dedicated name is used so this cell no longer overwrites `schema`, which the
# business-loading cell reads; re-running that cell after this one would otherwise
# parse the business file with this two-column layout.
supervisor_schema = StructType([
    StructField("zip", StringType(), True),
    StructField("supervisor", StringType(), True),
])

# Read through the S3A connector (`s3a://`) so the `fs.s3a.*` settings apply.
supervisor = spark.read.schema(supervisor_schema).csv(
    "s3a://usfca-msan694/SF_business/supervisor_sf.tsv",
    sep="\t",
)
supervisor.show()

In [0]:
# Left join on the shared `zip` column: every business row is kept, and rows whose
# zip has no match in `supervisor` get a null supervisor.
joined_df = business.join(other=supervisor, on="zip", how="left")

In [0]:
# Preview the joined result (first 20 rows, long cell values truncated).
joined_df.show(n=20, truncate=True)

# Connect to MongoDB
## Store aggregates in the database and re-read for machine learning later

In [0]:
def build_mongo_uri(user_name, password, address, database, collection):
    """Return the `mongodb+srv://` URI for `database.collection` on `address`.

    The user name and password are percent-encoded so that reserved characters
    (`@`, `:`, `/`, `%`, ...) in either value cannot corrupt the URI.
    """
    credentials = f"{quote_plus(user_name)}:{quote_plus(password)}"
    return f"mongodb+srv://{credentials}@{address}/{database}.{collection}"


database = 'msds697'
collection = 'business'
# Credentials are read from the environment so they are not stored in the notebook;
# both fall back to an empty string when the variable is unset.
user_name = os.environ.get('MONGO_USERNAME', '')
password = os.environ.get('MONGO_PASSWORD', '')
address = 'msds697-project.qh1ug.mongodb.net'
connection_string = build_mongo_uri(user_name, password, address, database, collection)

In [0]:
# Confirm the target without echoing the user name and password into the saved
# notebook output: everything between "://" and the last "@" is masked.
"{}://***@{}".format(
    connection_string.partition("://")[0],
    connection_string.rpartition("@")[2],
)

In [0]:
# Write the joined rows to the MongoDB collection named in `connection_string`.
# NOTE: "append" mode adds to whatever the collection already holds, so every
# re-run of this cell inserts the rows again.
(
    joined_df.write
    .format("mongo")
    .mode("append")
    .option("uri", connection_string)
    .save()
)

In [0]:
# Read the collection back from MongoDB through the same URI used for the write.
df = (
    spark.read
    .format("mongo")
    .option("uri", connection_string)
    .load()
)

In [0]:
# Preview what was loaded from MongoDB (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)