In [1]:
import json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrameWriter, DataFrameReader
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Name of the deployment environment. append_to_pg() uses it as the key under
# config["postgres"] when looking up the JDBC URL.
env = "development"

In [3]:
def get_config(path="../config.json"):
    '''Load the project configuration from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to "../config.json", which
            is resolved against the current working directory.

    Returns:
        The decoded JSON document.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if the file does not contain valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    '''
    with open(path, "r") as f:
        # json.load parses straight from the file object; no intermediate
        # string is needed.
        return json.load(f)

In [4]:
def get_spark_conf(config):
    '''Build the SparkConf for this job.

    The application is named 'abb_t' and 'spark.master' is taken from
    config["spark"]["master_url"].
    '''
    # SparkConf setters return the conf itself, so the chain yields the same
    # object that was constructed on the first line.
    return (
        SparkConf()
        .setAppName('abb_t')
        .set('spark.master', config["spark"]["master_url"])
    )

In [5]:
def get_pg_props(config):
    '''Assemble the JDBC connection properties for PostgreSQL.

    The user and password are read from config["postgres"]; the driver is
    always the PostgreSQL JDBC driver class.
    '''
    pg = config["postgres"]
    return dict(
        user=pg["user"],
        password=pg["password"],
        driver="org.postgresql.Driver",
    )

In [6]:
def getdf(sql_context, config, city):
    '''Load one city's listings CSV into a DataFrame.

    The file location is config["abb"][city]["s3"]. The reader is given the
    header='true' option and nothing else: no schema is supplied and no
    filtering happens here.
    '''
    reader = sql_context.read.format('csv')
    reader = reader.options(header='true')
    # The path is looked up only once the reader is configured, matching the
    # evaluation order of a single chained expression.
    source = config["abb"][city]["s3"]
    return reader.load(source)

In [15]:
def append_to_pg(df, config):
    '''Append the rows of df to the PostgreSQL table "abb" over JDBC.

    The JDBC URL is config["postgres"][env]["jdbc"], where env is the
    module-level environment name. Credentials and driver class come from
    get_pg_props(config).
    '''
    jdbc_url = config["postgres"][env]["jdbc"]
    write_options = {
        "table": "abb",
        "mode": "append",
        "properties": get_pg_props(config),
    }
    df.write.jdbc(jdbc_url, **write_options)

In [8]:
# Load the settings and start Spark.
# getOrCreate() hands back the already-running SparkContext when this cell is
# executed again in the same kernel; calling the SparkContext constructor a
# second time raises instead. Note that when an existing context is reused,
# spark_conf is not re-applied to it.
config = get_config()
spark_conf = get_spark_conf(config)
sc = SparkContext.getOrCreate(conf=spark_conf)

In [9]:
# SQL entry point bound to the SparkContext created above. It is passed down
# to getdf(), which uses its .read interface to load the CSV files.
# NOTE(review): SQLContext is, to my knowledge, deprecated since Spark 3.0 in
# favour of SparkSession -- confirm against the Spark version in use.
sql_context = SQLContext(sc)

In [20]:
# Columns, in this order, that get_city_df() keeps in the frame it returns.
selected_columns = [
    "name",
    "city",
    "latitude",
    "longitude",
    "price",
    "number_of_reviews",
]

In [21]:
def get_city_df(sql_context, config, city):
    '''Load one city's listings and shape them for storage.

    Reads the city's CSV through getdf(), casts latitude/longitude to double
    and price/number_of_reviews to integer, sets the "city" column to the
    literal city name, and keeps only the columns in selected_columns.
    '''
    raw = getdf(sql_context, config, city=city)

    casts = (
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("price", IntegerType()),
        ("number_of_reviews", IntegerType()),
    )

    typed = raw
    for column_name, target_type in casts:
        # Each cast reads the column from the frame as loaded, not from the
        # partially converted one.
        typed = typed.withColumn(column_name, raw[column_name].cast(target_type))

    typed = typed.withColumn("city", lit(city))
    return typed[selected_columns]

In [22]:
# Load, cast and store the listings for every city in the list (currently only
# Toronto). Each name must be a key under config["abb"], since getdf() looks
# the CSV location up there.
# NOTE(review): append_to_pg() writes with mode="append", so re-running this
# cell presumably inserts the same rows again -- confirm the target table
# tolerates that.
for city in ["Toronto"]:
    df = get_city_df(sql_context, config, city)
    append_to_pg(df, config)