In [1]:
import json
import tempfile
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrameWriter, DataFrameReader
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
def get_config():
    with open("secret.json", "r") as f:
        jsonstr = f.read()
        conf = json.loads(jsonstr)
        return conf

def get_spark_conf(config):
    '''set config'''
    conf = SparkConf()
    conf.setAppName('yelp')
    conf.set('spark.master', config["spark"]["master_url"])
    return conf

def get_pg_props(config):
    '''set psql properties'''
    props = {
        "user": config["postgres"]["user"],
        "password": config["postgres"]["password"],
        "driver": "org.postgresql.Driver",
    }
    return props

In [3]:
def getdf(sql_context, config):
    '''filter yelp dataset'''
    yelp_business = sql_context.read.json(config["s3"]["yelpurl"])
    yelp_business_f = yelp_business[['name', 'latitude', 'longitude',
                                    'stars', 'review_count', 'address',
                                     'city', 'state','categories']]
    yelp_business_f.printSchema()
    return yelp_business_f

In [4]:
def write_to_pg(yelp_business_f, config):
    '''write to psql'''
    url = "jdbc:postgresql://10.0.0.14/xyn"
    table = 'y_business'
    props = get_pg_props(config)
    yelp_business_f.write.jdbc(url=url, table=table, mode='overwrite', properties=props)

In [5]:
config = get_config()
spark_conf = get_spark_conf(config)
sc = SparkContext(conf=spark_conf)

In [6]:
sql_context = SQLContext(sc)
yelp_business_f = getdf(sql_context, config)
write_to_pg(yelp_business_f, config)

root
 |-- name: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- categories: string (nullable = true)



In [7]:
toronto_mexican= yelp_business_f.where("categories like '%Mexican%'").where("city like '%Toronto%'").filter(yelp_business_f.stars>3.5)

In [8]:
toronto_mexican.show(n=5, vertical = True)

-RECORD 0----------------------------
 name         | Burrito Boyz         
 latitude     | 43.7729924508        
 longitude    | -79.4140518612       
 stars        | 4.0                  
 review_count | 121                  
 address      | 5314 Yonge Street    
 city         | Toronto              
 state        | ON                   
 categories   | Mexican, Restaurants 
-RECORD 1----------------------------
 name         | MEXITACO             
 latitude     | 43.7088142           
 longitude    | -79.2959772          
 stars        | 4.0                  
 review_count | 65                   
 address      | 1109 Victoria Par... 
 city         | Toronto              
 state        | ON                   
 categories   | Mexican, Nightlif... 
-RECORD 2----------------------------
 name         | Huevos Gourmet       
 latitude     | 43.6012391           
 longitude    | -79.5037742          
 stars        | 4.5                  
 review_count | 149                  
 address    

In [9]:
toronto_mexican.count()

76

In [10]:
def write_to_pg1(toronto_mexican, config):
    '''write to psql'''
    url = "jdbc:postgresql://10.0.0.14/xyn"
    table = 'toronto_mexican'
    props = get_pg_props(config)
    toronto_mexican.write.jdbc(url=url, table=table, mode='overwrite', properties=props)

In [11]:
write_to_pg1(toronto_mexican, config)

In [12]:
toronto= yelp_business_f.where("city like '%Toronto%'")

In [13]:
toronto.count()

18244

In [14]:
def getdf(sql_context, config): 
    '''filter abb dataset'''
    toronto_listings = sql_context.read.format('com.databricks.spark.csv').options(header='true').load(config["s3"]["abb_toronto_listings"])
    toronto_listings_f = toronto_listings['id', 'latitude', 'longitude',
                                    'price', 'number_of_reviews']
    return toronto_listings_f

In [15]:
toronto_listings_f = getdf(sql_context, config)

In [16]:
toronto_listings_f.show(n=5, vertical = True)

-RECORD 0-------------------------------
 id                | 1419               
 latitude          | 43.646167661556824 
 longitude         | -79.42451012783086 
 price             | 469                
 number_of_reviews | 7                  
-RECORD 1-------------------------------
 id                | 8077               
 latitude          | 43.64105126982716  
 longitude         | -79.37627700577787 
 price             | 100                
 number_of_reviews | 170                
-RECORD 2-------------------------------
 id                | 10314              
 latitude          | 43.670185503701774 
 longitude         | -79.33584783585275 
 price             | 69                 
 number_of_reviews | 77                 
-RECORD 3-------------------------------
 id                | 12604              
 latitude          | 43.667240686103376 
 longitude         | -79.41597756109105 
 price             | 67                 
 number_of_reviews | 0                  
-RECORD 4-------

In [17]:
def dist(long_x, lat_x, long_y, lat_y):
    return acos(
        sin(toRadians(lat_x)) * sin(toRadians(lat_y)) + 
        cos(toRadians(lat_x)) * cos(toRadians(lat_y)) * 
            cos(toRadians(long_x) - toRadians(long_y))
    ) * lit(6371.0)

In [21]:
d1 = dist(toronto_mexican.longitude, toronto_mexican.latitude, toronto_listings_f.longitude, toronto_listings_f.latitude)

In [22]:
type(d1)

pyspark.sql.column.Column

In [23]:
d1.show(n=5)

TypeError: 'Column' object is not callable