In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrameWriter, DataFrameReader
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [18]:
from sqlalchemy import Table, Column, Integer, Float, String, MetaData, create_engine
from geoalchemy2 import Geography
from sqlalchemy.sql import select
from sqlalchemy import func

In [4]:
# Environment name; selects the config["postgres"][env] section that supplies the JDBC and SQLAlchemy URLs.
env = "development"

In [5]:
def get_config(path="../config.json"):
    '''Load the project configuration from a JSON file.

    Args:
        path: Location of the JSON config file. Defaults to "../config.json",
            resolved relative to the current working directory.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    '''
    with open(path, "r") as f:
        return json.load(f)

In [6]:
def get_spark_conf(config):
    '''Build the SparkConf for this notebook.

    The application name is fixed to 'abb_t'; the master URL is taken from
    config["spark"]["master_url"].
    '''
    # SparkConf setters return the conf itself, so they can be chained.
    return SparkConf().setAppName('abb_t').set('spark.master', config["spark"]["master_url"])

In [7]:
def get_pg_props(config):
    '''Assemble the JDBC connection properties for PostgreSQL.

    The user and password come from config["postgres"]; the driver class is
    always org.postgresql.Driver.
    '''
    pg = config["postgres"]
    return dict(
        user=pg["user"],
        password=pg["password"],
        driver="org.postgresql.Driver",
    )

In [8]:
# Read the JSON configuration once, derive the Spark settings from it,
# and start the SparkContext with those settings.
config = get_config()
spark_conf = get_spark_conf(config)
sc = SparkContext(conf=spark_conf)

In [9]:
# SQL entry point bound to the SparkContext; used below to read the 'abb' table over JDBC.
sql_context = SQLContext(sc)

In [10]:
# Analysis parameters: `city` filters the listings DataFrame below;
# `category` names the business category of interest.
city = "Toronto"
category = "Restaurants"

In [11]:
# Load the 'abb' table from PostgreSQL over JDBC into a Spark DataFrame.
abb_df = sql_context.read.jdbc(
    url=config["postgres"][env]["jdbc"],
    table='abb',
    properties=get_pg_props(config),
)

In [12]:
# Keep only the listings whose city column equals the selected city.
abb_df = abb_df.filter(abb_df.city == city)

In [13]:
# Shared SQLAlchemy MetaData registry that every Table defined in this notebook is attached to.
metadata = MetaData()

In [14]:
# SQLAlchemy definition of the 'yelp' table.
yelp_table = Table(
    'yelp', metadata,
    Column('id', Integer, primary_key=True),
    Column('name', String),
    Column('latitude', Float),
    Column('longitude', Float),
    Column('stars', Float),
    Column('review_count', Integer),
    Column('address', String),
    Column('city', String),
    Column('state', String),
)

In [15]:
# SQLAlchemy definition of the 'categories' lookup table (id -> name).
categories_table = Table(
    'categories', metadata,
    Column('id', Integer, primary_key=True),
    Column('name', String),
)

In [16]:
# SQLAlchemy definition of the 'yelp2category' link table between yelp.id and categories.id.
# NOTE(review): category_id is declared String while categories.id is Integer, and the two
# are compared in a join below; confirm the actual column type in the database.
yelp2category_table = Table(
    'yelp2category', metadata,
    Column('yelp_id', Integer),
    Column('category_id', String),
)

In [17]:
# SQLAlchemy definition of the 'yelp_geo' table: one geography POINT (SRID 4326) per yelp_id.
yelp_geo_table = Table(
    'yelp_geo', metadata,
    Column('yelp_id', Integer),
    Column('location', Geography(geometry_type='POINT', srid=4326)),
)

In [81]:
# Maximum ST_Distance between a listing and a business for the business to count.
# The compared column is a Geography, for which PostGIS reports distances in meters.
distance_threshold = 500

In [29]:
# SQLAlchemy engine for the PostgreSQL URL of the selected environment.
engine = create_engine(config["postgres"][env]["url"])

  """)


In [30]:
# Single connection shared by the queries and inserts below; the visible cells never close it.
conn = engine.connect()

In [91]:
# Score weight per doubled star rating, i.e. the key is int(stars * 2):
# 5.0 stars -> 1000, 4.5 -> 100, 4.0 -> 10, 3.5 -> 1.
score_dict = {
    10: 1000,
    9: 100,
    8: 10,
    7: 1,
}

In [93]:
# NOTE(review): neither compute_score nor result is defined at module level in the visible
# cells (compute_score only exists nested inside compute_listing_score), so this cell
# depends on earlier kernel state and will not run on a fresh kernel.
compute_score(result)

11038

In [103]:
def compute_listing_score(listing, city, category):
    '''Score one listing by the well-rated Yelp businesses close to it.

    Selects the star ratings of every business that is in `city`, belongs to
    `category`, has more than 3 stars and lies within the module-level
    `distance_threshold` of the listing, then adds up the weight that the
    module-level `score_dict` assigns to each doubled star rating.

    Args:
        listing: object exposing `longitude` and `latitude` attributes
            (rows collected from abb_df in this notebook).
        city: value matched against yelp.city.
        category: value matched against categories.name.

    Returns:
        The accumulated score; 0 when the listing has no longitude or latitude.
    '''
    def compute_score(selection_result):
        # Accumulate explicitly rather than with sum(): the wildcard import from
        # pyspark.sql.functions may shadow builtins of that name.
        score = 0
        for row in selection_result:
            # Ratings come in half-star steps, so doubling gives an integer key.
            score += score_dict.get(int(row.stars * 2), 0)
        return score

    # A listing without coordinates cannot be matched spatially.
    if listing.longitude is None or listing.latitude is None:
        return 0

    # WKT point for the listing; sent to the database as a bound parameter.
    listing_point = f'POINT({listing.longitude} {listing.latitude})'

    joined = (
        yelp_table
        .join(yelp2category_table, yelp_table.c.id == yelp2category_table.c.yelp_id)
        .join(categories_table, yelp2category_table.c.category_id == categories_table.c.id)
        .join(yelp_geo_table, yelp_geo_table.c.yelp_id == yelp_table.c.id)
    )
    # NOTE(review): ST_DWithin(point, location, distance_threshold) expresses the same
    # radius test and can use a spatial index; consider switching after verifying it.
    selection = (
        select([yelp_table.c.stars])
        .select_from(joined)
        .where(yelp_table.c.city == city)
        .where(categories_table.c.name == category)
        .where(func.ST_Distance(listing_point, yelp_geo_table.c.location) <= distance_threshold)
        .where(yelp_table.c.stars > 3)
    )

    return compute_score(conn.execute(selection))

In [125]:
def write_score_table(listings, city, category, debug=False):
    '''Score every listing and insert the results into the abb_<category> table.

    Args:
        listings: iterable of listing rows exposing `id`, `longitude` and `latitude`.
        city: city forwarded to compute_listing_score.
        category: category forwarded to compute_listing_score; its lower-cased
            form is the suffix of the target table name.
        debug: when True, print a progress line after every 1000 listings.

    This function only issues the INSERT through the module-level `conn`;
    it does not create the target table.
    '''
    def prepare_score_rows():
        rows = []
        for progress, listing in enumerate(listings, start=1):
            # compute_listing_score requires city and category as well as the listing.
            score = compute_listing_score(listing, city, category)
            rows.append({"abb_id": listing.id, "score": score})
            if debug and progress % 1000 == 0:
                print("progress:", progress)
        return rows

    rows = prepare_score_rows()
    table_name = f"abb_{category.lower()}"
    # keep_existing=True lets the function be called again for the same category
    # without redefining the Table already registered in `metadata`.
    score_table = Table(table_name, metadata,
                        Column('abb_id', Integer, primary_key=True),
                        Column('score', Integer),
                        keep_existing=True)
    # Executing an INSERT with an empty parameter list is not a no-op,
    # so skip the statement when there is nothing to write.
    if rows:
        conn.execute(score_table.insert(), rows)

In [128]:
# Use the notebook-level parameters so the scoring always matches the city abb_df was filtered on.
write_score_table(abb_df.collect(), city, category, debug=True)

progress: 1000
progress: 2000
progress: 3000
progress: 4000
progress: 5000
progress: 6000
progress: 7000
progress: 8000
progress: 9000
progress: 10000
progress: 11000
progress: 12000
progress: 13000
progress: 14000
progress: 15000
progress: 16000
progress: 17000
progress: 18000
progress: 19000
