In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, DataFrameWriter, DataFrameReader
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
from sqlalchemy import Table, Column, Integer, Float, String, MetaData, create_engine
from geoalchemy2 import Geography
from sqlalchemy.sql import select
from sqlalchemy import func

In [4]:
# Deployment environment name; used as a key into config["postgres"] when
# run_city_category looks up the database URL.
env = "development"

In [5]:
def get_config():
    '''Load the notebook settings from ``../config.json``.

    The path is relative to the current working directory of the kernel.

    Returns:
        The parsed JSON document (a dict when the file holds a JSON object).

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: if the file does not contain valid JSON.
    '''
    # JSON is UTF-8 by specification; name the encoding explicitly so that
    # non-ASCII values decode the same way regardless of the platform locale.
    with open("../config.json", "r", encoding="utf-8") as config_file:
        return json.load(config_file)

In [6]:
def get_spark_conf(config):
    '''Build the SparkConf for this notebook.

    The application name is fixed to 'abb_t'; the value of 'spark.master'
    is read from config["spark"]["master_url"].
    '''
    # Both setters return the SparkConf itself, so they can be chained.
    return (
        SparkConf()
        .setAppName('abb_t')
        .set('spark.master', config["spark"]["master_url"])
    )

In [7]:
def get_pg_props(config):
    '''Assemble the PostgreSQL connection properties.

    Reads "user" and "password" from config["postgres"] and pairs them with
    the fixed driver class name "org.postgresql.Driver".

    Raises:
        KeyError: if the "postgres" section or one of its two keys is missing.
    '''
    pg_section = config["postgres"]
    pg_user = pg_section["user"]
    pg_password = pg_section["password"]
    return {
        "user": pg_user,
        "password": pg_password,
        "driver": "org.postgresql.Driver",
    }

In [8]:
# Bootstrap: read ../config.json, derive the Spark configuration from it and
# start the SparkContext that the later cells use to distribute work.
config = get_config()
spark_conf = get_spark_conf(config)
sc = SparkContext(conf=spark_conf)

In [9]:
# Shared SQLAlchemy registry; every Table defined in this notebook is attached to it.
metadata = MetaData()

In [10]:
# SQLAlchemy description of the `yelp` table (name, coordinates, rating,
# review count and address fields). Only the Table object is built here;
# nothing in this cell creates the table in the database.
# keep_existing=True: if 'yelp' is already registered in `metadata`
# (e.g. the cell is re-run), the existing Table object is returned as is.
yelp_table = Table('yelp', metadata,                   
                   Column('id', Integer, primary_key=True),
                   Column('name', String),
                   Column('latitude', Float),
                   Column('longitude', Float),
                   Column('stars', Float),
                   Column('review_count', Integer),
                   Column('address', String),
                   Column('city', String),
                   Column('state', String),
                   keep_existing=True)

In [11]:
# SQLAlchemy description of the `categories` lookup table (id, name).
# compute_listing_score filters on categories.name.
categories_table = Table('categories', metadata,
                         Column('id', Integer, primary_key=True),
                         Column('name', String),
                         keep_existing=True)

In [12]:
# Link table between `yelp` rows and `categories` rows; compute_listing_score
# joins through it (yelp.id -> yelp_id, category_id -> categories.id).
# NOTE(review): category_id is declared String here while categories.id is
# declared Integer, yet the two are compared in a join — confirm the real
# column type in the database and align this declaration with it.
yelp2category_table = Table('yelp2category', metadata,
                            Column('yelp_id', Integer),
                            Column('category_id', String),
                            keep_existing=True)

In [13]:
# SQLAlchemy description of `yelp_geo`: one geography POINT (SRID 4326) per
# yelp_id. compute_listing_score measures ST_Distance against `location`.
yelp_geo_table = Table('yelp_geo', metadata,
                        Column('yelp_id', Integer),
                        Column('location', Geography(geometry_type='POINT', srid=4326)),
                        keep_existing=True)

In [27]:
# SQLAlchemy description of the `abb` listings table. run_city_category
# selects id, name, longitude and latitude from it, filtered by city.
abb_table = Table('abb', metadata,
                  Column('id', Integer, primary_key=True),
                  Column('name', String),
                  Column('latitude', Float),
                  Column('longitude', Float),
                  Column('price', Integer),
                  Column('number_of_reviews', Integer),
                  Column('city', String),
                  keep_existing=True)            

In [14]:
# Maximum ST_Distance between a listing and a Yelp business for the business
# to count towards the listing's score (see compute_listing_score).
# NOTE(review): presumably metres, since yelp_geo.location is a geography
# column — confirm against the PostGIS function actually resolved.
distance_threshold = 500

In [15]:
# Points awarded per nearby business, keyed by int(stars * 2):
# 5.0 stars -> 1000, 4.5 -> 100, 4.0 -> 10, 3.5 -> 1. Other keys score nothing.
score_dict = {10: 1000, 9: 100, 8: 10, 7:1}

In [21]:
# Spark SQL entry point bound to `sc`. No other cell visible in this part of
# the notebook references it.
sql_context = SQLContext(sc)

In [22]:
def compute_listing_score(conn, listing, city, category):
    '''Score one listing by the well-rated Yelp businesses close to it.

    A business counts when it is in `city`, belongs to `category`, has more
    than 3 stars and lies within `distance_threshold` of the listing
    according to ST_Distance.

    Args:
        conn: SQLAlchemy connection used to run the query.
        listing: row-like object exposing `longitude` and `latitude`.
        city: value compared with yelp.city.
        category: value compared with categories.name.

    Returns:
        The sum of score_dict[int(stars * 2)] over the matching businesses
        (businesses whose doubled rating is not a key add nothing), or 0
        when the listing is missing either coordinate.
    '''
    # Without both coordinates there is no point to measure distances from.
    if listing.longitude is None or listing.latitude is None:
        return 0

    # WKT text with longitude first; SQLAlchemy passes it to ST_Distance as a
    # bound parameter rather than splicing it into the SQL string.
    listing_point = f'POINT({listing.longitude} {listing.latitude})'

    selection = select([yelp_table.c.stars])\
        .select_from(yelp_table.join(yelp2category_table, yelp_table.c.id == yelp2category_table.c.yelp_id)\
                               .join(categories_table, yelp2category_table.c.category_id == categories_table.c.id)
                               .join(yelp_geo_table, yelp_geo_table.c.yelp_id == yelp_table.c.id))\
        .where(yelp_table.c.city == city)\
        .where(categories_table.c.name == category)\
        .where(func.ST_Distance(listing_point,
                                yelp_geo_table.c.location) <= distance_threshold)\
        .where(yelp_table.c.stars > 3)

    result = conn.execute(selection)

    # Accumulate with an explicit loop rather than sum(): this notebook
    # star-imports pyspark.sql.functions, which rebinds the name `sum`.
    score = 0
    for row in result:
        score += score_dict.get(int(row.stars * 2), 0)
    return score

In [23]:
def write_score_table(conn, listings, city, category, debug=False):
    '''Score every listing and bulk-insert the results into `abb_<category>`.

    Args:
        conn: SQLAlchemy connection used for the scoring queries and the insert.
        listings: iterable of row-like objects exposing `id`, `longitude`
            and `latitude`.
        city: forwarded to compute_listing_score.
        category: forwarded to compute_listing_score; its lower-cased form
            is the suffix of the target table name.
        debug: when true, print a progress line every 1000 listings.

    The target table is only described here (abb_id, score); this function
    does not create it. Nothing is written when `listings` is empty.
    '''
    rows = []
    for progress, listing in enumerate(listings, start=1):
        score = compute_listing_score(conn, listing, city, category)
        rows.append({"abb_id": listing.id, "score": score})
        if debug and progress % 1000 == 0:
            print("progress:", progress)

    # An empty parameter list is not treated as "insert nothing" by
    # Connection.execute, so bail out explicitly instead of issuing an
    # INSERT without values.
    if not rows:
        return

    table_name = f"abb_{category.lower()}"
    score_table = Table(table_name, metadata,
                        Column('abb_id', Integer, primary_key=True),
                        Column('score', Integer),
                        keep_existing=True)
    conn.execute(score_table.insert(), rows)

In [34]:
def run_city_category(city_category_pair):
    '''Score a sample of listings of one city against one Yelp category.

    Meant to be mapped over an RDD: each call builds its own engine from
    config["postgres"][env]["url"], so nothing database-related has to be
    shipped from the driver.

    Args:
        city_category_pair: a (city, category) tuple.

    Returns:
        A list of {"name": <listing name>, "score": <int>} dicts, one per
        selected listing. At most 10 listings are selected.
    '''
    city, category = city_category_pair

    # NOTE(review): the limit of 10 looks like a development-time cap — confirm
    # before using the scores for anything beyond a smoke test.
    selection = select([abb_table.c.id, abb_table.c.name, abb_table.c.longitude, abb_table.c.latitude])\
        .select_from(abb_table)\
        .where(abb_table.c.city == city)\
        .limit(10)

    engine = create_engine(config["postgres"][env]["url"])
    try:
        # The connection is returned when the block exits; the scores are
        # fully materialised into a list before that happens.
        with engine.connect() as conn:
            listings_result = conn.execute(selection)
            return [
                {"name": listing.name,
                 "score": compute_listing_score(conn, listing, city, category)}
                for listing in listings_result
            ]
    finally:
        # One engine is created per call, so release its pool here instead of
        # leaking connections from every Spark task.
        engine.dispose()
    # Scores are returned to the caller rather than persisted;
    # write_score_table is not invoked from here.

In [35]:
# Driver: distribute the (city, category) pairs over Spark. Each pair is
# handled by run_city_category, which opens its own database connection, and
# collect() brings back one list of {"name", "score"} dicts per pair.
pairs = [("Toronto", "Restaurants"), ("Toronto", "Shopping")]
rdd = sc.parallelize(pairs)
res = rdd.map(run_city_category).collect()

In [36]:
# Dump the collected scores: a list with one entry per (city, category) pair,
# in the same order as `pairs`.
print(res)

[[{'name': 'Beautiful home in amazing area!', 'score': 11039}, {'name': 'Downtown Harbourfront Private Room', 'score': 104}, {'name': 'Union Inn II (Leslieville)', 'score': 390}, {'name': 'Seaton Village Parlour Bedroom', 'score': 5416}, {'name': 'Queen Bedroom close to downtown', 'score': 312}, {'name': 'World Class downtown@ CN Tower! Jays, TIFF, ACC', 'score': 3899}, {'name': 'Executive Studio Unit- Ideal for One Person', 'score': 1650}, {'name': 'Luxury,Safety, Affordability For Women Travellers!', 'score': 3340}, {'name': 'Downtown Toronto - Waterview Condo', 'score': 104}, {'name': 'Entire Suite in Downtown+Parking', 'score': 33}], [{'name': 'Beautiful home in amazing area!', 'score': 10963}, {'name': 'Downtown Harbourfront Private Room', 'score': 2003}, {'name': 'Union Inn II (Leslieville)', 'score': 1011}, {'name': 'Seaton Village Parlour Bedroom', 'score': 6018}, {'name': 'Queen Bedroom close to downtown', 'score': 1000}, {'name': 'World Class downtown@ CN Tower! Jays, TIFF, A