In [4]:
# question 1
# header of DataSample.csv manually edited to remove an unexpected space
# header of POIList.csv manually edited to remove an unexpected space
# and to provide header names different from DataSample.csv
import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Load the raw request sample; the first line of the file is the header row.
# No schema inference is requested here, so every column is read as a string.
df = (
    spark.read
    .option("header", True)
    .csv("DataSample.csv")
)

# Expose the raw requests to Spark SQL under the view name "data".
df.createOrReplaceTempView("data")

# Clean the data by grouping duplicate requests together and keeping only one
# row per group.
#
# How the query works (innermost to outermost):
#   1. ROW_NUMBER() numbers the rows inside each partition of identical
#      (TimeSt, Country, Province, City, Latitude, Longitude), ordered by _ID;
#      the number is exposed as the column "duplicate".
#   2. The middle SELECT keeps only the _IDs whose row number is 1
#      (duplicate < 2), i.e. the first _ID of every partition.
#   3. The outer SELECT returns the full rows from "data" for those _IDs.
#
# NOTE(review): DataSample.csv is read above without inferSchema, so _ID is
# presumably a string here and ORDER BY _ID would be lexicographic rather
# than numeric -- confirm this is the intended notion of "first entry".
df = spark.sql("SELECT * FROM data x \
                    WHERE x._ID IN \
                      (SELECT _ID FROM \
                         (SELECT _ID, ROW_NUMBER() OVER \
                            (PARTITION BY data.TimeSt, data.Country, data.Province, data.City, \
                                data.Latitude, data.Longitude ORDER BY _ID) duplicate \
                            FROM data) \
                            WHERE duplicate < 2);")

# Persist the de-duplicated requests as CSV with a header row.
# Spark writes a *directory* called "cleaned-data" containing part files.
# mode("overwrite") replaces the output of a previous run; without it the
# default save mode raises an error as soon as the directory already exists,
# which makes the cell fail on every re-run of the notebook.
df.write.mode("overwrite") \
 .option("header",True) \
 .csv("cleaned-data")

In [9]:
# question 2
import pyspark
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from geopy import distance

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = SparkSession.builder.master("local[*]") \
                    .getOrCreate()

# Load the cleaned requests produced by question 1. That cell writes with
# .csv("cleaned-data"), which creates a directory of part files with exactly
# that name, so the directory is read here (Spark reads every part file in
# it). The previous path "cleaned-data.csv" is never created by the notebook.
requests = spark.read.options(header='True',inferSchema='True',delimiter=',') \
    .csv("cleaned-data")
requests.createOrReplaceTempView("data")

# Load the points of interest and expose them to Spark SQL as "interestpoints".
pois = spark.read.options(header='True',inferSchema='True',delimiter=',') \
    .csv("POIList.csv")
pois.createOrReplaceTempView("interestpoints")

# Remove duplicate POIs from the list of POIs.
# Depending on what POIs represent, there could legitimately be several POIs
# at one location, with requests distributed evenly among them; here the
# assumption is that duplicates are simply a mistake.
#
# How the query works (innermost to outermost):
#   1. ROW_NUMBER() numbers the POIs inside each partition of identical
#      (POILatitude, POILongitude), ordered by POIID; the number is exposed
#      as the column "duplicate".
#   2. The middle SELECT keeps only the POIIDs whose row number is 1
#      (duplicate < 2), i.e. the first POIID at every location.
#   3. The outer SELECT returns the full rows from "interestpoints" for
#      those POIIDs.
pois = spark.sql("SELECT * FROM interestpoints x \
                    WHERE x.POIID IN \
                      (SELECT POIID FROM \
                         (SELECT POIID, ROW_NUMBER() OVER \
                            (PARTITION BY interestpoints.POILatitude, \
                            interestpoints.POILongitude ORDER BY POIID) duplicate \
                            FROM interestpoints) \
                            WHERE duplicate < 2);")

# Plain-Python wrapper around geopy's distance so it can be used as a
# DataFrame UDF.
def dist(a,b,x,y):
    """Return the distance in kilometres between points (a, b) and (x, y).

    Parameters
    ----------
    a, b : latitude and longitude of the first point.
    x, y : latitude and longitude of the second point.

    Returns
    -------
    float or None
        The ``.km`` value of ``geopy.distance.distance`` for the two points,
        or None when any coordinate is missing.
    """
    # Rows with a missing coordinate arrive here as None; hand Spark a null
    # distance instead of passing None on to geopy.
    if a is None or b is None or x is None or y is None:
        return None
    return distance.distance((a,b),(x,y)).km
# Register dist as a Spark UDF whose result column is a FloatType.
udf_dist = udf(dist, FloatType())

# Build a dataframe with one row per (request, POI) pair, annotated with the
# distance between the two. crossJoin states the Cartesian product explicitly;
# a join() without a condition is an implicit cross join, which Spark 2.x
# rejects unless spark.sql.crossJoin.enabled is set.
distances = requests.crossJoin(pois).withColumn(
    'Distance',
    udf_dist(requests.Latitude, requests.Longitude,
             pois.POILatitude, pois.POILongitude))

# Find the minimum distance per request.
min_distances = distances.groupBy('_ID').min('Distance')

# Annotate every pair with its request's minimum distance, keep only the
# pair(s) that achieve it, then drop the helper and POI coordinate columns.
# Note: a request that is exactly equidistant from two POIs keeps both rows.
distances = distances.join(min_distances, distances._ID == min_distances._ID).select(distances["*"],min_distances["min(Distance)"])
distances = distances.filter(distances["Distance"]==distances["min(Distance)"])
distances = distances.drop("min(Distance)", "POILatitude", "POILongitude")

# Persist the assignment as CSV with a header row. Spark writes a directory
# called "assigned-data"; mode("overwrite") lets the cell be re-run, whereas
# the default save mode fails when the directory already exists.
distances.write.mode("overwrite") \
 .option("header",True) \
 .csv("assigned-data")

In [11]:
# question 3
import folium
import pyspark
from pyspark.sql.types import FloatType
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import math

# Start (or reuse) a Spark session that runs locally on all available cores.
spark = SparkSession.builder.master("local[*]") \
                    .getOrCreate()

# Load the request-to-POI assignment produced by question 2. That cell writes
# with .csv("assigned-data"), which creates a directory of part files with
# exactly that name, so the directory is read here. The previous path
# "assigned-data.csv" is never created by the notebook.
df = spark.read.options(header='True',inferSchema='True',delimiter=',') \
    .csv("assigned-data")
df.createOrReplaceTempView("data")

# Load the points of interest and expose them to Spark SQL as "interestpoints".
pois = spark.read.options(header='True',inferSchema='True',delimiter=',') \
    .csv("POIList.csv")
pois.createOrReplaceTempView("interestpoints")

# Discard implausible points: keep only requests that fall strictly inside an
# extremes box around Canada (all four bounds are exclusive).
inside_canada_box = (
    (df["Latitude"] > 41.681389)
    & (df["Latitude"] < 83.111389)
    & (df["Longitude"] > -141.001944)
    & (df["Longitude"] < -52.619444)
)
df = df.filter(inside_canada_box)

# TODO: this repeats the POI de-duplication from question 2; remove the
# repeated code when the cells are combined later.
#
# Keeps one POI per distinct (POILatitude, POILongitude): ROW_NUMBER() numbers
# the POIs at each location in POIID order (column "duplicate"), the middle
# SELECT keeps the POIIDs numbered 1 (duplicate < 2), and the outer SELECT
# returns the full rows from "interestpoints" for those POIIDs.
pois = spark.sql("SELECT * FROM interestpoints x \
                    WHERE x.POIID IN \
                      (SELECT POIID FROM \
                         (SELECT POIID, ROW_NUMBER() OVER \
                            (PARTITION BY interestpoints.POILatitude, \
                            interestpoints.POILongitude ORDER BY POIID) duplicate \
                            FROM interestpoints) \
                            WHERE duplicate < 2);")

# Per-POI summary statistics of the assigned requests, computed in a single
# aggregation pass instead of four separate groupBy passes each followed by
# its own join. The aliases reproduce the column names the separate
# aggregations generated, because the cells below refer to them by name.
from pyspark.sql import functions as F  # local alias; avoids relying on the star import

stats = df.groupBy('POIID').agg(
    F.count(F.lit(1)).alias("count"),                 # number of requests assigned to the POI
    F.avg("Distance").alias("avg(Distance)"),         # mean request-to-POI distance
    F.stddev("Distance").alias("stddev(Distance)"),   # sample standard deviation of the distance
    F.max("Distance").alias("max(Distance)"),         # farthest assigned request
)

# Attach the statistics to the POI rows. This is an inner join, so a POI that
# received no requests does not appear in poistats.
poistats = pois.join(stats, pois.POIID == stats.POIID).select(
    pois["*"],
    stats["count"],
    stats["avg(Distance)"],
    stats["stddev(Distance)"],
    stats["max(Distance)"],
)

def density(count, radius):
    """Return the number of items per unit area of a circle.

    Parameters
    ----------
    count : number of items inside the circle.
    radius : radius of the circle; the result is expressed per squared unit
        of this radius.

    Returns
    -------
    float or None
        ``count / (pi * radius**2)``, or None when either input is missing or
        the radius is zero (the circle has no area, so density is undefined).
    """
    # Guard the inputs: a null would raise TypeError and a zero radius would
    # raise ZeroDivisionError, either of which aborts the whole Spark job
    # when this runs as a UDF.
    if count is None or radius is None or radius == 0:
        return None
    return count / (math.pi*(radius**2.0))
# Wrap density as a Spark UDF whose result column is a FloatType.
udf_density = udf(density, FloatType())

# Density of each POI: its request count divided by the area of the circle
# whose radius is the distance to its farthest assigned request.
request_count = poistats["count"]
coverage_radius = poistats["max(Distance)"]
poistats = poistats.withColumn("Density", udf_density(request_count, coverage_radius))

# Print the per-POI statistics table.
poistats.show()

# Draw one circle per POI on an interactive map.
m = folium.Map(location=[57.4, -96.466667], zoom_start=3)

for poi_row in poistats.collect():
    # Distance was computed in kilometres in question 2; it is scaled by 1000
    # here because folium.Circle takes its radius in metres.
    coverage_circle = folium.Circle(
        radius=poi_row["max(Distance)"] * 1000.0,
        location=[poi_row["POILatitude"], poi_row["POILongitude"]],
        popup=poi_row["POIID"],
        color='crimson',
        fill=False,
    )
    coverage_circle.add_to(m)

# Last expression of the cell: renders the map as the cell output.
m

+-----+-----------+------------+-----+------------------+-----------------+-------------+------------+
|POIID|POILatitude|POILongitude|count|     avg(Distance)| stddev(Distance)|max(Distance)|     Density|
+-----+-----------+------------+-----+------------------+-----------------+-------------+------------+
| POI4|   45.22483|  -63.232729|  454|238.88439764537446|224.9830158294344|    857.25476|1.9664648E-4|
| POI1|  53.546167| -113.485734| 9349| 294.7689992814782|287.9031053577069|    1689.5304|0.0010425173|
| POI3|  45.521629|  -73.566024| 9228|   451.77785672626|224.5055577157318|    1500.4532|0.0013047064|
+-----+-----------+------------+-----+------------------+-----------------+-------------+------------+



In [20]:
import folium

# Manually drawn circles used to test the map rendering.
# Eventually want to take average.
m = folium.Map(location=[48.0, -85.0], zoom_start=3)

# (popup label, [latitude, longitude], radius, outline colour) per circle,
# listed in the order the circles are added to the map.
test_circles = (
    ('POI1', [53.546167, -113.485734], 1689530.4, 'crimson'),
    ('POI3', [45.521629, -73.566024], 1500453.2, 'blue'),
    ('POI4', [45.22483, -63.232729], 857254.76, 'green'),
)

for label, centre, circle_radius, outline in test_circles:
    folium.Circle(
        radius=circle_radius,
        location=centre,
        popup=label,
        color=outline,
        fill=False,
    ).add_to(m)

# Last expression of the cell: renders the map as the cell output.
m