# Lab 12: Predictive model with Random Forest

As always, we create a SparkContext/HiveContext.

In [None]:
# Set up Spark Context
from pyspark import SparkContext, SparkConf
# NOTE(review): the wildcard import places every public name exported by
# pyspark.sql.functions into the notebook namespace, which makes it hard to
# tell where a name comes from. Consider `import pyspark.sql.functions as F`.
from pyspark.sql.functions import *

# Executor memory is set as a system property, before the SparkContext below
# is constructed.
SparkContext.setSystemProperty('spark.executor.memory', '2g')
# Remaining settings are collected on a SparkConf that is handed to the context.
conf = SparkConf()
conf.set('spark.executor.instances', 15)
conf.set('spark.sql.autoBroadcastJoinThreshold', 100*1024*1024)  # 100MB for broadcast join
# Master string 'yarn-client', application name 'Spark-lab12'.
sc = SparkContext('yarn-client', 'Spark-lab12', conf=conf)

# HiveContext wraps the SparkContext and is used for every hc.sql() call in
# the rest of the notebook.
from pyspark.sql import HiveContext
hc = HiveContext(sc)
# Switch to the 'demo' database so later queries can use unqualified table names.
hc.sql("use demo")

In [None]:
def eval_metrics(lap):
    """Compute precision, recall and accuracy for binary predictions.

    Parameters
    ----------
    lap : table-like (a pandas DataFrame in this notebook)
        Must support boolean-mask indexing and expose 'label' and
        'prediction' columns. Only rows where both columns are 0 or 1 are
        counted; any other value falls outside all four confusion cells.

    Returns
    -------
    dict
        Keys 'precision', 'recall' and 'accuracy', each a float.
        A metric whose denominator is zero (no positive predictions, no
        positive labels, or no counted rows at all) is returned as NaN
        instead of raising ZeroDivisionError.
    """
    def count(label, prediction):
        # Number of rows in one cell of the confusion matrix.
        mask = (lap['label'] == label) & (lap['prediction'] == prediction)
        return float(len(lap[mask]))

    def ratio(numerator, denominator):
        # Undefined ratios become NaN so a sparse slice (for example a single
        # neighborhood with no predicted positives) does not abort the caller.
        return numerator / denominator if denominator else float('nan')

    tp = count(1, 1)
    tn = count(0, 0)
    fp = count(0, 1)
    fn = count(1, 0)

    return {
        'precision': ratio(tp, tp + fp),
        'recall': ratio(tp, tp + fn),
        'accuracy': ratio(tp + tn, tp + tn + fp + fn),
    }

As with lab11:
1. Load the feature matrix created in lab 10 into a Spark dataframe called 'fm'
2. Split into two dataframes - train (2011-2013) and test (only 2014)

In [None]:
<YOUR CODE HERE>

First, create a pipeline very similar to the one in lab 11, only this time we use Random Forest instead of Logistic Regression.
For parameters to the random forest, you can use: numTrees=100, maxDepth=4, maxBins=50

In [None]:
<YOUR CODE HERE>

Now modify this pipeline to also add up to 50 features corresponding to words in the "description" field. Use Tokenizer and HashingTF:

In [None]:
<YOUR CODE HERE>

Train the random forest and evaluate the results using the eval_metrics() function:

In [None]:
<YOUR CODE HERE>

In addition to the overall accuracy metrics, we now want to plot the accuracy of prediction for each neighborhood. In order to do this, we first use ESRI's Hive UDFs ST_X, ST_Y and ST_Centroid to compute the longitude/latitude centroid of each neighborhood in San Francisco:

In [None]:
# Jars that provide the ESRI spatial UDFs (and their guava dependency) must be
# added to the Hive session before the functions can be registered.
jar_statements = (
    "add jar /home/jupyter/notebooks/jars/guava-11.0.2.jar",
    "add jar /home/jupyter/notebooks/jars/esri-geometry-api.jar",
    "add jar /home/jupyter/notebooks/jars/spatial-sdk-hive.jar",
    "add jar /home/jupyter/notebooks/jars/spatial-sdk-json.jar",
)
for statement in jar_statements:
    hc.sql(statement)

# Register the three spatial functions used by the centroid query below.
udf_statements = (
    "create temporary function ST_Centroid as 'com.esri.hadoop.hive.ST_Centroid'",
    "create temporary function ST_X as 'com.esri.hadoop.hive.ST_X'",
    "create temporary function ST_Y as 'com.esri.hadoop.hive.ST_Y'",
)
for statement in udf_statements:
    hc.sql(statement)

# One row per sf_neighborhoods record: its name plus the X/Y of its shape's
# centroid, exposed as cent_longitude / cent_latitude.
centroid_query = """
SELECT neighborho as neighborhood, 
       ST_X(ST_Centroid(sf_neighborhoods.shape)) as cent_longitude,
       ST_Y(ST_Centroid(sf_neighborhoods.shape)) as cent_latitude
FROM sf_neighborhoods
"""
df_centroid = hc.sql(centroid_query)
# Cache the result; it is collected again when the map is drawn.
df_centroid.cache()

Now that we have the centroid for each neighborhood, we compute the accuracy of prediction specific to crimes within that neighborhood. 

Complete the code below to plot a map of San Francisco, with a marker at the centroid of each neighborhood showing the prediction accuracy for that neighborhood.

In [None]:
import folium

from IPython.display import HTML
# Map size; used both for the folium map and, via inline_map(), as the pixel
# width/height of the iframe that embeds it.
map_width=1000
map_height=600
# Latitude/longitude passed as the initial center of the folium map
# (presumably central San Francisco).
sf_lat = 37.77
sf_long = -122.4

def inline_map(m, width=map_width, height=map_height):
    """Wrap a folium map in an iframe so it displays inline in the notebook.

    Parameters
    ----------
    m : map object exposing ``create_map()`` and an ``HTML`` attribute
        (a folium Map in this notebook).
    width, height : iframe size in pixels; default to the notebook-level
        ``map_width`` / ``map_height``.

    Returns
    -------
    IPython.display.HTML
        An ``<iframe>`` whose ``srcdoc`` holds the map's markup.
    """
    # Build the map first; its markup is then read from ``m.HTML``
    # (presumably populated by create_map() -- confirm against the folium version in use).
    m.create_map()
    # Double quotes must be escaped because the markup is placed inside a
    # double-quoted srcdoc attribute.
    escaped_markup = m.HTML.replace('"', '&quot;')
    iframe_template = ('<iframe srcdoc="{}" '
                       'style="width: {}px; height: {}px; '
                       'border: none"></iframe>')
    return HTML(iframe_template.format(escaped_markup, width, height))


# NOTE(review): `results` is not defined in any cell shown here; presumably it
# is the prediction DataFrame produced by the training/evaluation exercise cell.
# Distinct neighborhood names present in the results, as a plain Python list.
n_list = results.select("neighborhood").distinct().toPandas()['neighborhood'].tolist()

# Pull only the columns needed for per-neighborhood metrics into pandas.
df = results.select("neighborhood", "label", "prediction").toPandas()
map_sf = folium.Map(location=[sf_lat, sf_long], zoom_start=12, width=map_width, height=map_height)
# For each neighborhood centroid that also has predictions, score only that
# neighborhood's rows; `m` holds its precision/recall/accuracy dict.
# The marker arguments are left for the exercise.
for n in df_centroid.collect():
    if n.neighborhood in n_list:
        m = eval_metrics(df[df['neighborhood']==n.neighborhood])
        map_sf.simple_marker(<YOUR CODE HERE>)

# Last expression of the cell: displays the map inline.
inline_map(map_sf) 