###### 0. Loading libraries

In [2]:
# -*- coding: utf-8 -*-
import re
import time
import requests
import string
from datetime import date
import os
from io import BytesIO
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as sqlFunction
from pyspark.sql.types import *
from fnmatch import fnmatch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

###### 1. Reading park info file: read records from uploaded table

In [4]:
# Location of the uploaded park-info CSV.
parkinfo_location = '/FileStore/tables/9kjsmmkl1478305508067/parksinfo4.csv'

# Column names of the raw file, in file order. The leading 'null' column is
# dropped right after loading.
park_df_features = [
    'null', 'park_id',
    'html', 'name', 'city', 'state',
    'Excellent', 'Very_good', 'Average', 'Poor', 'Terrible',
    'Families', 'Couples', 'Friends', 'Solo', 'Business',
    'Spring', 'Summer', 'Fall', 'Winter',
]

# Matching Spark types: two numeric columns, four text columns, then the
# fourteen rating / visitor-type / season columns.
park_df_dtypes = [FloatType()] * 2 + [StringType()] * 4 + [FloatType()] * 14

# Every field is declared nullable.
park_df_schema = StructType(
    [StructField(col_name, col_type, True)
     for col_name, col_type in zip(park_df_features, park_df_dtypes)])

parks_df = (sqlContext.read
            .format("com.databricks.spark.csv")
            .options(header=True)
            .schema(park_df_schema)
            .load(parkinfo_location)
            .drop('null'))
parks_df.cache()
display(parks_df)

In [5]:
import os
import urllib

# Credentials are read from the environment so they never have to be typed
# into the notebook; the empty-string fallback matches the previous
# placeholders.
ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
# The secret is embedded in a URL below, so every reserved character
# (including '/') is percent-encoded.
ENCODED_SECRET_KEY = urllib.quote(SECRET_KEY, "")
AWS_BUCKET_NAME = "parks101"
MOUNT_NAME = "parks_info_review"
MOUNT_POINT = "/mnt/%s" % MOUNT_NAME

# Only skip mounting when the mount point really exists. Any other failure
# (bad credentials, missing bucket, ...) now surfaces instead of being
# reported as "already mounted".
if any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
  print('AWS bucket already mounted.')
else:
  dbutils.fs.mount("s3n://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), MOUNT_POINT)

In [6]:
# Column names of the raw review file, in file order.
review_df_features = [
    'index', 'review_index',
    'comment', 'date',
    'park_id', 'stars',
    'title',
    'reviewer_level',
    'reviewer',
]

# Spark type of each column above, position by position.
review_df_dtypes = [
    IntegerType(), IntegerType(),
    StringType(), StringType(),
    FloatType(), FloatType(),
    StringType(),
    FloatType(),
    StringType(),
]

# Every field is declared nullable.
review_df_schema = StructType(
    [StructField(col_name, col_type, True)
     for col_name, col_type in zip(review_df_features, review_df_dtypes)])

# Read the reviews from the mounted bucket and discard 'review_index'.
raw_reviews_reader = (sqlContext.read
                      .format("com.databricks.spark.csv")
                      .schema(review_df_schema)
                      .options(header=True))
parks_review_df = raw_reviews_reader.load('dbfs:/mnt/parks_info_review/reviews_raw.csv').drop('review_index')
display(parks_review_df)

##### 2. Data Cleaning

* Correct state name
* add state name abbreviation (for plotting in databricks)

###### 2.1 Data cleaning: finding alias of state names

In [9]:
# Accepted spellings of the 'state' column (50 states plus 'Washington DC').
states = [
    'Mississippi', 'Oklahoma', 'Delaware', 'Minnesota', 'Illinois',
    'Arkansas', 'New Mexico', 'Indiana', 'Louisiana', 'Texas',
    'Wisconsin', 'Kansas', 'Connecticut', 'California', 'West Virginia',
    'Georgia', 'North Dakota', 'Pennsylvania', 'Alaska', 'Missouri',
    'South Dakota', 'Colorado', 'New Jersey', 'Washington', 'New York',
    'Nevada', 'Washington DC', 'Maryland', 'Idaho', 'Wyoming',
    'Arizona', 'Iowa', 'Michigan', 'Utah', 'Virginia',
    'Oregon', 'Montana', 'New Hampshire', 'Massachusetts', 'South Carolina',
    'Vermont', 'Florida', 'Hawaii', 'Kentucky', 'Rhode Island',
    'Nebraska', 'Ohio', 'Alabama', 'North Carolina', 'Tennessee',
    'Maine',
]

# Distinct (city, state) pairs whose state is not one of the accepted names.
is_known_state = parks_df['state'].isin(states)
wrong_state_name = parks_df.select('city', 'state').filter(~is_known_state).distinct()
display(wrong_state_name)

###### 2.2a Data cleaning: standardize state names

In [11]:
# Truncated or otherwise wrong 'state' values mapped to the intended name.
# 'Carolin' becomes 'Carolina'; the North/South part is taken from the city
# column inside state_correct.
_STATE_NAME_FIXES = {
    'Columbia': 'Washington DC',
    'Ha': 'Hawaii',
    'Ma': 'Maine',
    'Missi': 'Mississippi',
    'C': 'California',
    'Cali': 'California',
    'Calif': 'California',
    'Califo': 'California',
    'Califor': 'California',
    'Wy': 'Wyoming',
    'Reg': 'Pennsylvania',
    'Nationa': 'Alaska',
    'Georgi': 'Georgia',
    'Carolin': 'Carolina',
    'Fran': 'California',
}

# First words of two-word state names that can end up at the tail of 'city'.
_STATE_NAME_PREFIXES = ('South', 'North', 'West', 'New', 'Rhode')


def state_correct(city, state_name):
  """Return a standardized state name for one park row.

  Args:
    city: value of the 'city' column; its last '_'-separated token is
      inspected for a state-name prefix such as 'South' or 'New'. May be None.
    state_name: value of the 'state' column. May be None.

  Returns:
    The corrected state name, or None when state_name is None.
  """
  # A null state cannot be repaired; returning None keeps the column null
  # instead of failing the whole Spark task.
  if state_name is None:
    return None
  corrected_name = _STATE_NAME_FIXES.get(state_name, state_name)
  # Without a city there is no prefix to recover.
  if city is None:
    return corrected_name
  prefix = city.split('_')[-1]
  # Skip names that already carry the prefix so that re-running the cleaning
  # cell cannot produce values such as 'New New York'.
  if prefix in _STATE_NAME_PREFIXES and not corrected_name.startswith(prefix + ' '):
    corrected_name = prefix + ' ' + corrected_name
  return corrected_name
  
# Wrap state_correct as a Spark UDF that returns a string column.
state_correct_udf=udf(state_correct, StringType())

###### 2.2b Data cleaning: replace '_' with ' ' in park names

In [13]:
def park_name(pname):
  """Return the park name with every underscore replaced by a space.

  Args:
    pname: raw park name, or None for a null cell.

  Returns:
    The cleaned name, or None when pname is None (a null name previously
    raised inside the UDF and failed the Spark task).
  """
  if pname is None:
    return None
  return pname.replace('_', ' ')
# Wrap park_name as a Spark UDF that returns a string column.
park_name_udf = udf(park_name,StringType())

In [14]:
# total_visit is the sum of the four seasonal counts; 'state' is replaced by
# its corrected spelling.
parks_df = (
    parks_df
    .withColumn('total_visit',
                parks_df['Spring'] + parks_df['Summer'] + parks_df['Fall'] + parks_df['Winter'])
    .withColumn('state', state_correct_udf('city', 'state'))
)
# Park names with underscores turned into spaces.
parks_df_clean = parks_df.withColumn('name', park_name_udf('name'))


In [15]:
# Parks ordered from most to least visited.
display(
    parks_df_clean
    .select('name', 'state', 'total_visit')
    .orderBy('total_visit', ascending=False)
)

The most popular parks are in or around big cities.

###### 2.3 Data cleaning: create a state-abbreviation column for the map display
The Databricks map display option only accepts state abbreviations, so each state name has to be converted to its abbreviation.

In [18]:
# Full state name -> two-letter abbreviation.
_STATE_ABBREVIATIONS = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME',
    'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'Washington DC': 'DC',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}


def state_abbrev(state_name):
  """Return the two-letter abbreviation for a state name.

  Names without an entry in the table (including None) are returned unchanged.
  """
  return _STATE_ABBREVIATIONS.get(state_name, state_name)
# Spark wrapper around state_abbrev, returning a string column.
state_abbrev_udf = udf(state_abbrev, StringType())

# Cleaned park table plus a 'state_abb' column derived from 'state'.
parks_df_abb = parks_df_clean.withColumn(
    'state_abb',
    state_abbrev_udf(parks_df_clean['state']),
)
parks_df_abb.cache()
display(parks_df_abb)


In [19]:
def float2int(num):
  """Truncate a float to an int; return any other value unchanged."""
  return int(num) if isinstance(num, float) else num

# English month names, in calendar order (list index + 1 == month number).
_MONTH_NAMES = ['January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December']

# Month name, then the first 1-2 digit run (day), then the first 4-digit run
# (year) that follows. Raw string so the regex escapes are not Python escapes.
_REVIEW_DATE_PATTERN = '(' + '|'.join(_MONTH_NAMES) + r').*?(\d{1,2}).*?(\d{4})'


def str2date(onestr):
    """Extract a date written as '<Month name> <day> ... <year>' from text.

    Args:
      onestr: free text containing a date, e.g. 'Reviewed December 5, 2015'.
        May be None.

    Returns:
      A datetime.date, or None when onestr is None, contains no matching
      date, or the matched numbers do not form a valid calendar date.
    """
    try:
      segs = re.search(_REVIEW_DATE_PATTERN, onestr)
      month, day, year = segs.group(1), segs.group(2), segs.group(3)
      return date(int(year), _MONTH_NAMES.index(month) + 1, int(day))
    except (AttributeError, TypeError, ValueError):
      # AttributeError: no match (segs is None); TypeError: onestr is not a
      # string; ValueError: impossible date such as 'February 30'.
      # Anything else is a real bug and is allowed to propagate.
      return None
  
def wordvec(onestr):
  """Lower-case the text, drop everything except a-z, 0-9 and spaces, and
  return the remaining whitespace-separated words as a list."""
  lowered = onestr.lower()
  kept_chars = re.sub('[^a-z0-9 ]', '', lowered)
  return kept_chars.split()
  
# Spark UDF wrappers for the helpers above, with their declared return types:
# integer, date, and array of strings respectively.
float2intudf = udf(float2int, IntegerType())
str2dateudf = udf(str2date, DateType())
wordvecudf = udf(wordvec, ArrayType(StringType()))


# Cleaned review table: integer park ids, parsed dates, duplicate rows removed.
review_columns = [
    float2intudf('park_id').alias('park_id'),
    'reviewer',
    str2dateudf('date').alias('date'),
    'title',
    'comment',
    'stars',
]
review_df = parks_review_df.select(*review_columns).distinct().cache()
display(review_df)

###### 2.4 Save cleaned park info data to HDFS

In [21]:
# Set to True to (re)write the cleaned park table.
save_switch = False
# One path for both the write and the listing. The listing previously used
# 'Wei data/...' without the leading slash, which need not resolve to the
# directory that was written.
clean_parks_path = '/Wei data/parks_info_clean.csv'
if save_switch:
  (parks_df_abb.write
   .format("com.databricks.spark.csv")
   .option('header', 'true')
   .mode('overwrite')
   .save(clean_parks_path))
# List the files at the save location (it must already exist when
# save_switch is False).
display(dbutils.fs.ls(clean_parks_path))

In [22]:
# List the contents of the '/Wei data/' directory.
dbutils.fs.ls('/Wei data/')

In [23]:
# Upload the cleaned park table to S3 as a single CSV object using boto.
from boto.s3.connection import S3Connection
from boto.s3.key import Key

# Credentials are intentionally blank here and must be filled in before running.
# NOTE(review): prefer reading them from the environment or a secret store
# rather than typing them into the notebook.
ACCESS_KEY = ""
SECRET_KEY = ""
BUCKET_NAME = 'parks101'
# NOTE(review): this key is 'park_info_clean.csv', while the save path used
# elsewhere in the notebook is 'parks_info_clean.csv' — confirm that the
# difference is intended.
file_name = 'park_info_clean.csv'
s3_handle = S3Connection(ACCESS_KEY, SECRET_KEY)
bucket_handle = s3_handle.get_bucket(BUCKET_NAME)
new_file_handle = bucket_handle.new_key(file_name)
# toPandas() collects the whole table to the driver; to_csv() with no
# arguments returns the CSV text, which also includes the pandas index as
# the first column.
new_file_handle.set_contents_from_string(parks_df_abb.toPandas().to_csv())

In [24]:
# Print the name of every key in the bucket.
for s3_key in bucket_handle.list():
  print(s3_key.name)

In [25]:
def read_in_clean_df(filepath):
  """Load a previously saved cleaned park table and cache it.

  The file is read as CSV with a header row and a fixed 22-column schema:
  the 20 raw park-info columns followed by 'total_visit' and 'state_abb'.

  NOTE(review): the schema starts with a 'null' column. That presumably
  matches a CSV whose first column is an index (e.g. one written through
  pandas); confirm that the file being loaded has that leading column.

  Args:
    filepath: location of the CSV to load.

  Returns:
    The cached Spark DataFrame.
  """
  column_names = [
      'null', 'park_id',
      'html', 'name', 'city', 'state',
      'Excellent', 'Very_good', 'Average', 'Poor', 'Terrible',
      'Families', 'Couples', 'Friends', 'Solo', 'Business',
      'Spring', 'Summer', 'Fall', 'Winter',
      'total_visit',
      'state_abb',
  ]
  # Two numeric columns, four text columns, fifteen numeric columns
  # (ratings, visitor types, seasons, total_visit), then the abbreviation.
  column_types = [FloatType()] * 2 + [StringType()] * 4 + [FloatType()] * 15 + [StringType()]
  schema = StructType(
      [StructField(col_name, col_type, True)
       for col_name, col_type in zip(column_names, column_types)])
  reader = (sqlContext.read
            .format("com.databricks.spark.csv")
            .options(header=True)
            .schema(schema))
  clean_df = reader.load(filepath)
  clean_df.cache()
  return clean_df

# Set to True to reload the cleaned park table from storage instead of
# recomputing it with the cells above.
switch = False
if switch:
  parkinfo_location = '/Wei data/parks_info_clean.csv'
  # Was `parinfo_location`, an undefined name that raised NameError as soon
  # as the switch was turned on.
  parks_df_abb = read_in_clean_df(parkinfo_location)

###### 3.1 Exploration: Parks and visitings per state count

In [27]:
# Remove the aggregate wrapper from the column names of an aggregated
# DataFrame, e.g. 'sum(col_name)' -> 'col_name'.
def recolumn_grouped_df(df):
  """Return df with each 'func(name)' column renamed to 'name'.

  Args:
    df: object exposing `columns` (list of names) and `toDF(*names)`,
      i.e. a Spark DataFrame.

  Returns:
    The result of df.toDF(...) with the rewritten names. Columns whose name
    has no parenthesised part keep their name.
  """
  # Raw string: '\(' is a regex escape, not a Python string escape.
  wrapped_name = re.compile(r'\((.*?)\)')
  new_names = []
  for col in df.columns:
    found = wrapped_name.search(col)
    new_names.append(found.group(1) if found else col)
  return df.toDF(*new_names)

###### 3.1.1 Group by state

In [29]:
# Keep parks with more than this many recorded visits.
visit_threshold = 0

# Per-state sums of every numeric column, with the 'sum(...)' wrappers
# removed from the column names.
parks_sum_df = recolumn_grouped_df(
    parks_df_abb
    .filter(parks_df_abb['total_visit'] > visit_threshold)
    .groupBy('state_abb')
    .sum()
).cache()
display(parks_sum_df)

###### 3.1.1 First count the visits in each state

In [31]:
# Total visits per state abbreviation.
display(parks_sum_df.select('state_abb', 'total_visit'))

The most attractive states are California, Florida, and New York.

###### 3.1.2 Result dominated by recreation sites. Filter only national parks

In [34]:
def isnationalpark(park_name):
  """Return True when the park name contains the text 'National Park'.

  A null name (None) returns False; previously it raised TypeError inside
  the UDF and failed the Spark task.
  """
  if park_name is None:
    return False
  return 'National Park' in park_name
# Wrap isnationalpark as a Spark UDF that returns a boolean column.
isnationalpark_udf = udf(isnationalpark, BooleanType())

In [35]:
# Keep only rows whose name contains 'National Park'.
is_national_park = isnationalpark_udf('name')
nationalparks_df = parks_df_abb.where(is_national_park).cache()
display(nationalparks_df)

In [36]:
# Keep national parks with more than this many recorded visits.
visit_threshold = 0

# Per-state sums over the national parks, with the 'sum(...)' wrappers
# removed from the column names.
nationalparks_sum_df = recolumn_grouped_df(
    nationalparks_df
    .filter(nationalparks_df['total_visit'] > visit_threshold)
    .groupBy('state_abb')
    .sum()
).cache()
display(nationalparks_sum_df.select('state_abb', 'total_visit'))

Arizona, Utah, and Florida have the most national-park visitors. Unfortunately, many states don't have any national parks.

###### 3.1.3 How about the favorable-rating ratio

In [39]:
# Share of 'Excellent' and 'Very_good' ratings relative to total_visit, per state.
display(nationalparks_sum_df.selectExpr('state_abb', '(Excellent + Very_good) / total_visit'))

###### 3.1.4 Then how about the unfavorable-rating ratio

In [41]:
# Share of 'Poor' and 'Terrible' ratings relative to total_visit, per state.
display(nationalparks_sum_df.selectExpr('state_abb', '(Poor + Terrible) / total_visit'))

###### 3.1.5 Break into seasons

In [43]:
# Spring visits as a fraction of total_visit, per state.
display(nationalparks_sum_df.selectExpr('state_abb', 'Spring / total_visit'))

In [44]:
# Summer visits as a fraction of total_visit, per state. The column is named
# 'Summer'; the lower-case spelling only resolved because Spark SQL analysis
# is case-insensitive by default.
display(nationalparks_sum_df.selectExpr('state_abb', 'Summer / total_visit'))

In [45]:
# Fall visits as a fraction of total_visit, per state.
display(nationalparks_sum_df.selectExpr('state_abb', 'Fall / total_visit'))

In [46]:
# Winter visits as a fraction of total_visit, per state.
display(nationalparks_sum_df.selectExpr('state_abb', 'Winter / total_visit'))

###### 3.1.6 Seems for winter season there are a few choices

In [48]:
# States picked as winter destinations. The values must match what
# state_abbrev produces for 'state_abb'; Hawaii is 'HI' there, so the former
# 'HA' entry never matched any row.
winter_states = ['TX', 'FL', 'CA', 'SC', 'VA', 'HI', 'AZ', 'OH']

# Per-park sums for national parks in those states. 'park_id' is a grouping
# key, so its own sum column is dropped.
winter_nationalparks_sum_df = (nationalparks_df.drop('null')
                               .filter(nationalparks_df['state_abb'].isin(winter_states))
                               .groupBy(['park_id','name','state_abb'])
                               .sum()
                               .drop('sum(park_id)'))
# Strip the 'sum(...)' wrappers and order by total visits, largest first.
winter_nationalparks_df = recolumn_grouped_df(winter_nationalparks_sum_df).sort('total_visit',ascending=False)
display(winter_nationalparks_df)

In [49]:
# First five rows of the winter national-park table.
winter_nationalparks_df.take(5)

In [50]:
# Column names available for plotting.
winter_nationalparks_df.columns

###### 3.1.7 Visualize the results
* Approval score
* Type of visit

In [52]:
# Bring the (small) winter table to the driver and pivot it into
# {column name: [value for each park]}.
winterpark_data = winter_nationalparks_df.collect()
winterpark_dic = dict()
for c in winter_nationalparks_df.columns:
  winterpark_dic[c] = [row[c] for row in winterpark_data]
visit_types = ['Families',	'Couples',	'Friends',	'Solo',	'Business']
rating_levels = ['Excellent',	'Very_good',	'Average',	'Poor',	'Terrible']
park_names = winterpark_dic['name']
colors = ['r', 'g', 'b', 'c', 'm']
num_parks = len(park_names)
num_visit_types = len(visit_types)
num_rating_levels = len(rating_levels)


def _stacked_barh(subplot_code, categories, data, labels, palette, bar_height):
  """Draw one stacked horizontal bar chart into the given subplot slot.

  Args:
    subplot_code: matplotlib subplot code, e.g. 211.
    categories: column names to stack, in drawing order (also the legend).
    data: dict mapping each category to one count per park.
    labels: y tick labels, one per park.
    palette: one colour per category.
    bar_height: thickness of each bar.

  Returns:
    The axes that were drawn on.
  """
  axes = plt.subplot(subplot_code)
  axes.set_aspect(150)
  axes.set_xlim([0, 10000])
  ypos = np.arange(len(labels)) * 2
  # Running right edge of the bars already drawn for each park.
  left_edges = [0] * len(labels)
  bars = []
  for idx, category in enumerate(categories):
    counts = data[category]
    bars.append(plt.barh(ypos, counts, bar_height, color = palette[idx], left = left_edges))
    # Distinct names here: the original comprehension reused the loop index.
    left_edges = [edge + count for edge, count in zip(left_edges, counts)]
  plt.legend([bar[0] for bar in bars], categories, prop={'size':8}, bbox_to_anchor=(0.9, 0.4))
  plt.yticks(ypos +bar_height/2., labels, rotation=0, size=8)
  plt.xlabel('visit counts')
  return axes


fig = plt.figure(figsize=(12,9))
width = 1.4

# Top panel: visits split by visitor type; bottom panel: by rating level.
ax = _stacked_barh(211, visit_types, winterpark_dic, park_names, colors, width)
ax = _stacked_barh(212, rating_levels, winterpark_dic, park_names, colors, width)
display(fig)

###### 3.2 Free memory

In [54]:
# Release the DataFrames used only by the exploration section.
# parks_df_abb stays cached because the recommendation cells below still
# read it.
parks_df.unpersist()
parks_sum_df.unpersist()
nationalparks_df.unpersist()
# Cached when it was built but previously never released.
nationalparks_sum_df.unpersist()
winter_nationalparks_df.unpersist()

In [55]:
# Number of distinct values in the 'reviewer' column.
distinct_reviewers_df = review_df.select('reviewer').distinct()
number_reviewers = distinct_reviewers_df.count()
print("There are %d reviewers." % number_reviewers)


In [56]:
# Number of rows in the de-duplicated review table.
total_reviews = review_df.count()
print("The total reviews are %d." % total_reviews)

In [57]:
# Reviews per park, most-reviewed first.
park_review_count = (
    review_df
    .groupby('park_id')
    .count()
    .select('park_id', 'count')
    .orderBy('count', ascending=False)
)
display(park_review_count)

In [58]:
# Drop reviews whose reviewer is the literal 'unknown' and keep only the
# three columns needed for the recommender.
review_name_df=review_df.filter(review_df['reviewer']!='unknown').select('park_id','reviewer','stars')

In [59]:
# Reviews per named reviewer.
grouped_reviewer_df = (review_name_df
                       .groupby('reviewer')
                       .count()
                       .select('reviewer', 'count'))
# Same table with the count column relabelled, most active reviewers first.
reviewer_count_df = (grouped_reviewer_df
                     .withColumnRenamed('count', 'num of reviews')
                     .sort('num of reviews', ascending=False))
display(reviewer_count_df.head(5))

In [60]:
# Mean number of reviews written per named reviewer (a one-row result).
reviews_per_reviewer_df = review_name_df.groupby('reviewer').count()
average_reviews_per_id = reviews_per_reviewer_df.selectExpr('mean(count)')
print(average_reviews_per_id.first()[0])

In [61]:
# NOTE(review): this import is no longer used by this cell; it is kept only
# so that nothing else relying on the name breaks.
from pyspark.sql.functions import monotonicallyIncreasingId

# Give every distinct reviewer one id. Previously each review ROW received
# its own id, so no "user" had more than one rating and the collaborative
# filter had nothing to learn from. zipWithIndex yields consecutive indices
# starting at 0; adding 1 keeps id 0 free for the hand-made user added later.
reviewer_ids_rdd = (review_name_df
                    .select('reviewer')
                    .distinct()
                    .rdd
                    .zipWithIndex()
                    .map(lambda pair: (pair[0][0], pair[1] + 1)))
reviewer_id_df = sqlContext.createDataFrame(reviewer_ids_rdd, ['reviewer', 'user_id'])

# Attach the reviewer's id to each of their reviews; the output keeps the
# same three columns as before.
parks_review_df = (review_name_df
                   .join(reviewer_id_df, 'reviewer')
                   .select('user_id', 'park_id', 'stars'))
display(parks_review_df)

In [62]:
# Fixed seed so the 60/20/20 split is reproducible.
seed = 0
training_df, validation_df, test_df = parks_review_df.randomSplit([0.6, 0.2, 0.2], seed=seed)
# All three splits are reused several times below, so cache each of them.
for split_df in (training_df, validation_df, test_df):
  split_df.cache()

In [63]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Alternating least squares over (user_id, park_id, stars).
als = ALS()
als.setMaxIter(6)
als.setSeed(seed)
als.setRegParam(0.1)
als.setUserCol('user_id')
als.setItemCol('park_id')
als.setRatingCol('stars')

# RMSE between the 'prediction' and 'stars' columns.
reg_eval = RegressionEvaluator(predictionCol='prediction', labelCol='stars', metricName='rmse')

# NOTE(review): tolerance is not referenced anywhere in the visible notebook.
tolerance = 0.03
ranks = [4, 8, 12]
errors = [0] * len(ranks)
models = [0] * len(ranks)
min_error = float('inf')
# best_rank is an INDEX into `ranks`, not a rank value.
best_rank = -1

# Fit one model per candidate rank and keep the lowest validation RMSE.
for rank_idx, rank in enumerate(ranks):
  als.setRank(rank)
  model = als.fit(training_df)
  predict_df = model.transform(validation_df)
  # Leave out rows whose prediction is NaN before scoring.
  predicted_star_df = predict_df.filter(predict_df.prediction != float('nan'))
  error = reg_eval.evaluate(predicted_star_df)
  errors[rank_idx] = error
  models[rank_idx] = model
  print('For rank %s the RMSE is %s' % (rank, error))
  if error < min_error:
    min_error, best_rank = error, rank_idx

als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
best_model = models[best_rank]

# Score the selected model on the held-out test split.
predict_df = best_model.transform(test_df)
predicted_test_df = predict_df.filter(predict_df.prediction != float('nan'))
test_RMSE = reg_eval.evaluate(predicted_test_df)
print(test_RMSE)

In [64]:
# Lookup table of park ids, names and states (used to pick park ids to rate by hand).
display(parks_df_abb.select('park_id','name','state'))

In [65]:
# Predict for yourself: add hand-made ratings under a user id of your own.
from pyspark.sql import Row

my_user_id = 0
# (user_id, park_id, stars) triples.
# Balboa Park, Grand Canyon National Park,Mammoth Cave National Park,Acadia National Park,Magic Kindom,Independence Visitor Center,The National WWII Museum,Martin Luther King Jr National Historic Site
my_rated_parks = [
    (my_user_id, 5, 4.5),
    (my_user_id, 8, 5.0),
    (my_user_id, 17, 3.0),
    (my_user_id, 10, 5.0),
    (my_user_id, 39, 5.0),
    (my_user_id, 53, 2.0),
    (my_user_id, 776, 1.0),
    (my_user_id, 1043, 2.0),
    (my_user_id, 158, 0.5),
    (my_user_id, 909, 4.8),
]

my_parks_df = sqlContext.createDataFrame(my_rated_parks, ['user_id', 'park_id', 'stars'])
# Append the personal ratings to the training split.
training_with_my_ratings_df = training_df.unionAll(my_parks_df)

In [66]:
import pyspark.sql.functions as F

# Refit with the selected rank on the training data plus the personal ratings.
als.setPredictionCol('prediction')
als.setMaxIter(6)
als.setSeed(seed)
als.setUserCol('user_id')
als.setItemCol('park_id')
als.setRatingCol('stars')
als.setRank(ranks[best_rank])
my_ratings_model = als.fit(training_with_my_ratings_df)

# Parks not rated by hand, each tagged with the personal user id.
my_rated_park_id = [rating[1] for rating in my_rated_parks]
already_rated = parks_df_abb['park_id'].isin(my_rated_park_id)
not_rated_df = parks_df_abb.filter(~already_rated)
my_not_rated_df = not_rated_df.withColumn('user_id', F.lit(my_user_id))

# Predict a rating for every unrated park and leave out NaN predictions.
raw_predicted_ratings_df = my_ratings_model.transform(my_not_rated_df)
predicted_ratings_df = raw_predicted_ratings_df.filter(
    raw_predicted_ratings_df['prediction'] != float('nan'))

In [67]:
# Parks predicted above 4.2 stars, ordered by winter and family visit counts (descending).
is_recommended = predicted_ratings_df['prediction'] > 4.2
recommend_prediction_df = (predicted_ratings_df
                           .filter(is_recommended)
                           .sort('Winter', 'Families', ascending=False))
display(recommend_prediction_df.select('name', 'state', 'Winter', 'Families', 'prediction'))

In [68]:
# Parks predicted below 2.0 stars, ordered by winter and family visit counts (descending).
is_discouraged = predicted_ratings_df['prediction'] < 2.0
not_recommend_prediction_df = (predicted_ratings_df
                               .filter(is_discouraged)
                               .sort('Winter', 'Families', ascending=False))
display(not_recommend_prediction_df.select('park_id', 'name', 'state', 'Winter', 'Families', 'prediction'))