In [1]:
DF = sqlContext.read.format('com.databricks.spark.csv').options(delimiter=',', header='true', inferschema='true').load("dbfs:/mnt/s3/data/train_v2_flatten.csv")
DF = DF.cache()

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

# User defined functions for converting String to valid input type and handling null type, invalid data.

def deleteNull_castToFloat(x):
  
  if x == None:
    return 0.0
  if x == '(not set)':
    return 0
  if x == 'No':
    return 0.0
  else:
    y = len(x)
    if y>6:
      i = int(x[ : y-6])
      f = int(x[y-6:y])
    else:
      i = 0
      f = int(x)
    a = float(i)
    b = float(f)/1000000
    return a+b
udf_deleteNull_castToFloat = F.udf(deleteNull_castToFloat, T.FloatType())

def deleteNull_castToInt(x):
  
  if x == None:
    return 0
  if x == '(not set)':
    return 0
  if x == 'No':
    return 0
  else:
    return int(x)
udf_deleteNull_castToInt = F.udf(deleteNull_castToInt, T.IntegerType())

def process_timeOnSite(x):
  if x == None:
    return 0
  elif x.isdigit():
    return int(x)
  else:
    return 0
udf_process_timeOnSite = F.udf(process_timeOnSite, T.IntegerType())

def process_totalTR(x):
  if x == None:
    return 0.0
  else:
    return (x/1000000.0)
udf_process_totalTR = F.udf(process_totalTR, T.FloatType())

def deleteNull_castToInt_visitNumber(x):
  
  if x == None:
    return 0
  if x == '(not set)':
    return 0
  if x == 'No':
    return 0
  if len(x)>10:
    return 0
  else:
    return int(x)
udf_deleteNull_castToInt_visitNumber = F.udf(deleteNull_castToInt_visitNumber, T.IntegerType())

In [3]:
from pyspark.sql.functions import col
DF_select = DF.select(udf_deleteNull_castToFloat("totals_transactionRevenue").alias("revenue"), udf_process_totalTR("totals_totalTransactionRevenue").alias("totalTR"), "device_operatingSystem", "device_browser",  "geoNetwork_country", "channelGrouping", (col('visitStartTime')).cast("string").alias("startTime_cast"), (col('date')).cast("string").alias("date_cast"), "fullVisitorId", udf_deleteNull_castToInt("totals_hits").alias("hits"), udf_deleteNull_castToInt("totals_pageviews").alias("pageview"), udf_deleteNull_castToInt_visitNumber("visitNumber").alias("visitNumber"), udf_process_timeOnSite("totals_timeOnSite").alias("timeOnSite"), udf_process_timeOnSite("totals_transactions").alias("totals_transactions"))

In [4]:
DF_select = DF_select.cache()

In [5]:
# Extract date information
from pyspark.sql.functions import unix_timestamp, to_date
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import month, dayofweek, hour, year, weekofyear, dayofyear

DF_select = DF_select.withColumn('date_cast', to_date('date_cast', 'yyyyMMdd'))
DF_select = DF_select.withColumn("yearweek", weekofyear("date_cast"))
DF_select = DF_select.withColumn("yearday", dayofyear("date_cast"))

DF_fea = DF_select.cache()

In [6]:
# process geoNetwork_country
import numpy as np
country_5k_10k = DF_select.groupBy("geoNetwork_country").count().filter("count>5000").filter("count<10000").select('geoNetwork_country')
country_0_5k = DF_select.groupBy("geoNetwork_country").count().filter("count<=5000").select('geoNetwork_country')
country_5k_10k = np.array(country_5k_10k.toPandas().geoNetwork_country)
country_0_5k = np.array(country_0_5k.toPandas().geoNetwork_country)

def process_country(x):
  if x == None:
    return 'Other'
  elif x == '(not set)':
    return 'Other'
  elif x in country_0_5k:
    return 'Other_0_5k'
  elif x in country_5k_10k:
    return 'Other_5k_10k'
  else:
    return x
  
  
udf_process_country = F.udf(process_country, T.StringType())
DF_fea = DF_fea.withColumn('process_country', udf_process_country('geoNetwork_country'))
DF_fea = DF_fea.cache()

In [7]:
# Generate combined features with yearday, yearweek and hits

DF_yearday = DF_fea.groupby('yearday').sum('hits').withColumnRenamed('sum(hits)', 'sum_hits_per_day').withColumnRenamed("yearday", 'yearday2')
DF_fea = DF_fea.join(DF_yearday, DF_yearday.yearday2 == DF_fea.yearday).drop('yearday2')

DF_yearweek = DF_fea.groupby('yearweek').sum('hits').withColumnRenamed('sum(hits)', 'sum_hits_per_week').withColumnRenamed("yearweek", 'yearweek2')
DF_fea = DF_fea.join(DF_yearweek, DF_yearweek.yearweek2 == DF_fea.yearweek).drop('yearweek2')

DF_fea = DF_fea.cache()

In [8]:
# Generate combined features with process_country and hits, pageview
DF_fea = DF_fea.join(DF_fea.groupby('process_country').sum('pageview').withColumnRenamed('sum(pageview)', 'sum_pageviews_per_country'), "process_country")

DF_fea = DF_fea.join(DF_fea.groupby('process_country').sum('hits').withColumnRenamed('sum(hits)', 'sum_hits_per_country'), "process_country")

DF_fea = DF_fea.cache()

In [9]:
#Generating features grouped by fullVisitorId

DF_fea = DF_fea.join(DF_fea.groupby('fullVisitorId').sum('pageview').withColumnRenamed('sum(pageview)', 'sum_pageviews_per_fullVisitorId'), "fullVisitorId")

DF_fea = DF_fea.join(DF_fea.groupby('fullVisitorId').sum('hits').withColumnRenamed('sum(hits)', 'sum_hits_per_fullVisitorId'), "fullVisitorId")

DF_fea = DF_fea.join(DF_fea.groupby('fullVisitorId').sum('totals_transactions').withColumnRenamed('sum(totals_transactions)', 'sum_transactions_per_fullVisitorId'), "fullVisitorId")

DF_fea = DF_fea.cache()

In [10]:
# Encode categorical features
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator
all_col = ['revenue', 'totalTR', 'process_country', 'sum_hits_per_day', 'hits', 'pageview', 'timeOnSite', 'visitNumber', 'sum_pageviews_per_country', 'sum_hits_per_country', 'sum_pageviews_per_fullVisitorId', 'sum_hits_per_fullVisitorId', 'sum_transactions_per_fullVisitorId']
categorical_features = ['process_country']
conti_features = ['revenue', 'totalTR', 'sum_hits_per_day', 'hits', 'pageview', 'timeOnSite', 'visitNumber', 'sum_pageviews_per_country', 'sum_hits_per_country', 'sum_pageviews_per_fullVisitorId', 'sum_hits_per_fullVisitorId', 'sum_transactions_per_fullVisitorId']
stages = [] # stages in our Pipeline
# One-hot encode cotegorical feature
for i in categorical_features:
  stringIndexer = StringIndexer(inputCol=i, outputCol=i + "_Index").setHandleInvalid('skip')
  #StringIndexer.handleInvalid('skip')
  encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[i + "_Vec"])
  stages += [stringIndexer, encoder]

In [11]:
# Assemble all inputs together
assemblerInputs = [i + "_Vec" for c in categorical_features]+conti_features
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [12]:
DF_train = DF_fea.select('revenue', 'totalTR', 'process_country', 'sum_hits_per_day', 'hits', 'pageview', 'timeOnSite', 'visitNumber', 'sum_pageviews_per_country', 'sum_hits_per_country', 'sum_pageviews_per_fullVisitorId', 'sum_hits_per_fullVisitorId', 'sum_transactions_per_fullVisitorId')
DF_train = DF_train.cache()
#display(DF_train)

In [13]:
# Apply log function to revenue
from pyspark.sql.functions import log1p as log
DF_train = DF_train.withColumn("revenue", log("revenue"))
DF_train = DF_train.cache()
display(DF_train)

revenue,totalTR,process_country,sum_hits_per_day,hits,pageview,timeOnSite,visitNumber,sum_pageviews_per_country,sum_hits_per_country,sum_pageviews_per_fullVisitorId,sum_hits_per_fullVisitorId,sum_transactions_per_fullVisitorId
0.0,0.0,Singapore,41818,1,1,0,1,44091,52571,1,1,0
0.0,0.0,France,25007,3,3,50,1,91686,106656,3,3,0
0.0,0.0,United States,11192,8,8,139,1,3500525,4267861,8,8,0
0.0,0.0,United States,28033,5,4,184,1,3500525,4267861,4,5,0
0.0,0.0,United States,25620,1,1,0,1,3500525,4267861,2,2,0
0.0,0.0,United States,29005,1,1,0,2,3500525,4267861,2,2,0
0.0,0.0,Taiwan,16201,37,27,613,1,98659,116957,27,37,0
0.0,0.0,Indonesia,18310,1,1,0,1,38101,43647,1,1,0
0.0,0.0,United Kingdom,22389,2,2,12,1,175051,198757,2,2,0
0.0,0.0,United States,24600,1,1,0,1,3500525,4267861,1,1,0


In [14]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(DF_train)
model = pipelineModel.transform(DF_train)

In [15]:
from pyspark.ml.linalg import DenseVector
input_data = model.rdd.map(lambda x: (x["revenue"], DenseVector(x["features"])))

In [16]:
train = sqlContext.createDataFrame(input_data, ["revenue", "features"])		

In [17]:
# Build model
from pyspark.ml.regression import RandomForestRegressor as RF
rf = RF(featuresCol = "features", labelCol = "revenue", maxBins = 32, maxDepth = 6, numTrees = 5)
# Split the whole input dataset to training and testing sets
train_data, test_data = train.randomSplit([.8,.2],seed=1234)
rfModel = rf.fit(train_data)

In [18]:
# Make prediction
predictions = rfModel.transform(test_data)
display(predictions.filter("revenue > 0").select("revenue", "prediction"))

revenue,prediction
2.453587976370721,2.798484037706248
2.995232137062492,2.991685719303602
3.0051874587763363,3.193659514710796
3.4867632793301064,3.499943261335049
3.769767933219301,4.536307653108648
4.077367966468396,4.354633684242906
4.233381559978863,4.227563937227666
4.572647025808955,4.227563937227666
5.525452939131784,4.384447771977756
1.3056264735593353,1.467375782356291


In [19]:
# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="prediction", labelCol="revenue", metricName="rmse")

# Run the evaluator on the DataFrame
rmse = regEval.evaluate(predictions)
rmse

In [20]:
rfModel.featureImportances