In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
import re
# Start (or reuse) the Spark session, then fetch the active SparkContext
# that the RDD-based log parsing below relies on.
spark = (
    SparkSession.builder
    .appName('Analysis')
    .getOrCreate()
)
sc = pyspark.SparkContext.getOrCreate()

Processing & Analytical goals:
------------

In [2]:
# Parse the raw log file into an RDD of typed 16-field tuples.
# `regx` captures 16 groups per line; the bare `\S+` between groups 13 and 14
# consumes one token that is not kept.
regx = r"^(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) \S+ (.*) (\S+) (\S+)$"


def _split_log_line(line):
    """Return the 16 captured fields of one log line, or [] when the line does not match `regx`."""
    matched = re.match(regx, line)
    return list(matched.groups()) if matched else []


rdd = sc.textFile("data/2015_07_22_mktplace_shop_web_log_sample.log.gz").map(_split_log_line)

# Skip lines that did not match the pattern: they come back as an empty list, and
# indexing them below would raise IndexError and abort the whole job.
# Fields 4-6 become floats and 7-10 ints; surrounding double quotes are stripped
# from fields 11 and 13, and field 12 is lower-cased.
rdd_ = rdd.filter(lambda x: len(x) == 16).map(
    lambda x: (x[0], x[1], x[2], x[3],
               float(x[4]), float(x[5]), float(x[6]),
               int(x[7]), int(x[8]), int(x[9]), int(x[10]),
               x[11].strip('"'), x[12].lower(), x[13].strip('"'),
               x[14], x[15]))

In [3]:
# Schema for parsed log
# (column name, Spark type) pairs, listed in the same order as the parsed tuples
_log_fields = [
    ("timestamp", T.StringType()),
    ("elb", T.StringType()),
    ("client_port", T.StringType()),
    ("backend_port", T.StringType()),
    ("request_processing_time", T.DoubleType()),
    ("backend_processing_time", T.DoubleType()),
    ("response_processing_time", T.DoubleType()),
    ("elb_status_code", T.LongType()),
    ("backend_status_code", T.LongType()),
    ("received_bytes", T.LongType()),
    ("sent_bytes", T.LongType()),
    ("request_type", T.StringType()),  # GET, POST etc.
    ("request", T.StringType()),
    ("user_agent", T.StringType()),
    ("ssl_cipher", T.StringType()),
    ("ssl_protocol", T.StringType()),
]

# Every column is declared nullable
Schema = T.StructType([T.StructField(name, dtype, True) for name, dtype in _log_fields])

In [4]:
# df = spark.read.csv("data/2015_07_22_mktplace_shop_web_log_sample.log.gz", Schema, sep = " ", ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True).cache()

# Convert the parsed RDD to a DataFrame, then:
#   * client_ip      - the part of client_port before the ':'
#   * unix_timestamp - epoch seconds from the first 19 characters of the timestamp string
# Rows are repartitioned by client_ip (same partition count as the RDD) and cached.
df = (
    spark.createDataFrame(rdd_, schema=Schema)
    .withColumn("client_ip", F.split(F.col("client_port"), ':')[0])
    .withColumn("unix_timestamp",
                F.unix_timestamp(F.col("timestamp").substr(0, 19).cast('timestamp')))
    .repartition(rdd_.getNumPartitions(), "client_ip")
    .cache()
)

In [5]:
# Summary statistics for every column, printed without truncating cell values
df.describe().show(n=100, truncate=False)

+-------+---------------------------+----------------+------------------+------------+-----------------------+-----------------------+------------------------+------------------+-------------------+------------------+-----------------+------------+-----------------------------------------+---------------------------------------------------------------------------------------------------+-----------------------+------------+------------+--------------------+
|summary|timestamp                  |elb             |client_port       |backend_port|request_processing_time|backend_processing_time|response_processing_time|elb_status_code   |backend_status_code|received_bytes    |sent_bytes       |request_type|request                                  |user_agent                                                                                         |ssl_cipher             |ssl_protocol|client_ip   |unix_timestamp      |
+-------+---------------------------+----------------+------------------+---

1) Sessionize the web log by IP. Sessionize = aggregate all page hits by visitor/IP during a session.
---------
https://en.wikipedia.org/wiki/Session_(web_analytics)

In [6]:
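# Session labelling: a UDF assigns a session_id to each request.
# Each client IP is treated as one individual user.
# A session is a fixed window of at most 15 minutes, measured from the user's first request in that session.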

# One labelled request: which client_port made it, when, and in which session.
# All three fields are declared non-nullable.
_session_record = T.StructType([
    T.StructField(name, dtype, False)
    for name, dtype in (
        ("client_port", T.StringType()),
        ("timestamp", T.StringType()),
        ("session_id", T.IntegerType()),
    )
])

# The UDF returns one such record per request of a client
session_schema = T.ArrayType(_session_record)

# Assigns a session_id to each record of one client
def sessionize(row, timeframe=15*60):
    """Label time-ordered requests with consecutive session ids.

    A session is a fixed window: it opens at some request and contains every
    later request that arrives less than ``timeframe`` seconds after that
    opening request. The first request at or beyond the window opens the next
    session, and ids count up from 1.

    Parameters
    ----------
    row : iterable of tuples
        ``(unix_timestamp, timestamp, client_port)`` records, expected in
        ascending ``unix_timestamp`` order (the caller sorts them).
    timeframe : int, optional
        Session window length in seconds. Defaults to 900 (15 minutes).

    Returns
    -------
    list of tuple
        ``(client_port, timestamp, session_id)`` for every input record, in
        input order. Empty input gives an empty list.
    """
    records = list(row)
    if not records:
        # Nothing to label; the previous implementation raised IndexError here.
        return []

    # The first record always opens session 1.
    window_start = records[0][0]
    current_sess_id = 1
    labelled = [(records[0][2], records[0][1], current_sess_id)]

    for record in records[1:]:
        unix_ts, ts, port = record[0], record[1], record[2]
        if unix_ts - window_start >= timeframe:
            # Outside the current window: this request opens a new session.
            window_start = unix_ts
            current_sess_id += 1
        labelled.append((port, ts, current_sess_id))

    # A concrete list (not a lazy zip) so the result can be iterated more than once.
    return labelled

def _sessionize_columns(client_ports, timestamps, unix_timestamps):
    """Zip one client's collected columns into tuples, sort them by unix timestamp and label them."""
    ordered = sorted(zip(unix_timestamps, timestamps, client_ports))
    return sessionize(ordered)


# Spark UDF taking (client_port list, timestamp list, unix_timestamp list) for one client
udf_s = F.udf(_sessionize_columns, session_schema)

In [7]:
# Collect each IP's requests into lists and label them with the session UDF
_sessions_per_ip = df.groupby('client_ip').agg(
    udf_s(
        F.collect_list("client_port"),
        F.collect_list("timestamp"),
        F.collect_list("unix_timestamp"),
    ).alias('session')
)

# Flatten back to one row per request
_session_rows = _sessions_per_ip.select("client_ip", F.explode("session").alias("session"))

# Pull the struct fields out into plain columns
df_ = _session_rows.select(
    "client_ip",
    F.col("session").client_port.alias("client_port"),
    F.col("session").timestamp.alias("timestamp"),
    F.col("session").session_id.alias("session_id"),
)

In [8]:
# Preview the session labels (first 10 rows, untruncated)
_session_preview = df_.select("client_ip", "client_port", "timestamp", "session_id")
_session_preview.show(n=10, truncate=False)

+------------+------------------+---------------------------+----------+
|client_ip   |client_port       |timestamp                  |session_id|
+------------+------------------+---------------------------+----------+
|1.186.101.79|1.186.101.79:50614|2015-07-22T10:45:55.881199Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:45:55.885488Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:46:27.839734Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:46:56.591943Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:01.782695Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:06.893987Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:07.616869Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:07.844446Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:18.072370Z|1         |
|1.186.101.79|1.186.101.79:50613|2015-07-22T10:47:28.084661Z|1         |
+------------+------------------+------------------

In [9]:
# Attach session_id to every log row by joining the labelled rows back on
# (client_port, timestamp), then drop the columns duplicated from df_ and cache.
_joined = df.join(df_, on=['client_port', 'timestamp'], how='inner')
for _duplicate in (df_.timestamp, df_.client_ip, df_.client_port):
    _joined = _joined.drop(_duplicate)
df_1 = _joined.cache()

2) Determine the average session time
----------

In [14]:
# # Determine the average session time
# def get_avg_session_time(df):
#     df_ = df.groupby("client_ip", "session_id").agg(((F.unix_timestamp(F.max("timestamp")) - F.unix_timestamp(F.min("timestamp")) + 1)/60).alias("session_length"))
#     avg_session_time = df_.agg(F.avg("session_length").alias("avg_session_time")).collect()[0]["avg_session_time"]
#     return avg_session_time

# # Find the most engaged users, ie the IPs with the longest session times
# def get_most_engaged_user(df):
#     df_ = df.groupby("client_ip", "session_id").agg(F.min("timestamp").alias("from_timestamp"), F.max("timestamp").alias("to_timestamp"), ((F.unix_timestamp(F.max("timestamp")) - F.unix_timestamp(F.min("timestamp")) + 1)/60).alias("session_length"))
#     return df_.orderBy("session_length", ascending=False)

# # Determine unique URL visits per session. To clarify, count a hit to a unique URL only once per session.
# def get_unique_url_request(df):
#     return df.groupby("client_ip", "session_id").agg(F.countDistinct("request").alias("unique_url_request"))

def analyze(df):
    """Summarise requests into one row per (client_ip, session_id).

    Output columns: from_timestamp / to_timestamp (min and max of "timestamp"),
    session_length (max minus min of "unix_timestamp", plus 1) and
    unique_url_request (distinct count of "request").
    """
    per_session = df.groupby("client_ip", "session_id")
    first_hit = F.min("timestamp").alias("from_timestamp")
    last_hit = F.max("timestamp").alias("to_timestamp")
    length = (F.max("unix_timestamp") - F.min("unix_timestamp") + 1).alias("session_length")
    distinct_urls = F.countDistinct("request").alias("unique_url_request")
    return per_session.agg(first_hit, last_hit, length, distinct_urls)
    

In [15]:
# Only the columns analyze() reads are passed in; the per-session summary is cached
_session_input = df_1.select("client_ip", "session_id", "timestamp", "unix_timestamp", "request")
df_2 = analyze(_session_input).cache()

In [16]:
# First 10 session summaries, untruncated
df_2.show(n=10, truncate=False)

+-------------+----------+---------------------------+---------------------------+--------------+------------------+
|client_ip    |session_id|from_timestamp             |to_timestamp               |session_length|unique_url_request|
+-------------+----------+---------------------------+---------------------------+--------------+------------------+
|1.186.37.28  |1         |2015-07-22T10:31:24.915677Z|2015-07-22T10:34:51.085597Z|208           |52                |
|1.187.203.5  |1         |2015-07-22T16:41:16.877706Z|2015-07-22T16:42:40.378450Z|85            |4                 |
|1.187.205.138|1         |2015-07-22T17:42:16.189614Z|2015-07-22T17:42:29.109317Z|14            |3                 |
|1.187.247.221|1         |2015-07-22T10:48:03.596780Z|2015-07-22T10:49:50.318141Z|108           |4                 |
|1.187.249.242|1         |2015-07-22T16:22:01.577228Z|2015-07-22T16:22:28.014105Z|28            |10                |
|1.22.120.162 |1         |2015-07-22T16:44:24.902959Z|2015-07-22

In [18]:
# df_2 holds one row per session, so its row count is the number of sessions
_session_count = df_2.count()
print("Number of Distinct Sessions = {}".format(_session_count))

# Mean of session_length over all sessions
_avg_row = df_2.agg(F.avg("session_length").alias("avg_session_time")).collect()[0]
print("Average Session Time (Seconds) = {}".format(_avg_row["avg_session_time"]))


Number of Distinct Sessions = 113382
Average Session Time (Seconds) = 90.36611631475895


3) Determine unique URL visits per session. To clarify, count a hit to a unique URL only once per session.
----------------

In [19]:
# unique_url_request in df_2 is the number of distinct URLs a client requested within one session
print("20 Clients/IP sessions with most number of unique URL requests")
_by_unique_urls = (
    df_2
    .select("client_ip", "from_timestamp", "to_timestamp", "session_id", "unique_url_request")
    .sort("unique_url_request", ascending=False)
)
_by_unique_urls.show(n=20, truncate=False)

20 Clients/IP sessions with most number of unique URL requests
+-------------+---------------------------+---------------------------+----------+------------------+
|client_ip    |from_timestamp             |to_timestamp               |session_id|unique_url_request|
+-------------+---------------------------+---------------------------+----------+------------------+
|119.81.61.166|2015-07-22T16:10:28.394091Z|2015-07-22T16:25:05.211263Z|8         |8016              |
|52.74.219.71 |2015-07-22T16:10:28.056562Z|2015-07-22T16:25:05.202249Z|8         |5478              |
|52.74.219.71 |2015-07-22T10:30:28.220275Z|2015-07-22T10:39:47.427020Z|5         |5057              |
|106.186.23.95|2015-07-22T21:05:28.048908Z|2015-07-22T21:10:27.952944Z|12        |4656              |
|119.81.61.166|2015-07-22T17:40:28.042128Z|2015-07-22T17:45:28.037648Z|10        |3928              |
|119.81.61.166|2015-07-22T18:00:28.603200Z|2015-07-22T18:05:27.768298Z|11        |3637              |
|119.81.61.166|2015

4) Find the most engaged users, ie the IPs with the longest session times.
------------

Session lengths are summed per IP to get each user's total session time.
Each IP represents a user.

In [23]:
print("20 Most Engaged Users (IP with largest total session times)")

# Sum each IP's session lengths (seconds), convert to minutes, and list the largest first
_total_col = "total_session_length (Minutes)"
_engagement = (
    df_2.groupby("client_ip")
    .agg((F.sum("session_length") / F.lit(60.0)).alias(_total_col))
    .sort(_total_col, ascending=False)
)
_engagement.show(n=20, truncate=False)

20 Most Engaged Users (IP with largest total session times)
+---------------+------------------------------+
|client_ip      |total_session_length (Minutes)|
+---------------+------------------------------+
|54.251.151.39  |87.38333333333334             |
|121.58.175.128 |82.63333333333334             |
|220.226.206.7  |81.66666666666667             |
|180.179.213.94 |73.3                          |
|52.74.219.71   |71.55                         |
|119.81.61.166  |71.45                         |
|54.252.254.204 |71.38333333333334             |
|122.252.231.14 |69.65                         |
|180.179.213.71 |69.23333333333333             |
|207.46.13.22   |67.7                          |
|106.186.23.95  |66.1                          |
|176.34.159.236 |65.66666666666667             |
|54.255.254.236 |64.96666666666667             |
|168.235.197.212|63.86666666666667             |
|54.232.40.76   |63.56666666666667             |
|54.243.31.236  |63.1                          |
|116.50.5

Additional questions for Machine Learning Engineer (MLE) candidates:
---------------

In [None]:
# Per-request time features. Every column is read from the frame being transformed
# (df_1) rather than from the separate `df` object, and the chain is wrapped in
# parentheses instead of ending in a dangling line continuation.
#   hour / minute - characters 12-13 and 15-16 of the timestamp string, as integers
#   timestamp     - first 16 characters of the string (minute resolution) cast to a timestamp
#   client_ip     - the part of client_port before the ':'
df_t_1 = (
    df_1
    .withColumn("hour", F.col("timestamp").substr(12, 2).cast('integer'))
    .withColumn("minute", F.col("timestamp").substr(15, 2).cast('integer'))
    .withColumn("timestamp", F.col("timestamp").substr(0, 16).cast('timestamp'))
    .withColumn("client_ip", F.split(F.col("client_port"), ':')[0])
)

# One row per minute. "load" is that minute's request count divided by 60,
# i.e. requests per second averaged over the minute.
df_t_2 = (
    df_t_1.groupby("timestamp")
    .agg(
        (F.count("timestamp") / 60).alias("load"),
        F.avg("received_bytes").alias("received_bytes"),
        F.avg("sent_bytes").alias("sent_bytes"),
        F.countDistinct("request").alias("unique_request_count"),
        F.countDistinct("client_ip").alias("unique_ip_count"),
        # hour and minute are constant within a minute bucket, so first() just carries them through
        F.first("hour").alias("hour"),
        F.first("minute").alias("minute"),
    )
    .sort(["hour", "minute"])
    .cache()
)

In [None]:
# A single unpartitioned window ordered by (hour, minute): lag() then yields the previous minute's value
w = Window.partitionBy().orderBy(["hour", "minute"])

# Previous-minute versions of each per-minute aggregate, named prev_<column>
_lag_sources = ("sent_bytes", "received_bytes", "unique_request_count", "unique_ip_count")
_lagged = [F.lag(name).over(w).alias("prev_" + name) for name in _lag_sources]

# The first minute has no predecessor; its null lag row is filtered out
df_t_3 = (
    df_t_2.select("hour", "minute", *_lagged, "load")
    .where(F.col("prev_sent_bytes").isNotNull())
    .cache()
)

In [None]:
# 80/20 random split with a fixed seed; the regression label is the per-minute load
_splits = df_t_3.randomSplit([0.8, 0.2], seed=2019)
train, test = _splits[0], _splits[1]
training = train.withColumn("label", train["load"])

1) Predict the expected load (requests/second) in the next minute
----------------

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.feature import VectorAssembler

# Pipeline: assemble the time and previous-minute features into a vector,
# expand it polynomially, then fit a linear regression on the "label" column.
assembler = VectorAssembler(inputCols=["hour", "minute", "prev_sent_bytes", "prev_received_bytes",
                                       "prev_unique_request_count", "prev_unique_ip_count"], outputCol="vectorized")
polyExpansion = PolynomialExpansion(inputCol="vectorized", outputCol="features")
lr = LinearRegression(maxIter=100, featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, polyExpansion, lr])

# Grid searched by cross-validation: polynomial degree x regParam
paramGrid = (
    ParamGridBuilder()
    .addGrid(polyExpansion.degree, [1, 2, 3])
    .addGrid(lr.regParam, [0.1, 0.01, 0.001])
    .build()
)

# Columns and metric are spelled out (these are RegressionEvaluator's defaults) so
# the selection criterion is visible. The fold split is seeded, using the same
# seed value as the train/test split, so the chosen model is reproducible.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=5, seed=2019)

# Run 5-fold cross-validation and keep the best parameter combination
cvModel = crossval.fit(training)

# Score the held-out test rows with the best pipeline found
prediction = cvModel.transform(test)


In [None]:
# Stage 1 of the best pipeline is the polynomial expansion, stage 2 the fitted regression
_best_stages = cvModel.bestModel.stages
_best_degree = _best_stages[1]._java_obj.getDegree()
_best_reg_param = _best_stages[2]._java_obj.getRegParam()
print("Best model degree of polynomail = {}".format(_best_degree))
print("Best model regParam value = {}".format(_best_reg_param))
# print("Best model fitIntercept value = {}".format(cvModel.bestModel.stages[2]._java_obj.getFitIntercept()))
# print("Best model elasticNetParam value = {}".format(cvModel.bestModel.stages[2]._java_obj.getElasticNetParam()))

In [32]:
# Actual vs. predicted load for the first 20 test rows
prediction.select("load", "prediction").show(n=20, truncate=False)

+--------------------+-------------------+
|load                |prediction         |
+--------------------+-------------------+
|78.0                |79.862267121874    |
|25.55               |192.05373819449298 |
|0.06666666666666667 |36.33467860900592  |
|224.55              |179.4891507993166  |
|390.18333333333334  |360.7349784057594  |
|207.35              |120.52586536784167 |
|410.7               |305.62444210653194 |
|348.5833333333333   |326.5574258886327  |
|192.83333333333334  |80.8198241981577   |
|405.35              |208.47472550670767 |
|381.4166666666667   |322.43669778166833 |
|0.5                 |-19.935719146360526|
|0.5166666666666667  |8.099124287187664  |
|0.85                |70.7646618735991   |
|0.03333333333333333 |68.31117260945405  |
|350.8666666666667   |315.54429520967676 |
|33.0                |330.0107539521547  |
|0.016666666666666666|70.35487773400456  |
|281.9               |336.5753369735944  |
|0.03333333333333333 |267.7494050531194  |
+----------

In [33]:
# Summary attached to the fitted regression stage of the best pipeline
# NOTE(review): this is the model's own fit summary, presumably computed on the
# training data rather than on `test` - confirm before quoting it as test error.
_best_regression = cvModel.bestModel.stages[2]
trainingSummary = _best_regression.summary
_rmse = trainingSummary.rootMeanSquaredError
_r2 = trainingSummary.r2
print("RMSE: %f" % _rmse)
print("r2: %f" % _r2)

RMSE: 99.107936
r2: 0.582141


In [35]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.feature import VectorAssembler

# Same feature pipeline as the linear model, with a gradient-boosted tree regressor as the final stage
assembler = VectorAssembler(inputCols=["hour", "minute", "prev_sent_bytes", "prev_received_bytes",
                                       "prev_unique_request_count", "prev_unique_ip_count"], outputCol="vectorized")
polyExpansion = PolynomialExpansion(inputCol="vectorized", outputCol="features")
# `lr` is kept as an alias of `gbt` because both names were bound by the original cell
lr = gbt = GBTRegressor(maxIter=100, seed=42, featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, polyExpansion, gbt])

# Only the polynomial degree is searched for this model
paramGrid = ParamGridBuilder().addGrid(polyExpansion.degree, [1, 2, 3]).build()

# Columns and metric are spelled out (these are RegressionEvaluator's defaults).
# The fold split is seeded so the chosen model is reproducible.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=5, seed=2019)

# Run 5-fold cross-validation and keep the best parameter combination.
# NOTE(review): 3 grid points x 5 folds x up to 100 boosting iterations is expensive;
# the recorded run of this cell ended in KeyboardInterrupt.
cvModel = crossval.fit(training)

# Score the held-out test rows with the best GBT pipeline found
prediction = cvModel.transform(test)


KeyboardInterrupt: 

2) Predict the session length for a given IP
------------

In [307]:
# The `sessionize` defined in this notebook labels a sorted list of
# (unix_timestamp, timestamp, client_port) tuples, so it cannot be called on a
# DataFrame; doing so fails on a fresh Restart & Run All.
# df_1 (built in the join cell) already carries client_ip and session_id for
# every request, so it is reused here. Only the timestamp string is converted to
# a real timestamp (second resolution) for the unix_timestamp() arithmetic below.
df_2 = df_1.withColumn("timestamp", F.col("timestamp").substr(0, 19).cast('timestamp'))

In [309]:
# Per-session targets (session_length in seconds, unique_url) and per-session mean traffic features
_per_session = df_2.groupby("client_ip", "session_id")
df_3 = _per_session.agg(
    (F.unix_timestamp(F.max("timestamp")) - F.unix_timestamp(F.min("timestamp"))).alias("session_length"),
    F.countDistinct("request").alias("unique_url"),
    F.avg("sent_bytes").alias("sent_bytes"),
    F.avg("received_bytes").alias("received_bytes"),
    F.avg("response_processing_time").alias("response_processing_time"),
)

# Means over all sessions, used later as fill-in values where a session has no predecessor
_feature_names = ("sent_bytes", "received_bytes", "response_processing_time")
stats = df_3.agg(*[F.mean(name).alias(name) for name in _feature_names]).collect()[0]
avg_sent_bytes = stats["sent_bytes"]
avg_received_bytes = stats["received_bytes"]
avg_response_processing_time = stats["response_processing_time"]


In [315]:
# Within each IP, order sessions by id so lag() returns the previous session's value
w = Window.partitionBy("client_ip").orderBy(["session_id"])


def _prev_or_average(column, average):
    """Previous session's `column` for the same IP, or `average` when there is none."""
    return F.coalesce(F.lag(column).over(w), F.lit(average)).alias("prev_" + column)


df_4 = df_3.select(
    "client_ip", "session_id", "session_length", "unique_url",
    _prev_or_average("sent_bytes", avg_sent_bytes),
    _prev_or_average("received_bytes", avg_received_bytes),
    _prev_or_average("response_processing_time", avg_response_processing_time),
)

# 80/20 random split with a fixed seed; the label here is the session length
_splits = df_4.randomSplit([0.8, 0.2], seed=2019)
train, test = _splits[0], _splits[1]
training = train.withColumn("label", F.col("session_length"))

In [317]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.feature import VectorAssembler

# Pipeline: assemble the previous-session features into a vector, expand it
# polynomially, then fit a linear regression on the "label" column.
assembler = VectorAssembler(inputCols=["prev_sent_bytes", "prev_received_bytes",
                                       "prev_response_processing_time"], outputCol="vectorized")
polyExpansion = PolynomialExpansion(inputCol="vectorized", outputCol="features")
lr = LinearRegression(maxIter=100, featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, polyExpansion, lr])

# Grid searched by cross-validation: polynomial degree x regParam
paramGrid = (
    ParamGridBuilder()
    .addGrid(polyExpansion.degree, [1, 2, 3])
    .addGrid(lr.regParam, [0.1, 0.01, 0.001])
    .build()
)

# Columns and metric are spelled out (these are RegressionEvaluator's defaults).
# The fold split is seeded, using the same seed value as the train/test split,
# so the chosen model is reproducible.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=5, seed=2019)

# Run 5-fold cross-validation and keep the best parameter combination
cvModel = crossval.fit(training)

# Score the held-out test sessions with the best pipeline found
prediction = cvModel.transform(test)

In [318]:
# Stage 1 of the best pipeline is the polynomial expansion, stage 2 the fitted regression
_winning_stages = cvModel.bestModel.stages
_winning_degree = _winning_stages[1]._java_obj.getDegree()
_winning_reg_param = _winning_stages[2]._java_obj.getRegParam()
print("Best model degree of polynomail = {}".format(_winning_degree))
print("Best model regParam value = {}".format(_winning_reg_param))
# print("Best model fitIntercept value = {}".format(cvModel.bestModel.stages[2]._java_obj.getFitIntercept()))
# print("Best model elasticNetParam value = {}".format(cvModel.bestModel.stages[2]._java_obj.getElasticNetParam()))

Best model degree of polynomail = 1
Best model regParam value = 0.1
Best model fitIntercept value = True
Best model elasticNetParam value = 0.0


In [320]:
# Actual vs. predicted session length for the first 20 test sessions
prediction.select("session_length", "prediction").show(n=20, truncate=False)

+--------------+-----------------+
|session_length|prediction       |
+--------------+-----------------+
|13            |79.78774792224259|
|0             |79.78774792224259|
|41            |79.78774792224259|
|2             |79.78774792224259|
|17            |79.78774792224259|
|625           |79.78774792224259|
|0             |79.78774792224259|
|4             |76.94537577309636|
|0             |79.78774792224259|
|47            |79.78774792224259|
|109           |79.78774792224259|
|600           |77.20623792486131|
|55            |82.76495756548657|
|75            |76.85845669144123|
|238           |78.37433733301162|
|28            |81.36845204146012|
|298           |78.29581346710376|
|3             |79.78774792224259|
|0             |79.78774792224259|
|80            |79.78774792224259|
+--------------+-----------------+
only showing top 20 rows



In [321]:
# Summary attached to the fitted regression stage of the best pipeline
# NOTE(review): presumably computed on the training data, not on `test` - confirm.
_fitted_regression = cvModel.bestModel.stages[2]
trainingSummary = _fitted_regression.summary
_train_rmse = trainingSummary.rootMeanSquaredError
_train_r2 = trainingSummary.r2
print("RMSE: %f" % _train_rmse)
print("r2: %f" % _train_r2)

RMSE: 148.044368
r2: 0.000187


3) Predict the number of unique URL visits by a given IP
------------------

In [322]:
# Reuse the train split from the session-length section; the label is now the per-session unique URL count
training = train.withColumn("label", train["unique_url"])

In [323]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.feature import VectorAssembler

# Pipeline: assemble the previous-session features into a vector, expand it
# polynomially, then fit a linear regression on the "label" column.
assembler = VectorAssembler(inputCols=["prev_sent_bytes", "prev_received_bytes",
                                       "prev_response_processing_time"], outputCol="vectorized")
polyExpansion = PolynomialExpansion(inputCol="vectorized", outputCol="features")
lr = LinearRegression(maxIter=100, featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, polyExpansion, lr])

# Grid searched by cross-validation: polynomial degree x regParam
paramGrid = (
    ParamGridBuilder()
    .addGrid(polyExpansion.degree, [1, 2, 3])
    .addGrid(lr.regParam, [0.1, 0.01, 0.001])
    .build()
)

# Columns and metric are spelled out (these are RegressionEvaluator's defaults).
# The fold split is seeded so the chosen model is reproducible.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=5, seed=2019)

# Run 5-fold cross-validation and keep the best parameter combination
cvModel = crossval.fit(training)

# Score the held-out test sessions with the best pipeline found
prediction = cvModel.transform(test)

KeyboardInterrupt: 

In [None]:
# Stage 1 of the best pipeline is the polynomial expansion, stage 2 the fitted regression
_chosen_stages = cvModel.bestModel.stages
_chosen_degree = _chosen_stages[1]._java_obj.getDegree()
_chosen_reg_param = _chosen_stages[2]._java_obj.getRegParam()
print("Best model degree of polynomail = {}".format(_chosen_degree))
print("Best model regParam value = {}".format(_chosen_reg_param))
# print("Best model fitIntercept value = {}".format(cvModel.bestModel.stages[2]._java_obj.getFitIntercept()))
# print("Best model elasticNetParam value = {}".format(cvModel.bestModel.stages[2]._java_obj.getElasticNetParam()))

In [None]:
# Actual vs. predicted unique-URL count for the first 20 test sessions
prediction.select("unique_url", "prediction").show(n=20, truncate=False)

In [None]:
# Summary attached to the fitted regression stage of the best pipeline
# NOTE(review): presumably computed on the training data, not on `test` - confirm.
_url_regression = cvModel.bestModel.stages[2]
trainingSummary = _url_regression.summary
_url_rmse = trainingSummary.rootMeanSquaredError
_url_r2 = trainingSummary.r2
print("RMSE: %f" % _url_rmse)
print("r2: %f" % _url_r2)

In [148]:
from pyspark.sql import Row
# Small scratch frame: (date, get_avg, get_first) records
_scratch_records = [
    ('2016-01-01', 5, 1),
    ('2016-01-01', 5, 2),
    ('2016-01-02', 10, 3),
    ('2016-01-02', 20, 3),
    ('2016-01-10', 30, 3),
    ('2016-01-10', 10, 3),
    ('2016-01-10', 20, 3),
    ('2016-01-12', 30, 3),
    ('2016-01-12', 8, 4),
]
t = spark.createDataFrame(
    [Row(date=day, get_avg=avg, get_first=first) for day, avg, first in _scratch_records]
)

In [154]:
# Seconds elapsed between each row's date and 2016-01-01, both parsed as yyyy-MM-dd
_seconds_since_start = (
    F.unix_timestamp("date", 'yyyy-MM-dd')
    - F.unix_timestamp(F.lit("2016-01-01"), 'yyyy-MM-dd')
)
t.select(_seconds_since_start).show()

+---------------------------------------------------------------------------+
|(unix_timestamp(date, yyyy-MM-dd) - unix_timestamp(2016-01-01, yyyy-MM-dd))|
+---------------------------------------------------------------------------+
|                                                                          0|
|                                                                          0|
|                                                                      86400|
|                                                                      86400|
|                                                                     777600|
|                                                                     777600|
|                                                                     777600|
|                                                                     950400|
|                                                                     950400|
+---------------------------------------------------------------