## Additional questions for Machine Learning Engineer (MLE) candidates
1. Predict the expected load (requests/second) in the next minute <br>
2. Predict the session length for a given IP<br>
3. Predict the number of unique URL visits by a given IP<br>

## Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import col, countDistinct

from pyspark.sql.functions import max,min

In [2]:
# SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("Predictive Analytics") \
    .config("spark.executor.memory", "1gb") \
    .getOrCreate()
sc = spark.sparkContext

## Load Data

In [3]:
# read log file
rdd = sc.textFile('/Users/NK/Projects/pytm/2015_07_22_mktplace_shop_web_log_sample.log')
# split by " "
rdd = rdd.map(lambda line: line.split(" "))

In [4]:
# Keep timestamp, ip and link 
# convert the RDD to DF 
df = rdd.map(lambda line: Row(timestamp=line[0], ipaddress=line[2].split(':')[0], link=line[12])).toDF()
df.show(5)

+---------------+--------------------+--------------------+
|      ipaddress|                link|           timestamp|
+---------------+--------------------+--------------------+
|123.242.248.130|https://paytm.com...|2015-07-22T09:00:...|
|  203.91.211.44|https://paytm.com...|2015-07-22T09:00:...|
|    1.39.32.179|https://paytm.com...|2015-07-22T09:00:...|
| 180.179.213.94|https://paytm.com...|2015-07-22T09:00:...|
| 120.59.192.208|https://paytm.com...|2015-07-22T09:00:...|
+---------------+--------------------+--------------------+
only showing top 5 rows



In [38]:
df = df.withColumn('timestamp', df['timestamp'].cast(TimestampType()))
sorted_df = df.orderBy('timestamp', ascending=True)
sorted_df.show(10)

+---------------+--------------------+--------------------+
|      ipaddress|                link|           timestamp|
+---------------+--------------------+--------------------+
| 106.51.235.133|https://paytm.com...|2015-07-21 22:40:...|
| 115.250.16.146|https://paytm.com...|2015-07-21 22:40:...|
| 106.51.235.133|https://paytm.com...|2015-07-21 22:40:...|
|   52.74.219.71|https://paytm.com...|2015-07-21 22:40:...|
|  27.97.124.172|https://paytm.com...|2015-07-21 22:40:...|
| 106.78.125.179|https://paytm.com...|2015-07-21 22:40:...|
|   112.79.36.98|https://paytm.com...|2015-07-21 22:40:...|
|  119.81.61.166|https://paytm.com...|2015-07-21 22:40:...|
|117.197.179.139|https://paytm.com...|2015-07-21 22:40:...|
|    1.39.14.113|https://paytm.com...|2015-07-21 22:40:...|
+---------------+--------------------+--------------------+
only showing top 10 rows



## 2. Predict the session length for a given IP

In [39]:
# sessionizing data based on 30 min fixed window time
sorted_df_grpby = sorted_df.groupBy('ipaddress').count()
sorted_df_grpby.show(5)

+---------------+-----+
|      ipaddress|count|
+---------------+-----+
|117.202.104.218|    9|
|    61.3.119.16|    2|
|  106.220.18.37|    2|
|180.188.247.224|    1|
|  124.124.34.97|    4|
+---------------+-----+
only showing top 5 rows



In [40]:
df_session = sorted_df.join(sorted_df_grpby,['ipaddress'])
df_session.show(5)

+------------+--------------------+--------------------+-----+
|   ipaddress|                link|           timestamp|count|
+------------+--------------------+--------------------+-----+
|1.186.143.37|https://paytm.com...|2015-07-22 12:14:...|    2|
|1.186.143.37|https://paytm.com...|2015-07-22 12:14:...|    2|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|
+------------+--------------------+--------------------+-----+
only showing top 5 rows



In [41]:
indexed_df_session = df_session.withColumn("session_id", monotonically_increasing_id())
indexed_df_session.show(5)

+------------+--------------------+--------------------+-----+----------+
|   ipaddress|                link|           timestamp|count|session_id|
+------------+--------------------+--------------------+-----+----------+
|1.186.143.37|https://paytm.com...|2015-07-22 12:14:...|    2|         0|
|1.186.143.37|https://paytm.com...|2015-07-22 12:14:...|    2|         1|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|         2|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|         3|
|1.187.164.29|https://paytm.com...|2015-07-21 22:43:...|    9|         4|
+------------+--------------------+--------------------+-----+----------+
only showing top 5 rows



In [49]:
first_hit = indexed_df_session.groupBy("ipaddress").agg(min("timestamp").alias('first_hit'))
first_hit.show(5)

+------------+--------------------+
|   ipaddress|           first_hit|
+------------+--------------------+
|1.186.143.37|2015-07-22 12:14:...|
|1.187.164.29|2015-07-21 22:43:...|
|  1.22.41.76|2015-07-22 12:42:...|
| 1.23.208.26|2015-07-22 13:41:...|
| 1.23.36.184|2015-07-22 01:14:...|
+------------+--------------------+
only showing top 5 rows



In [50]:
df_sessions_timestamp = first_hit.join(indexed_df_session,['ipaddress'])
df_sessions_timestamp.show(5)

+------------+--------------------+--------------------+--------------------+-----+----------+
|   ipaddress|           first_hit|                link|           timestamp|count|session_id|
+------------+--------------------+--------------------+--------------------+-----+----------+
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         0|
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         1|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         2|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         3|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         4|
+------------+--------------------+--------------------+--------------------+-----+----------+
only showing top 5 rows



In [51]:
time_diff = (unix_timestamp(df_sessions_timestamp.timestamp)-unix_timestamp(df_sessions_timestamp.first_hit))
df_sessions_timestamp = df_sessions_timestamp.withColumn("time_diff", time_diff)
df_sessions_timestamp.show(5)

+------------+--------------------+--------------------+--------------------+-----+----------+---------+
|   ipaddress|           first_hit|                link|           timestamp|count|session_id|time_diff|
+------------+--------------------+--------------------+--------------------+-----+----------+---------+
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         0|        0|
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         1|        8|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         2|        0|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         3|       11|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         4|       16|
+------------+--------------------+--------------------+--------------------+-----+----------+---------+
only showing top 5 rows



In [52]:
bla = df_sessions_timestamp.groupBy('ipaddress').agg(max('time_diff').alias('session_len'))
bla.show(5)

+------------+-----------+
|   ipaddress|session_len|
+------------+-----------+
|1.186.143.37|          8|
|1.187.164.29|         69|
|  1.22.41.76|         62|
| 1.23.208.26|       1146|
| 1.23.36.184|         15|
+------------+-----------+
only showing top 5 rows



In [53]:
df_sessions_timestamp = df_sessions_timestamp.join(bla,['ipaddress'])
df_sessions_timestamp.show(5)

+------------+--------------------+--------------------+--------------------+-----+----------+---------+-----------+
|   ipaddress|           first_hit|                link|           timestamp|count|session_id|time_diff|session_len|
+------------+--------------------+--------------------+--------------------+-----+----------+---------+-----------+
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         0|        0|          8|
|1.186.143.37|2015-07-22 12:14:...|https://paytm.com...|2015-07-22 12:14:...|    2|         1|        8|          8|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         2|        0|         69|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         3|       11|         69|
|1.187.164.29|2015-07-21 22:43:...|https://paytm.com...|2015-07-21 22:43:...|    9|         4|       16|         69|
+------------+--------------------+--------------------+--------

## Feature Engineering

In [58]:
Feature_min = df_sessions_timestamp.groupBy('ipaddress').agg(min('time_diff').alias('min_session_len'))
Feature_max = df_sessions_timestamp.groupBy('ipaddress').agg(max('time_diff').alias('max_session_len'))
Feature_mean = df_sessions_timestamp.groupBy('ipaddress').agg(mean('time_diff').alias('mean_session_len'))
Feature_stddev = df_sessions_timestamp.groupBy('ipaddress').agg(stddev('time_diff').alias('std_session_len'))

In [60]:
Features = Feature_min.join(Feature_max,["ipaddress"])
Features = Features.join(Feature_stddev,["ipaddress"])
Features = Features.join(Feature_mean,["ipaddress"])
Features.show(20,False)

+------------+---------------+---------------+------------------+------------------+
|ipaddress   |min_session_len|max_session_len|std_session_len   |mean_session_len  |
+------------+---------------+---------------+------------------+------------------+
|1.186.143.37|0              |8              |5.656854249492381 |4.0               |
|1.187.164.29|0              |69             |23.377933565175898|38.55555555555556 |
|1.22.41.76  |0              |62             |24.57495821824671 |38.25             |
|1.23.208.26 |0              |1146           |466.24314114704856|194.33333333333334|
|1.23.36.184 |0              |15             |6.3508529610858835|6.5               |
|1.38.19.8   |0              |60             |42.42640687119285 |30.0              |
|1.38.20.34  |0              |192            |63.38906143210907 |89.94285714285714 |
|1.39.13.13  |0              |5              |3.5355339059327378|2.5               |
|1.39.32.249 |0              |20564          |10579.375414771264|

In [None]:
# y_count = [val.count for val in df2.select('count').collect()]
# x_timestamp = [val.time_range for val in df2.select('time_range').collect()]
# plt.plot(x_timestamp, y_count)

In [62]:
Features_target = Features.withColumnRenamed('mean_session_len','target')
Features_target.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ipaddress,90544,,,1.186.101.79,99.8.170.3
min_session_len,90544,0.0,0.0,0,0
max_session_len,90544,2663.60503180774,8632.87620561486,0,66621
std_session_len,90544,,,0.0,
target,90544,1379.1280325427504,4951.318122659066,0.0,56282.265625


In [64]:

Features_target_s = Features_target_s.na.drop()
Features_target_s.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
min_session_len,73813,0.0,0.0,0,0
max_session_len,73813,3267.3574302629618,9457.632774981359,0,66621
std_session_len,73813,1272.0292839273989,3740.1483604364134,0.0,46880.472485886916
target,73813,1691.7313830700662,5435.40746838811,0.0,56282.265625


In [65]:
Features_target_s = Features_target_s.drop('ipaddress')
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['max_session_len', 'std_session_len'], outputCol = 'features')
vfeatures_df = vectorAssembler.transform(Features_target_s)
vfeatures_df = vfeatures_df.select(['features', 'target'])
vfeatures_df.show(3)

+--------------------+-----------------+
|            features|           target|
+--------------------+-----------------+
|[8.0,5.6568542494...|              4.0|
|[69.0,23.37793356...|38.55555555555556|
|[62.0,24.57495821...|            38.25|
+--------------------+-----------------+
only showing top 3 rows



In [66]:
# Spliting data to train and test
splits = vfeatures_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

## Linear Regression

In [68]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='target', maxIter=10, regParam=0.3)
lr_model = lr.fit(train_df)
# Print the coefficients and intercept for linear regression model
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.5530189309079979,-0.09326190958689103]
Intercept: -3.25331278625


In [69]:
# Summarize the model over the training set and print out some metrics
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 2301.123132
r2: 0.819973


In [71]:
# # Predictions
# lr_predictions = lr_model.transform(test_df)
# lr_predictions.select("prediction","target","features").show(5)

# from pyspark.ml.evaluation import RegressionEvaluator
# lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
#                  labelCol="target",metricName="r2")
# print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))