In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
import re
# Start (or attach to) the Spark session used by every later cell, and expose
# its underlying SparkContext for the RDD-based log parsing.
spark = (
    SparkSession.builder
    .appName('Analysis')
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Pattern for one load-balancer access-log line. It has 15 capture groups which
# are mapped, in order, onto the 15 columns of the DataFrame schema. Of the
# quoted request only the middle token is captured (presumably the URL); the
# tokens on either side of it are matched but discarded.
regx = r"^(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) \S+ (\S+) \S+ (.*) (\S+) (\S+)$"


def _split_log_line(line):
    """Return the 15 captured fields of `line` as a list, or None when the line does not match `regx`."""
    match = re.match(regx, line)
    return list(match.groups()) if match else None


def _to_typed_row(x):
    """Convert the 15 string fields to a tuple with numeric columns cast to float/int."""
    return (x[0], x[1], x[2], x[3],
            float(x[4]), float(x[5]), float(x[6]),
            int(x[7]), int(x[8]), int(x[9]), int(x[10]),
            x[11], x[12], x[13], x[14])


# Lines that do not match the pattern (blank, truncated or otherwise malformed)
# are dropped here. Previously such a line produced an empty field list and the
# typed conversion below failed with IndexError, aborting the whole job.
rdd = (
    sc.textFile("data/2015_07_22_mktplace_shop_web_log_sample.log.gz")
    .map(_split_log_line)
    .filter(lambda fields: fields is not None)
)
rdd_ = rdd.map(_to_typed_row)

In [3]:
# Column name -> Spark type for each parsed log field, in the same order as the
# tuples produced by the parsing step. Every column is declared nullable.
_SCHEMA_COLUMNS = [
    ("timestamp", T.StringType),
    ("elb", T.StringType),
    ("client_port", T.StringType),
    ("backend_port", T.StringType),
    ("request_processing_time", T.DoubleType),
    ("backend_processing_time", T.DoubleType),
    ("response_processing_time", T.DoubleType),
    ("elb_status_code", T.LongType),
    ("backend_status_code", T.LongType),
    ("received_bytes", T.LongType),
    ("sent_bytes", T.LongType),
    ("request", T.StringType),
    ("user_agent", T.StringType),
    ("ssl_cipher", T.StringType),
    ("ssl_protocol", T.StringType),
]

Schema = T.StructType(
    [T.StructField(column_name, column_type(), True) for column_name, column_type in _SCHEMA_COLUMNS]
)

In [4]:
# Build the typed DataFrame from the regex-parsed RDD using the explicit schema.
# Alternative kept for reference (reads the file directly as space-separated CSV):
# df = spark.read.csv("data/2015_07_22_mktplace_shop_web_log_sample.log.gz", Schema, sep = " ", ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True).cache()
df = spark.createDataFrame(data=rdd_, schema=Schema)

In [5]:
def sessionize(df, time_frame=15 * 60):
    """Append a per-client ``session_id`` column based on fixed time windows.

    For every ``client_ip`` the rows are ordered by ``timestamp`` and the gaps
    between consecutive requests are accumulated, which yields the number of
    seconds elapsed since that client's first request. ``session_id`` is that
    elapsed time divided by ``time_frame`` and cast to integer, so sessions are
    consecutive windows of ``time_frame`` seconds anchored on the first request.

    NOTE(review): this is window-based sessionization, not inactivity-based; a
    long pause between two requests does not by itself start a new session.

    Args:
        df: DataFrame with at least ``client_ip`` and ``timestamp`` columns.
        time_frame: Window length in seconds. Defaults to 15 minutes, the value
            that used to be hard-coded.

    Returns:
        ``df`` with every original column plus an integer ``session_id``.

    Raises:
        ValueError: if ``time_frame`` is not strictly positive.
    """
    if time_frame <= 0:
        raise ValueError("time_frame must be a positive number of seconds, got {!r}".format(time_frame))

    w = Window.partitionBy("client_ip").orderBy("timestamp")
    current_ts = F.unix_timestamp(F.col("timestamp"))
    previous_ts = F.unix_timestamp(F.lag(F.col("timestamp"), 1).over(w))
    # The first row of each client has no predecessor; treat its gap as 0 seconds.
    diff = F.coalesce(current_ts - previous_ts, F.lit(0))
    # Running total of the gaps == seconds since the client's first request.
    cum_diff = F.sum(diff).over(w)
    subgroup = (cum_diff / time_frame).cast('integer').alias("session_id")
    return df.select("*", subgroup)

In [6]:
# Determine the average session time
def get_avg_session_time(df):
    """Return the mean session length, in minutes, over all (client_ip, session_id) groups.

    A session's length is (last timestamp - first timestamp + 1 second) / 60.
    `df` must carry `client_ip`, `session_id` and `timestamp` columns.
    """
    first_hit = F.unix_timestamp(F.min("timestamp"))
    last_hit = F.unix_timestamp(F.max("timestamp"))
    length_minutes = ((last_hit - first_hit + 1) / 60).alias("session_length")

    per_session = df.groupby("client_ip", "session_id").agg(length_minutes)
    summary = per_session.agg(F.avg("session_length").alias("avg_session_time"))
    return summary.collect()[0]["avg_session_time"]

# Find the most engaged users, ie the IPs with the longest session times
def get_most_engaged_user(df):
    """Return one row per (client_ip, session_id), longest sessions first.

    Each row carries the session's first and last timestamp and its length in
    minutes, computed as (last - first + 1 second) / 60. The full sorted
    DataFrame is returned; callers decide how many rows to look at.
    """
    session_start = F.min("timestamp")
    session_end = F.max("timestamp")
    length_minutes = (F.unix_timestamp(session_end) - F.unix_timestamp(session_start) + 1) / 60

    per_session = df.groupby("client_ip", "session_id").agg(
        session_start.alias("from_timestamp"),
        session_end.alias("to_timestamp"),
        length_minutes.alias("session_length"),
    )
    return per_session.orderBy(F.desc("session_length"))

# Determine unique URL visits per session. To clarify, count a hit to a unique URL only once per session.
def get_unique_url_request(df):
    """Return, for every (client_ip, session_id), the number of distinct `request` values."""
    distinct_urls = F.countDistinct("request").alias("unique_url_request")
    sessions = df.groupby("client_ip", "session_id")
    return sessions.agg(distinct_urls)

def analyze(df):
    """Aggregate a sessionized DataFrame into one summary row per (client_ip, session_id).

    Output columns: the two grouping keys, `from_timestamp` and `to_timestamp`
    (first and last request of the session), `session_length` in minutes
    ((last - first + 1 second) / 60) and `unique_url_request` (number of
    distinct `request` values in the session).
    """
    session_start = F.min("timestamp")
    session_end = F.max("timestamp")
    length_minutes = (F.unix_timestamp(session_end) - F.unix_timestamp(session_start) + 1) / 60

    return df.groupby("client_ip", "session_id").agg(
        session_start.alias("from_timestamp"),
        session_end.alias("to_timestamp"),
        length_minutes.alias("session_length"),
        F.countDistinct("request").alias("unique_url_request"),
    )
    

In [7]:
# Keep only what the session analysis needs:
#   client_ip - the part of "client_port" before the ':'
#   timestamp - the first 19 characters of the raw string cast to a timestamp
#               (presumably the date and time down to the second)
#   request   - lower-cased so URL comparison is case-insensitive
df_1 = df.select(
    F.split(F.col("client_port"), ':')[0].alias("client_ip"),
    F.col("timestamp").substr(0, 19).cast('timestamp').alias("timestamp"),
    F.lower(F.col("request")).alias("request"),
)

# Tag every request with its session, then build the cached per-session summary
# that the reporting cells below read from.
df_2 = sessionize(df_1)
df_3 = analyze(df_2).cache()

In [8]:
# df_3 has exactly one row per (client_ip, session_id), so its row count is the
# number of distinct sessions. Counting the cached summary avoids re-running the
# window computation and shuffle behind df_2 just to count groups.
session_count = df_3.count()
avg_session_minutes = df_3.agg(F.avg("session_length").alias("avg_session_time")).first()["avg_session_time"]

print("Number of Distinct Sessions = {}".format(session_count))
print("Average Session Time (Minutes) = {}".format(avg_session_minutes))


Number of Distinct Sessions = 115936
Average Session Time (Minutes) = 1.3433538043978264


In [9]:
# NOTE(review): every row of df_3 is one (client_ip, session_id) pair, so this
# counts sessions that touched exactly one distinct URL, not clients and not
# single requests as the message suggests. Message left unchanged.
single_url_sessions = df_3.filter(F.col("unique_url_request") == 1).count()
print("Number of clients which made a single request to only 1 unique URL = {}".format(single_url_sessions))

print()
print("20 Clients with most unique URL visits per session")
url_report_columns = ["client_ip", "from_timestamp", "to_timestamp", "unique_url_request"]
df_3.select(*url_report_columns).orderBy(F.desc("unique_url_request")).show(n=20, truncate=False)

Number of clients which made a single request to only 1 unique URL = 25923

20 Clients with most unique URL visits per session
+-------------+-------------------+-------------------+------------------+
|client_ip    |from_timestamp     |to_timestamp       |unique_url_request|
+-------------+-------------------+-------------------+------------------+
|119.81.61.166|2015-07-22 16:10:28|2015-07-22 16:25:05|8016              |
|52.74.219.71 |2015-07-22 16:10:28|2015-07-22 16:25:05|5478              |
|52.74.219.71 |2015-07-22 10:30:28|2015-07-22 10:39:47|5057              |
|106.186.23.95|2015-07-22 21:05:28|2015-07-22 21:10:13|4320              |
|119.81.61.166|2015-07-22 17:40:28|2015-07-22 17:45:28|3928              |
|119.81.61.166|2015-07-22 18:00:28|2015-07-22 18:05:27|3637              |
|119.81.61.166|2015-07-22 02:40:06|2015-07-22 02:45:03|3334              |
|52.74.219.71 |2015-07-22 18:00:28|2015-07-22 18:05:27|2907              |
|119.81.61.166|2015-07-22 21:05:28|2015-07-22 21

In [10]:
# Longest sessions first; session_length is in minutes.
print("Printing 20 Most Engaged Users with there session start and end time, session length in minutes")
engagement_columns = ["client_ip", "from_timestamp", "to_timestamp", "session_length"]
df_3.select(*engagement_columns).orderBy(F.desc("session_length")).show(n=20, truncate=False)

Printing 20 Most Engaged Users with there session start and end time, session length in minutes
+---------------+-------------------+-------------------+--------------+
|client_ip      |from_timestamp     |to_timestamp       |session_length|
+---------------+-------------------+-------------------+--------------+
|125.16.58.78   |2015-07-22 10:45:45|2015-07-22 11:00:44|15.0          |
|220.227.161.206|2015-07-22 10:34:11|2015-07-22 10:49:10|15.0          |
|120.56.178.102 |2015-07-22 10:45:54|2015-07-22 11:00:53|15.0          |
|106.76.143.18  |2015-07-22 10:46:34|2015-07-22 11:01:33|15.0          |
|115.249.50.242 |2015-07-22 10:30:56|2015-07-22 10:45:55|15.0          |
|117.218.65.155 |2015-07-22 10:46:47|2015-07-22 11:01:46|15.0          |
|182.156.100.159|2015-07-22 10:46:26|2015-07-22 11:01:25|15.0          |
|165.241.31.254 |2015-07-22 10:48:58|2015-07-22 11:03:57|15.0          |
|117.218.41.153 |2015-07-22 10:31:02|2015-07-22 10:46:01|15.0          |
|112.79.37.154  |2015-07-22 