# LSML1 SGA

> Large Scale Machine Learning 1 (Spring23)

> Sergey Terskov

# Solution 2. Spark RDD

In [4]:
%%time

#Initialize Spark
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType

sc = SparkContext(appName="LSML1_SGA")
se = SparkSession(sc)

# Read data from HDFS
events_df = se.read.format("csv") \
      .options(delimiter="\t", header=True) \
      .schema("user_id bigint, session_id bigint, event_type string, event_page string, timestamp bigint") \
      .load("hdfs:///data//clickstream.csv")

print("Initial dataset example:")
events_df.show(5)

# Create routes for sessions
def create_route(session_events):
    route = []

    for event_type, event_page in session_events:
        if "error" in event_type:
            break
            
        route.append(event_page)

    return "-".join(route)

create_route_udf = F.udf(create_route, StringType())

# Group events by sessions
routes_df = events_df\
    .filter(events_df.event_type != "event")\
    .sort(events_df.timestamp.asc())\
    .groupBy("user_id", "session_id")\
    .agg(F.collect_list(F.struct("event_type", "event_page")).alias("events"))

# Convert sessions to routes 
routes_df = routes_df.withColumn("route", create_route_udf(routes_df["events"]))

# Get the most frequent routes
top_routes_df = routes_df.groupBy("route").count().orderBy("count", ascending=False).limit(30)

# Show result
top_routes_df.show()

# Show result
print("Top 30 frequent routes:")
top_routes_df.show(30)

# Save to csv
top_routes_df.toPandas().to_csv("solution_3_DF.csv", sep="\t")

#Stop Spark session
se.stop()

2024-10-30 20:34:09,400 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


Initial dataset example:


                                                                                

+-------+----------+------------+----------+----------+
|user_id|session_id|  event_type|event_page| timestamp|
+-------+----------+------------+----------+----------+
|    562|       507|        page|      main|1695584127|
|    562|       507|       event|      main|1695584134|
|    562|       507|       event|      main|1695584144|
|    562|       507|       event|      main|1695584147|
|    562|       507|wNaxLlerrorU|      main|1695584154|
+-------+----------+------------+----------+----------+
only showing top 5 rows



                                                                                

+--------------------+-----+
|               route|count|
+--------------------+-----+
|                main| 8054|
|        main-archive| 1098|
|         main-rabota| 1034|
|       main-internet|  881|
|          main-bonus|  854|
|           main-news|  760|
|        main-tariffs|  670|
|         main-online|  582|
|          main-vklad|  509|
| main-rabota-archive|  168|
| main-archive-rabota|  167|
|  main-bonus-archive|  140|
|   main-rabota-bonus|  138|
|    main-news-rabota|  134|
|   main-bonus-rabota|  131|
|main-internet-rabota|  130|
|    main-rabota-news|  130|
|main-archive-inte...|  129|
|   main-archive-news|  127|
|main-rabota-internet|  123|
+--------------------+-----+
only showing top 20 rows

Top 30 frequent routes:


                                                                                

+--------------------+-----+
|               route|count|
+--------------------+-----+
|                main| 8060|
|        main-archive| 1093|
|         main-rabota| 1036|
|       main-internet|  880|
|          main-bonus|  858|
|           main-news|  760|
|        main-tariffs|  669|
|         main-online|  585|
|          main-vklad|  510|
| main-archive-rabota|  168|
| main-rabota-archive|  166|
|  main-bonus-archive|  141|
|   main-rabota-bonus|  136|
|    main-news-rabota|  133|
|   main-bonus-rabota|  132|
|main-archive-inte...|  129|
|    main-rabota-news|  128|
|main-internet-rabota|  127|
|   main-archive-news|  124|
|main-internet-arc...|  123|
|main-rabota-internet|  122|
|  main-archive-bonus|  117|
| main-internet-bonus|  114|
|main-tariffs-inte...|  113|
|   main-news-archive|  112|
|  main-news-internet|  109|
|main-tariffs-archive|  101|
|main-archive-tariffs|  100|
|  main-internet-news|   99|
|           main-main|   96|
+--------------------+-----+



                                                                                

CPU times: user 145 ms, sys: 170 ms, total: 315 ms
Wall time: 1min 9s
