In [1]:
import os

# Get the list of files in the data directory
file_list = [f for f in os.listdir('data/train_parquet') if f.endswith('.parquet')]

# Prepend the directory path to each file name
file_list = [os.path.join('data/train_parquet', f) for f in file_list]


In [2]:
file_list

['data/train_parquet/000100000_000200000.parquet',
 'data/train_parquet/000900000_001000000.parquet',
 'data/train_parquet/012200000_012300000.parquet',
 'data/train_parquet/012100000_012200000.parquet',
 'data/train_parquet/011400000_011500000.parquet',
 'data/train_parquet/008800000_008900000.parquet',
 'data/train_parquet/002800000_002900000.parquet',
 'data/train_parquet/004000000_004100000.parquet',
 'data/train_parquet/007700000_007800000.parquet',
 'data/train_parquet/001700000_001800000.parquet',
 'data/train_parquet/009900000_010000000.parquet',
 'data/train_parquet/005500000_005600000.parquet',
 'data/train_parquet/010900000_011000000.parquet',
 'data/train_parquet/009700000_009800000.parquet',
 'data/train_parquet/005900000_006000000.parquet',
 'data/train_parquet/010400000_010500000.parquet',
 'data/train_parquet/002200000_002300000.parquet',
 'data/train_parquet/003700000_003800000.parquet',
 'data/train_parquet/001400000_001500000.parquet',
 'data/train_parquet/005000000_

In [7]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from tqdm import tqdm

# Create a SparkSession
#spark = SparkSession.builder.getOrCreate()
spark = SparkSession.builder \
    .config('spark.driver.memory', '24g') \
    .getOrCreate()

# Initialize an empty list to store the DataFrames
df_list = []

# Iterate through the list of Parquet files
for file in tqdm(file_list):
    # Read the Parquet file into a Spark DataFrame
    df = spark.read.parquet(file)

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate the DataFrames in the list
#df = df_list[0].union(df_list[1:])
df = df_list[0]

# Iterate through the rest of the DataFrames in the list
for i in range(1, len(df_list)):
    # Union the DataFrame with the next DataFrame in the list
    df = df.union(df_list[i])

100%|█████████████████████████████████████████| 129/129 [00:05<00:00, 21.64it/s]


In [8]:
df.show()

+-------+-------+----+
|session|    aid|type|
+-------+-------+----+
| 100000|1498214|   1|
| 100000|1617298|   1|
| 100000|1617298|   3|
| 100000|1820189|   1|
| 100000|1619534|   1|
| 100000|  22770|   1|
| 100000|  22770|   1|
| 100000|  22770|   3|
| 100000| 339965|   1|
| 100000| 339965|   3|
| 100000| 339965|   5|
| 100000|  22770|   5|
| 100000| 339965|   1|
| 100000| 339965|   1|
| 100000| 710289|   1|
| 100001|1104009|   1|
| 100001|1196408|   1|
| 100001| 822736|   1|
| 100001| 791744|   1|
| 100001| 822736|   1|
+-------+-------+----+
only showing top 20 rows



In [9]:
# Train the ALS model
als = ALS(maxIter=5, regParam=0.01, userCol="session", itemCol="aid", ratingCol="type")
model = als.fit(df)

23/01/08 23:40:31 WARN DAGScheduler: Broadcasting large task binary with size 1255.5 KiB
23/01/08 23:40:32 WARN DAGScheduler: Broadcasting large task binary with size 1255.5 KiB
23/01/08 23:40:32 WARN DAGScheduler: Broadcasting large task binary with size 1257.7 KiB




23/01/08 23:41:01 WARN DAGScheduler: Broadcasting large task binary with size 1259.3 KiB




23/01/08 23:41:25 WARN DAGScheduler: Broadcasting large task binary with size 1260.6 KiB


                                                                                

23/01/08 23:41:48 WARN DAGScheduler: Broadcasting large task binary with size 1259.6 KiB




23/01/08 23:42:14 WARN DAGScheduler: Broadcasting large task binary with size 1260.9 KiB


                                                                                

23/01/08 23:43:23 WARN DAGScheduler: Broadcasting large task binary with size 1261.6 KiB




23/01/08 23:43:44 WARN DAGScheduler: Broadcasting large task binary with size 1264.6 KiB




23/01/08 23:44:39 WARN DAGScheduler: Broadcasting large task binary with size 1266.0 KiB




23/01/08 23:45:55 WARN DAGScheduler: Broadcasting large task binary with size 1267.4 KiB




23/01/08 23:46:42 WARN DAGScheduler: Broadcasting large task binary with size 1268.8 KiB




23/01/08 23:47:53 WARN DAGScheduler: Broadcasting large task binary with size 1270.2 KiB




23/01/08 23:48:40 WARN DAGScheduler: Broadcasting large task binary with size 1271.6 KiB




23/01/08 23:49:51 WARN DAGScheduler: Broadcasting large task binary with size 1273.0 KiB




23/01/08 23:50:38 WARN DAGScheduler: Broadcasting large task binary with size 1274.3 KiB




23/01/08 23:51:49 WARN DAGScheduler: Broadcasting large task binary with size 1275.7 KiB




23/01/08 23:52:38 WARN DAGScheduler: Broadcasting large task binary with size 1277.7 KiB


                                                                                

23/01/08 23:53:37 WARN DAGScheduler: Broadcasting large task binary with size 1276.3 KiB


                                                                                

In [10]:
# Make recommendations for a user
recommendations = model.transform(df.filter(df.session == 42))

# Show the recommendations
recommendations.show()

23/01/08 23:57:37 WARN DAGScheduler: Broadcasting large task binary with size 1290.3 KiB
23/01/08 23:57:37 WARN DAGScheduler: Broadcasting large task binary with size 1288.9 KiB




23/01/08 23:57:45 WARN DAGScheduler: Broadcasting large task binary with size 1634.1 KiB




23/01/08 23:57:46 WARN DAGScheduler: Broadcasting large task binary with size 1648.7 KiB


                                                                                

23/01/08 23:57:47 WARN DAGScheduler: Broadcasting large task binary with size 1648.7 KiB
+-------+-------+----+----------+
|session|    aid|type|prediction|
+-------+-------+----+----------+
|     42| 979267|   1| 1.3249768|
|     42| 979267|   1| 1.3249768|
|     42|1648297|   1| 1.2555958|
|     42| 515648|   1| 1.2414193|
|     42| 645446|   1| 0.9764026|
|     42|1325950|   1| 1.3869649|
|     42|1745778|   1| 1.4822401|
|     42|  20376|   1| 1.0719389|
|     42| 925260|   1| 0.8904284|
|     42|1041839|   1| 1.6788143|
|     42|1263386|   1| 1.0875018|
|     42|1610035|   3| 1.4072871|
|     42|1779951|   1| 1.3508937|
|     42|1779951|   1| 1.3508937|
|     42|1779951|   3| 1.3508937|
|     42| 536610|   1| 1.1745937|
|     42| 670233|   1|0.91651434|
|     42|1228562|   1| 0.9530386|
|     42|1527697|   1|  1.309192|
|     42|1671956|   1| 1.1216995|
+-------+-------+----+----------+
only showing top 20 rows



In [11]:
spark.stop()