In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import shutil

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField, StringType
from pyspark.sql import Window

In [3]:
# Input locations (relative to the notebook's working directory).
# Output of the first-level model, read below as parquet.
first_model_output_path = "../data/first_level_output_prq"
# Interaction data read below into `train_als`.
train_als_path = "../data/train_als"
# Interaction data read below into `train_cb`.
train_cb_path = "../data/train_cb"

In [4]:
@udf(returnType=IntegerType())
def NotEmpty(x):
    """Classify a sized value (e.g. an array column cell) into an integer code.

    Args:
        x: The cell value handed to the UDF; may be None.

    Returns:
        0 if ``x`` is None (null cell),
        1 if ``x`` has length zero,
        2 if ``x`` contains at least one element.
    """
    # Identity check is the correct way to test for None; `== None` relies on
    # the operand's __eq__ implementation.
    if x is None:
        return 0
    return 1 if len(x) == 0 else 2

In [5]:
@udf(returnType=ArrayType(IntegerType()))
def get_film_ids(arr):
    """Collect the first field of every element of an array cell.

    Args:
        arr: A sequence of indexable elements, or None for a null cell.
            Presumably each element is an (item_id, score) recommendation
            struct -- TODO confirm against the first-level output schema.

    Returns:
        A list with element ``[0]`` of each entry, or None when ``arr`` is
        None so that null rows stay null instead of raising TypeError inside
        the UDF.
    """
    if arr is None:
        return None
    return [x[0] for x in arr]

In [6]:
# Get (or create) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("DataPrepare")
    .getOrCreate()
)

In [7]:
# Load the three parquet inputs in one pass: first-level predictions and the
# two interaction datasets.
pred_df, train_als, train_cb = (
    spark.read.parquet(path)
    for path in (first_model_output_path, train_als_path, train_cb_path)
)

### Testing dataset

In [9]:
# Per-user item lists from each interaction dataset.
user_watched_films = train_als.groupBy('user_id').agg(collect_list('item_id').alias('watched_films'))
user_future_films = train_cb.groupBy('user_id').agg(collect_list('item_id').alias('future_films'))

# Enrich the predictions step by step:
#   rec_films     - ids extracted from the `recommendations` column
#   watched_films - items the user has in train_als (null if the user is absent)
#   new_films     - recommended ids not present in watched_films
#   future_films  - items the user has in train_cb (null if the user is absent)
pred_df = (
    pred_df
    .withColumn("rec_films", get_film_ids(col('recommendations')))
    .join(user_watched_films, on='user_id', how='left')
    .withColumn('new_films', array_except('rec_films', 'watched_films'))
    .join(user_future_films, on='user_id', how='left')
)

# Recommended-and-unseen items that also appear in the user's train_cb items.
rec_intersect = pred_df.select(
    'user_id',
    array_intersect('new_films', 'future_films').alias('rec_intersection'),
)

In [10]:
# Number of rows in the prediction frame; used as the percentage denominator.
s = pred_df.count()
# Despite its name, `intersection_len` holds the NotEmpty code
# (0 = null, 1 = empty, 2 = non-empty), not the array length.
rec_intersect = rec_intersect.withColumn('intersection_len', NotEmpty(col('rec_intersection')))

In [11]:
# Percentage of rows per `intersection_len` code; the `count` column is
# overwritten with the percentage value before display.
(
    rec_intersect
    .groupBy('intersection_len')
    .count()
    .withColumn('count', col('count') / s * 100)
    .show()
)

+----------------+-----------------+
|intersection_len|            count|
+----------------+-----------------+
|               1|34.58149779735683|
|               2|60.57268722466961|
|               0|4.845814977973569|
+----------------+-----------------+



### Build dataset for second model

In [12]:
# One row per user with two arrays:
#   positives - items in new_films that also occur in future_films
#   negatives - items in new_films that do not occur in future_films
train = pred_df.select(
    'user_id',
    array_intersect('new_films', 'future_films').alias('positives'),
    array_except('new_films', 'future_films').alias('negatives'),
)

In [13]:
# Flatten the per-user arrays into (user_id, item_id, target) rows:
# target = 1 for positives, target = 0 for negatives.
train_positives = train.select(
    'user_id',
    explode('positives').alias('item_id'),
    lit(1).alias('target'),
)
train_negatives = train.select(
    'user_id',
    explode('negatives').alias('item_id'),
    lit(0).alias('target'),
)

# Stack positives on top of negatives (no de-duplication; columns match by position).
train = train_positives.union(train_negatives)

In [14]:
# Preview the first 20 rows of the second-level training set.
train.show(n=20)

+-------+-------+------+
|user_id|item_id|target|
+-------+-------+------+
|  36525|   7079|     1|
|   8389|   7931|     1|
|  78400|   2714|     1|
|  78400|   3336|     1|
|  51595|   1364|     1|
|  40574|  10061|     1|
|  46994|   8152|     1|
|  84812|   1521|     1|
|  61793|   3916|     1|
|  74852|   1521|     1|
|  74852|   4171|     1|
|  43714|   9467|     1|
|  43714|   3839|     1|
|  32539|    402|     1|
|  16339|     72|     1|
|  96393|  10084|     1|
|   2122|   1016|     1|
|   2122|   5035|     1|
|  70355|   7185|     1|
|  70355|   3101|     1|
+-------+-------+------+
only showing top 20 rows

