In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import shutil

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField, StringType
from pyspark.sql import Window

In [3]:
# Inputs read by the cells visible below: the ratings table (loaded as parquet)
# and the JSON file listing the users to produce recommendations for.
ratings_path = "../data/StepDan_ratings_prq"
test_path = "../data/test_users.json"

# Other dataset files in the same data directory (not referenced in the cells
# visible here).
catalogue_path = "../data/catalogue.json"
transactions_data_path = "../data/transactions.csv"

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("OkkoRecSystem")
    .getOrCreate()
)

In [5]:
# Load the ratings table stored in parquet format at `ratings_path`.
df = spark.read.format("parquet").load(ratings_path)

In [6]:
# Column names as they appear in the loaded ratings table.
userCol = 'user_uid'
itemCol = 'element_uid'
ratingCol = 'rate'

# Switch to the names used by the rest of the notebook
# (user_id / item_id / rate).
df = (
    df
    .withColumnRenamed(itemCol, 'item_id')
    .withColumnRenamed(ratingCol, 'rate')
    .withColumnRenamed(userCol, 'user_id')
)

In [7]:
# Cast the rating and both id columns to int; the ALS estimator below is
# configured with exactly these three columns.
rating_df = (
    df
    .withColumn("rate", col("rate").cast('int'))
    .withColumn("user_id", col("user_id").cast('int'))
    .withColumn("item_id", col("item_id").cast('int'))
)

In [8]:
# NOTE(review): the previous cell already casts user_id and item_id to int, so
# this appears to be a harmless repeat of the same casts.
rating_df = (
    rating_df
    .withColumn("user_id", col("user_id").cast('int'))
    .withColumn("item_id", col("item_id").cast('int'))
)

In [9]:
# Number of rating rows per user, plus a boolean flag marking users that have
# at least three of them.
film_cnt = (
    rating_df
    .groupBy('user_id')
    .count()
    .withColumn('enough_films', col('count') >= 3)
)

In [10]:
# Keep only the ratings of users flagged in `film_cnt` as having enough films.
#
# Only the `user_id` column of the qualifying users is joined in, so the helper
# columns (`count`, `enough_films`) never end up in `rating_df`. This keeps the
# training frame limited to its own columns and makes the cell safe to re-run:
# joining the full `film_cnt` a second time would duplicate `enough_films` and
# make the column reference ambiguous.
active_users = film_cnt.where(col('enough_films')).select('user_id')

# `film_cnt` has one row per user, so the inner join filters rows without
# duplicating any of them.
rating_df = rating_df.join(active_users, on='user_id', how='inner')

### First-level model building

In [11]:
# Alternating Least Squares recommender trained on (user_id, item_id, rate).
# - implicitPrefs=True selects the implicit-feedback variant of ALS.
# - coldStartStrategy="drop" discards rows with NaN predictions at transform time.
# - seed pins the random initialisation so a re-run reproduces the same model
#   and therefore the same recommendations.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rate",
    coldStartStrategy="drop",
    implicitPrefs=True,
    seed=42,
)

model = als.fit(rating_df)

In [12]:
@udf(returnType=ArrayType(IntegerType()))
def get_film_ids(arr):
    """Return the first field of every element of ``arr`` as a list.

    Applied below to the ``recommendations`` column produced by the ALS model,
    where it keeps the leading field of each recommendation entry (presumably
    the item id; the score that follows is dropped).

    A null input array yields null instead of raising inside the executor.
    """
    if arr is None:
        return None
    return [entry[0] for entry in arr]

### Test on boosters

In [13]:
# Remove a stale answer file from a previous run, if there is one. Pure Python
# instead of a shell escape so the cell also works where `rm` is unavailable;
# a missing file is fine, matching `rm -f`.
try:
    os.remove('../data/answerStepDan3.json')
except FileNotFoundError:
    pass

In [14]:
# Load the users that recommendations have to be produced for.
with open(test_path, "r") as f:
    test = json.load(f)

# Single-column frame of test users; the column is named like the ALS userCol.
cSchema = StructType([StructField('user_id', IntegerType(), False)])
test_users = [[user] for user in test['users']]
test_df = spark.createDataFrame(test_users, schema=cSchema)

# Top-20 recommendations for every test user.
# NOTE(review): users the fitted model has no factors for are presumably absent
# from this output and therefore from the answer file - confirm that is intended.
ans = model.recommendForUserSubset(test_df, 20)

# Reduce each recommendation entry to its first field and turn the user id into
# a string, since it is used as a JSON object key below.
ans = ans.select(col('user_id').cast(StringType()).alias('user_id'),
                 get_film_ids(col('recommendations')).alias('reccomendations'))

ans_df = ans.toPandas()

# {user_id: [item_id, ...]}. Depending on how Spark converts array columns to
# pandas, each cell may be a numpy array of numpy integers, which json.dump
# cannot serialise; converting to plain ints works for both representations.
result = {
    user_id: [int(item_id) for item_id in item_ids]
    for user_id, item_ids in zip(ans_df['user_id'], ans_df['reccomendations'])
}

with open('../data/answerStepDan3.json', "w") as f:
    json.dump(result, f)