# BigData Final Project | Steam
## <font color = 'blue'> Notebook5 | Recommendation </font>
### Team Member: Jim Fang, WooJong Choi, Han Jeon, Tam Nguyen

June 2020
___

## Building recommendation engine using two methods:

1. Basket Analysis
2. ALS


### <font> CHALLENGE: </font>

> We dont have the rating for each game by each user, this make it hard to apply ALS model

### <font> SOLUTION: </font>

> We believe if an individual like a games, they would definitely more time play that game as compared to the others. As such, we decided to use total_playtime_forever as the 'rating' feature for our ALS model


<font color ='blue'> Our aim is to compare the difference between basket analysis and ALS on the top 10 recommend games for each cluster. Since we utilize the play_time_forever to each game </font>


---
## I. Import Libraries

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as t
from pyspark.sql.types import LongType
from pyspark.sql.types import StringType
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import isnan, when, count, col, size
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import length
from pyspark.sql import functions as sf
from pyspark.sql.functions import collect_set, collect_list, array_contains
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.functions import rank, dense_rank
from pyspark.sql.functions import desc
from functools import reduce
from pyspark.sql.functions import rand 

#Sparkml, spark mllib
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.mllib.recommendation import ALS
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import mean, stddev, col
import numpy as np
import pandas as pd
import seaborn as sns


from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.enableHiveSupport().appName('Clustering').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '64g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '32'), ('spark.cores.max', '32'), ('spark.driver.memory','64g')])

sc = spark.sparkContext

In [3]:
!hdfs dfs -ls /user/tamng/jwht/Cluster

Found 10 items
drwxr-xr-x   - tamng tamng          0 2020-06-05 17:47 /user/tamng/jwht/Cluster/als_df.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 20:05 /user/tamng/jwht/Cluster/basket_100_df.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:04 /user/tamng/jwht/Cluster/cluster_info.csv
drwxr-xr-x   - tamng tamng          0 2020-06-05 08:14 /user/tamng/jwht/Cluster/recommendation_basket
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:01 /user/tamng/jwht/Cluster/top20games_cluster0.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:01 /user/tamng/jwht/Cluster/top20games_cluster1.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:01 /user/tamng/jwht/Cluster/top20games_cluster2.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:01 /user/tamng/jwht/Cluster/top20games_cluster3.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:02 /user/tamng/jwht/Cluster/top20games_cluster4.csv
drwxr-xr-x   - tamng tamng          0 2020-06-04 14:02 /user/tamng/jwht/Clust

---
## II. Create Function

_Quick check missing value_

In [4]:
def check_missing(df):
    ''' Check missing value'''
    df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

_Basic description of data_

In [5]:
def basic_info(df):    
    '''
        Print out the basic ddescription for each table, icluding:
        1. total rows/ observation
        2. Check missing value by columns
        3. Print out the first 3 lines
        4. Basic description
    '''   
    print('TOTAL ROWS:', df.count())
    print('\n')
    print('*-------------'*5)
    print('\n')
    print('MISSING VALUE:')
    check_missing(df)
    print('*-------------'*5)
    print('\n')
    print('PRINT OUT THE 1st 3 LINES:')
    df.show(3, truncate = True)
    print('*-------------'*5)
    print('\n')
    print('TABLE BASIC DESCRIPTION:')
    df.describe().show(10,truncate = True)
    print('*-------------'*5)
    distinct_count = []
    column_name = df.columns
    for i in column_name:
        distinct_count.append(df.select(col(i)).distinct().count())

    print('DISTINCT COUNT BY COLUMN:')
    print('\n')
    print(pd.DataFrame(zip(column_name,distinct_count)).\
      rename(columns={0:'column_name', 1:'distinct_count'}))

_Rename columns based on predefined list_

In [6]:
def rename_col(df, newColumns):
    ''' Rename all columns        
        Note: newColumns is a list of columns name '''
    oldColumns = df.schema.names
    df = reduce(lambda df, idx: df.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), df)
    return df

_Pipeline to run ALS recommendation model_

In [7]:
def als_model(df, cols, split_ratio = 0.8, maxIter = 10, regParam=0.1, \
              rank=8, nonnegative=True, coldStartStrategy="drop"):
    
    '''
    Pipleline function to select, split data, train model and run prediction:
        df: data
        cols: selected columns for model: steam_id, app_id, title, total_playtime_forever_construct
        split_ration: the ration that model will will to train
    '''
    
    # select columns for modeling
    ratings = df.select(cols)
    columns = ratings.columns

    # train test split
    train_df, test_df = ratings.randomSplit([split_ratio, 1 - split_ratio])

    # ALS recommendation model
    als = ALS(maxIter=maxIter, regParam=regParam, rank=rank, nonnegative=nonnegative,\
                  coldStartStrategy=coldStartStrategy, userCol=columns[0], itemCol=columns[1], ratingCol=columns[-1])

    model = als.fit(train_df)

    # predict on test
    predictions = model.transform(test_df)
    
    return predictions, model

_ALS model evaluation_

In [8]:
def als_evaluation(predictions,predictionCol,labelCol,metricName = 'rmse'):
    '''
    Model evaluation:
        predictionCol:
        labelCol:
        predictions:
    '''
    evaluator = RegressionEvaluator(metricName=metricName, predictionCol=predictionCol, labelCol=labelCol)
    rmse.append(evaluator.evaluate(predictions))
    return rmse

_Recommend top 10 games_

In [9]:
def recommend_top10_game(model):
    '''
    Recommend top 10 games using ALS model
        model: input ALS model
        recommend_game: empty list to append data
    '''
    
    recommend_game = []
    
    userRecs = model.recommendForAllUsers(10)
    
    for i in userRecs.toPandas().recommendations:
        for t in i:
            recommend_game.append(t[0])
            
    games = spark.createDataFrame(recommend_game, \
                                  IntegerType()).groupBy('value').count().orderBy('count', ascending = False)
    
    games = games.withColumnRenamed('value', 'app_id').join(game_title, 'app_id', how = 'left').orderBy('count', ascending = False)
    
    return games

---
## III. Import Data

In [10]:
# Load data
player_app_cluster = spark.read.csv('/user/tamng/jwht/EDA/player_app_cluster.csv', inferSchema = True,header = True)

In [11]:
player_app_cluster.limit(2).toPandas()

Unnamed: 0,steam_id,app_id,playtime_2weeks,playtime_forever,dateretrieved,title,type,price,releaseDate,rating,...,Early Access,Casual,Action,Strategy,Indie,Free to Play,RPG,Simulation,cluster,year
0,76561197960266975,10,0,0,2014-08-14 14:04:54,Counter-Strike,game,9.99,11/1/2000 0:00,88,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,2000
1,76561197960266975,20,0,0,2014-08-14 14:04:54,Team Fortress Classic,game,4.99,4/1/1999 0:00,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,1999


---
## IV. Recommendation

### 1. MarketBasket recommendation

---

### PROBLEM

Memory, coompute capacity does not allow the model can not handle if the number of games in the basket is to big, more than 30 games/ each inidivudal


### SOLUTION
> - Dropping all the steam_id and app_id with play_time_forever = 0
> - Only limitted the maximum number of game per user to 50, orderby the play_time_forever


### APPROACH
1. Sort by steam_id, playtime_forever
2. Create increasing index col for each steam_id
3. filter rows with index less than, or equal to 50

This will make sure the maximum number of game per user never go up to thousand

Question: How to ass increasing index for each steam_id
> - Using Window function
---

In [9]:
# playtime_forever > 0 only
game_recommend_df = player_app_cluster.filter(F.col('playtime_forever')!=0).orderBy('steam_id', 'playtime_forever', ascending = False)
game_recommend_df.count()

39002829

In [10]:
game_recommend_df.orderBy('steam_id', 'playtime_forever', ascending = False).limit(5).toPandas()

Unnamed: 0,steam_id,app_id,playtime_2weeks,playtime_forever,dateretrieved,title,type,price,releaseDate,rating,...,Early Access,Casual,Action,Strategy,Indie,Free to Play,RPG,Simulation,cluster,year
0,76561197975000941,440,0,32033,2014-08-16 22:56:04,Team Fortress 2,game,0.0,10/10/2007 0:00,92,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2,2007
1,76561197975000941,730,388,6291,2014-08-16 22:56:04,Counter-Strike: Global Offensive,game,14.99,8/21/2012 0:00,83,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,2012
2,76561197975000941,211420,0,5575,2014-08-16 22:56:04,DARK SOULS???: Prepare To Die??? Edition,game,19.99,8/23/2012 0:00,85,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2,2012
3,76561197975000941,105600,0,1630,2014-08-16 22:56:04,Terraria,game,9.99,5/16/2011 0:00,83,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2,2011
4,76561197975000941,72850,0,1489,2014-08-16 22:56:04,The Elder Scrolls V: Skyrim,game,19.99,11/10/2011 0:00,94,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2,2011


In [11]:
game_recommend_df.groupby('steam_id').count().orderBy('count', ascending = False).show()

+-----------------+-----+
|         steam_id|count|
+-----------------+-----+
|76561197968838120| 1672|
|76561197969068154| 1363|
|76561197971339745| 1318|
|76561197970644409| 1317|
|76561197967664774| 1288|
|76561197974742349| 1254|
|76561197971697469| 1223|
|76561197970535087| 1129|
|76561197969548941| 1089|
|76561197970575753| 1063|
|76561197966082557| 1059|
|76561197971944809| 1035|
|76561197960462697| 1034|
|76561197970307937| 1016|
|76561197960669982| 1000|
|76561197973925376|  995|
|76561197974783082|  989|
|76561197970717882|  987|
|76561197970663907|  987|
|76561197966750265|  985|
+-----------------+-----+
only showing top 20 rows



__Create a player_game_index column, based on that index, we can select the max top 30 game for each cluster player__

In [12]:
overCategory = Window.partitionBy("steam_id").orderBy(desc("playtime_forever"))
game_recommend_df = game_recommend_df.withColumn("playtime_forever_array", \
                                                 collect_list("playtime_forever").over(overCategory)).withColumn(
  "player_game_index", row_number().over(overCategory)).select('*')

In [13]:
game_recommend_df.limit(5).toPandas()

Unnamed: 0,steam_id,app_id,playtime_2weeks,playtime_forever,dateretrieved,title,type,price,releaseDate,rating,...,Action,Strategy,Indie,Free to Play,RPG,Simulation,cluster,year,playtime_forever_array,player_game_index
0,76561197960266870,10,0,6,2014-08-14 14:04:58,Counter-Strike,game,9.99,11/1/2000 0:00,88,...,1.0,0.0,0.0,0.0,0.0,0.0,4,2000,[6],1
1,76561197960266911,730,0,1453,2014-08-14 14:04:54,Counter-Strike: Global Offensive,game,14.99,8/21/2012 0:00,83,...,1.0,0.0,0.0,0.0,0.0,0.0,1,2012,[1453],1
2,76561197960266911,10,0,924,2014-08-14 14:04:54,Counter-Strike,game,9.99,11/1/2000 0:00,88,...,1.0,0.0,0.0,0.0,0.0,0.0,1,2000,"[1453, 924]",2
3,76561197960266911,240,0,889,2014-08-14 14:04:54,Counter-Strike: Source,game,19.99,11/1/2004 0:00,88,...,1.0,0.0,0.0,0.0,0.0,0.0,1,2004,"[1453, 924, 889]",3
4,76561197960268400,227300,0,148,2014-08-14 14:05:17,Euro Truck Simulator 2,game,34.99,1/16/2013 0:00,79,...,0.0,0.0,1.0,0.0,0.0,1.0,1,2013,[148],1


In [14]:
# Only limit to max 100 games per player (order by play_time_forever)
basket = game_recommend_df.filter(F.col('player_game_index')<31)
basket.count()

23039492

In [15]:
# Drop playtime_forever_array
basket = basket.drop('playtime_forever_array')

In [17]:
# Convert app_id to string type
basket = basket.withColumn('app_id', basket['app_id'].cast(StringType()))

In [18]:
# collect list of app, since some app will have the same name, we keep app_id instead of name
basket = basket.groupBy('steam_id').agg(collect_set('app_id').alias('game_id'))
basket.show()

+-----------------+--------------------+
|         steam_id|             game_id|
+-----------------+--------------------+
|76561197960266870|                [10]|
|76561197960266911|      [240, 730, 10]|
|76561197960268400|       [227300, 240]|
|76561197960269173|[245760, 244850, ...|
|76561197960269424|[65800, 63200, 22...|
|76561197960270130|            [80, 10]|
|76561197960271265|[208140, 33930, 1...|
|76561197960271435|[10180, 213670, 2...|
|76561197960272407|[33930, 259060, 3...|
|76561197960272849|  [10180, 10190, 10]|
|76561197960272869|[30, 300, 240, 73...|
|76561197960272973|[65800, 227860, 4...|
|76561197960273326|[8930, 65800, 201...|
|76561197960274486|[220, 63200, 1512...|
|76561197960274794|[238010, 630, 551...|
|76561197960275733|[340, 220, 33460,...|
|76561197960275854|[35140, 8850, 706...|
|76561197960276255|                [10]|
|76561197960276539|[340, 220, 8190, ...|
|76561197960277107|[42690, 240, 2064...|
+-----------------+--------------------+
only showing top

In [19]:
basket.count()

2666717

In [20]:
basket.rdd.getNumPartitions()

200

In [21]:
basket.createOrReplaceTempView('basket')
basket.show(truncate=100)

+-----------------+----------------------------------------------------------------------------------------------------+
|         steam_id|                                                                                             game_id|
+-----------------+----------------------------------------------------------------------------------------------------+
|76561197960266870|                                                                                                [10]|
|76561197960266911|                                                                                      [240, 730, 10]|
|76561197960268400|                                                                                       [227300, 240]|
|76561197960269173|[245760, 244850, 340, 420, 200110, 242050, 30, 273620, 380, 10, 12210, 70, 72850, 220, 246090, 24...|
|76561197960269424|[65800, 63200, 22300, 213670, 630, 223850, 200710, 72850, 570, 730, 99300, 50620, 550, 226320, 49...|
|76561197960270130|             

In [22]:
from pyspark.ml.fpm import FPGrowth

#set the minimum thresholds for support and confidence
fpGrowth = FPGrowth(itemsCol="game_id", minSupport=0.001, minConfidence=0)

model = fpGrowth.fit(basket)

In [23]:
#Calculate frequent itemsets
mostPopularItemInABasket = model.freqItemsets
mostPopularItemInABasket.createOrReplaceTempView("mostPopularItemInABasket")

# Display frequent itemsets.
model.freqItemsets.orderBy('freq', ascending = False).show(50,truncate =False)

+--------------+-------+
|items         |freq   |
+--------------+-------+
|[10]          |1234146|
|[240]         |1128126|
|[220]         |668700 |
|[440]         |585954 |
|[550]         |415647 |
|[240, 10]     |399089 |
|[730]         |376945 |
|[220, 240]    |376830 |
|[440, 240]    |343830 |
|[400]         |339022 |
|[72850]       |330802 |
|[570]         |314755 |
|[620]         |296338 |
|[70]          |283170 |
|[500]         |275736 |
|[730, 240]    |268173 |
|[8930]        |252661 |
|[30]          |250331 |
|[550, 440]    |238967 |
|[49520]       |234728 |
|[550, 240]    |230777 |
|[10180]       |229134 |
|[320]         |224518 |
|[10190]       |223644 |
|[440, 10]     |218829 |
|[730, 10]     |211987 |
|[300]         |205609 |
|[440, 220]    |202009 |
|[70, 10]      |199812 |
|[420]         |196124 |
|[380]         |194558 |
|[730, 440]    |194342 |
|[80]          |193454 |
|[10190, 10180]|193428 |
|[340]         |187940 |
|[30, 10]      |187210 |
|[630]         |181342 |


In [24]:
model.associationRules.show(50, truncate=False)

+-------------------------------+----------+-------------------+------------------+
|antecedent                     |consequent|confidence         |lift              |
+-------------------------------+----------+-------------------+------------------+
|[33930, 221100, 570, 550]      |[730]     |0.736241610738255  |5.208579552621966 |
|[33930, 221100, 570, 550]      |[240]     |0.7046979865771812 |1.6657980586132586|
|[33930, 221100, 570, 550]      |[440]     |0.7110738255033557 |3.2361459410206814|
|[33930, 221100, 570, 550]      |[10]      |0.48204697986577183|1.041597084953248 |
|[34330, 205100, 200510, 72850] |[8930]    |0.6855189558732132 |7.235327389068148 |
|[130, 50, 440, 220, 10]        |[20]      |0.4911169473196354 |11.887610282246472|
|[130, 50, 440, 220, 10]        |[380]     |0.5189247644059941 |7.112663015463046 |
|[130, 50, 440, 220, 10]        |[30]      |0.5413255059477831 |5.76661272173464  |
|[130, 50, 440, 220, 10]        |[340]     |0.542870384674803  |7.7028928573

In [25]:
#sort by confidence
model.associationRules.orderBy("confidence", ascending=False).show(50, truncate=False)

+------------------------------------------------------+----------+------------------+------------------+
|antecedent                                            |consequent|confidence        |lift              |
+------------------------------------------------------+----------+------------------+------------------+
|[35420, 630]                                          |[1250]    |0.996044588277598 |28.702618691367842|
|[42710, 630, 10180, 440, 240, 10]                     |[10190]   |0.9954782608695653|11.87002021691306 |
|[35420, 10]                                           |[1250]    |0.9954167860211973|28.684527564734434|
|[42690, 42710, 10180, 570, 730, 10]                   |[10190]   |0.9952675646159447|11.867507888027124|
|[42710, 42700, 630, 10180, 440, 240, 10]              |[10190]   |0.9951708766716196|11.866354987055818|
|[42690, 42710, 630, 10180, 240]                       |[10190]   |0.9950779327317474|11.865246729358297|
|[100, 340, 80, 320, 220, 240]                

In [26]:
#sort by lift
model.associationRules.orderBy("lift", ascending=False).show(50, truncate=False)

+-----------------------------------+----------+-------------------+------------------+
|antecedent                         |consequent|confidence         |lift              |
+-----------------------------------+----------+-------------------+------------------+
|[31190]                            |[31200]   |0.8228034087569791 |662.4951201057322 |
|[31200]                            |[31190]   |0.8454106280193237 |662.4951201057322 |
|[31190, 31170]                     |[31180]   |0.92850114791735   |594.9182594115117 |
|[31180, 31170]                     |[31190]   |0.7489417989417989 |586.8985681012863 |
|[31190]                            |[31180]   |0.8865706729356451 |568.0521588704768 |
|[31180]                            |[31190]   |0.7248918789043729 |568.0521588704768 |
|[31180]                            |[31200]   |0.6588178760211437 |530.4591877685617 |
|[31200]                            |[31180]   |0.8278985507246377 |530.4591877685616 |
|[65720, 33930]                 

In [27]:
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
recommendation_basket = model.transform(basket)

In [29]:
recommendation_basket.limit(50).toPandas()

Unnamed: 0,steam_id,game_id,prediction
0,76561197960266870,[10],"[72850, 230410, 63380, 41700, 200260, 320, 503..."
1,76561197960266911,"[240, 730, 10]","[17460, 22330, 280, 35140, 340, 238960, 207610..."
2,76561197960268400,"[227300, 240]","[24800, 47810, 30, 223670, 234650, 12100, 60, ..."
3,76561197960269173,"[245760, 244850, 340, 420, 200110, 242050, 30,...","[440, 630, 570, 10190, 500, 24960, 49520, 320,..."
4,76561197960269424,"[65800, 63200, 22300, 213670, 630, 223850, 200...","[240, 24960, 105600, 55230, 91310, 220240, 125..."
5,76561197960270130,"[80, 10]","[20, 42680, 130, 218620, 42710, 7940, 42690, 2..."
6,76561197960271265,"[208140, 33930, 10180, 221100, 22330, 35450, 1...","[107410, 630, 500, 220, 730, 49520, 550, 72850..."
7,76561197960271435,"[10180, 213670, 220240, 42710, 10190, 17410, 1...","[240, 550, 42680, 22380, 500, 49520, 70, 730, ..."
8,76561197960272407,"[33930, 259060, 35450, 236390, 42890, 34330, 2...","[17450, 240, 72850, 620, 49520, 8870, 17460, 5..."
9,76561197960272849,"[10180, 10190, 10]","[22380, 420, 200710, 50, 60, 55230, 42710, 130..."


In [42]:
recommendation_basket.groupby('game_id', 'prediction').count().orderBy('count',ascending = False).show()

+---------------+--------------------+------+
|        game_id|          prediction| count|
+---------------+--------------------+------+
|           [10]|[72850, 230410, 6...|430137|
|          [240]|[24800, 47810, 30...|184050|
|          [220]|[35720, 9930, 420...| 70774|
|      [240, 10]|[48700, 221380, 2...| 47841|
|     [220, 240]|[12210, 216250, 2...| 37528|
|       [80, 10]|[20, 42680, 130, ...| 35257|
|       [30, 10]|[630, 17500, 3545...| 30477|
|       [10, 70]|[48700, 221380, 1...| 25502|
|           [80]|[20, 42680, 130, ...| 23100|
|           [30]|[41500, 65800, 40...| 19402|
|          [300]|[200510, 236390, ...|  8090|
|          [380]|[630, 219740, 209...|  7808|
|     [240, 320]|[17460, 202990, 2...|  7675|
|           [70]|[48700, 221380, 1...|  7653|
|[220, 240, 320]|[12210, 216250, 2...|  7645|
|      [570, 10]|[72850, 230410, 6...|  7565|
|          [320]|[17460, 202990, 2...|  7299|
|   [30, 10, 70]|[48700, 221380, 1...|  7171|
|  [100, 80, 10]|[20, 42680, 130, 

#### Save predicton as pickle file

In [35]:
recommendation_basket.rdd.saveAsPickleFile('/user/tamng/jwht/Cluster/recommendation_basket')

#### Load pickle file

In [None]:
# pickleRdd = sc.pickleFile('/user/tamng/jwht/Cluster/recommendation_basket').collect()
# df2 = spark.createDataFrame(pickleRdd)

---
## 2. ALS

Here, we dont have the rating for games by each player; however, our assumption is if an individual likes a game they would defintely spend more time on that game. As such, we will utilize and reconstruct total_playtime_foerever for the game as the variable for ALS recommendation model 

### a. Basic description about the data

In [12]:
player_app_cluster.limit(2).toPandas()

Unnamed: 0,steam_id,app_id,playtime_2weeks,playtime_forever,dateretrieved,title,type,price,releaseDate,rating,...,Early Access,Casual,Action,Strategy,Indie,Free to Play,RPG,Simulation,cluster,year
0,76561197960266975,10,0,0,2014-08-14 14:04:54,Counter-Strike,game,9.99,11/1/2000 0:00,88,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,2000
1,76561197960266975,20,0,0,2014-08-14 14:04:54,Team Fortress Classic,game,4.99,4/1/1999 0:00,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,1999


In [14]:
als_df = player_app_cluster.select('steam_id', 'app_id', 'playtime_forever', 'title', 'cluster', 'year')

In [68]:
basic_info(als_df)

TOTAL ROWS: 82975759


*-------------*-------------*-------------*-------------*-------------


MISSING VALUE:
+--------+------+----------------+-----+-------+----+
|steam_id|app_id|playtime_forever|title|cluster|year|
+--------+------+----------------+-----+-------+----+
|       0|     0|               0|    0|      0|   0|
+--------+------+----------------+-----+-------+----+

*-------------*-------------*-------------*-------------*-------------


PRINT OUT THE 1st 3 LINES:
+-----------------+------+----------------+--------------------+-------+----+
|         steam_id|app_id|playtime_forever|               title|cluster|year|
+-----------------+------+----------------+--------------------+-------+----+
|76561197960266870|    10|               6|      Counter-Strike|      4|2000|
|76561197960266870|    20|               0|Team Fortress Cla...|      4|1999|
|76561197960266870|    30|               0|       Day of Defeat|      4|2003|
+-----------------+------+----------------+-------

---
### b. Construct data

Need to reIndex steamID and 

In [17]:
# create dataframe only include steam_id
steam_id_reindex = als_df.select('steam_id')
steam_id_reindex.show(5)

+-----------------+
|         steam_id|
+-----------------+
|76561197960266975|
|76561197960266975|
|76561197960266975|
|76561197960266975|
|76561197960266975|
+-----------------+
only showing top 5 rows



In [18]:
# get steam_id, drop duplicate
steam_id_reindex = steam_id_reindex.select('steam_id').dropDuplicates()

# reset steam_id index
steam_id_reindex = steam_id_reindex.withColumn('steam_index', \
                                        row_number().over(Window.orderBy(monotonically_increasing_id())))
# total observations
steam_id_reindex.count()

3125641

In [19]:
steam_id_reindex.show(10)

+-----------------+-----------+
|         steam_id|steam_index|
+-----------------+-----------+
|76561197960266870|          1|
|76561197960266911|          2|
|76561197960268400|          3|
|76561197960269173|          4|
|76561197960269352|          5|
|76561197960269424|          6|
|76561197960269449|          7|
|76561197960270130|          8|
|76561197960271265|          9|
|76561197960271435|         10|
+-----------------+-----------+
only showing top 10 rows



In [15]:
cols = ['steam_index', 'app_id', 'playtime_forever']
predictionCol = 'prediction'
labelCol = 'playtime_forever'

In [21]:
# Convert to double
als_df = als_df.withColumn('playtime_forever',als_df.playtime_forever.cast('double'))

In [22]:
# Join index and convert play_time to hour
als_df = als_df.join(steam_id_reindex, 'steam_id')
als_df = als_df.withColumn('playtime_forever', F.col('playtime_forever')/60)

In [20]:
als_df.limit(10).toPandas()

Unnamed: 0,steam_id,app_id,playtime_forever,title,cluster,year,steam_index
0,76561197960266870,10,0.1,Counter-Strike,4,2000,1
1,76561197960266870,20,0.0,Team Fortress Classic,4,1999,1
2,76561197960266870,30,0.0,Day of Defeat,4,2003,1
3,76561197960266870,40,0.0,Deathmatch Classic,4,2001,1
4,76561197960266870,50,0.0,Half-Life: Opposing Force,4,1999,1
5,76561197960266870,60,0.0,Ricochet,4,2000,1
6,76561197960266870,70,0.0,Half-Life,4,1998,1
7,76561197960266870,130,0.0,Half-Life: Blue Shift,4,2001,1
8,76561197960266911,10,15.4,Counter-Strike,1,2000,2
9,76561197960266911,20,0.0,Team Fortress Classic,1,1999,2


In [23]:
# Filter app_id with playtime_forever > 0.0
als_df = als_df.filter(F.col('playtime_forever')!= 0.0).orderBy('steam_index', 'playtime_forever', ascending = False)
als_df.limit(10).toPandas()

Unnamed: 0,steam_id,app_id,playtime_forever,title,cluster,year,steam_index
0,76561197974999624,220,1.9,Half-Life 2,4,2004,3125640
1,76561197974999624,240,1.066667,Counter-Strike: Source,4,2004,3125640
2,76561197974999624,320,0.3,Half-Life 2: Deathmatch,4,2004,3125640
3,76561197974999470,10,145.566667,Counter-Strike,1,2000,3125639
4,76561197974999470,30,46.466667,Day of Defeat,1,2003,3125639
5,76561197974999470,70,0.133333,Half-Life,1,1998,3125639
6,76561197974999275,22380,59.716667,Fallout: New Vegas,1,2010,3125638
7,76561197974999275,10190,59.516667,Call of Duty??: Modern Warfare?? 2,1,2009,3125638
8,76561197974999275,40100,33.3,Supreme Commander 2,1,2010,3125638
9,76561197974999275,240,28.35,Counter-Strike: Source,1,2004,3125638


In [24]:
als_df.count()

39002829

In [25]:
# Number of unique steam id per cluster
als_df.select('steam_id','cluster').dropDuplicates().groupby('cluster').count().show()

+-------+-------+
|cluster|  count|
+-------+-------+
|      1|1466360|
|      3|  90659|
|      5|   1295|
|      4| 870647|
|      2| 203396|
|      0|  34360|
+-------+-------+



In [26]:
# Number of rows per cluster ( combine of user + games)
als_df.groupby('cluster').count().show()

+-------+--------+
|cluster|   count|
+-------+--------+
|      1|14091393|
|      3| 1994660|
|      5|  434120|
|      4| 1364921|
|      2|15192457|
|      0| 5925278|
+-------+--------+



In [27]:
# Save the data
als_df.write.csv('/user/tamng/jwht/Cluster/als_df.csv', header = True)

In [10]:
# Reload the data
als_df = spark.read.csv('/user/tamng/jwht/Cluster/als_df.csv', inferSchema = True, header = True)

---
### c. Run model

Based on the summary table above, we decided to run the model on smaller cluster first, then, depend on the capacity of RCC - memory, runtime, we will decide how much data we keep to run on a bigger cluster

When we run the for loop, the memory run out many times, therfore, we decided to run model by model and reset the kernel 2-3 times to overcome this challenge

In [47]:
# Cluster 0
predictions_0, model_0 = als_model(als_df.filter(als_df.cluster ==0), cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

In [46]:
# Cluster 3
predictions_3, model_3 = als_model(als_df.filter(als_df.cluster ==3), cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

In [56]:
# Cluster 5
predictions_5, model_5 = als_model(als_df.filter(als_df.cluster ==5), cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

In [155]:
# Cluster 4
predictions_4, model_4 = als_model(als_df.filter(als_df.cluster ==4), cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

---

__Cluster 2: Random split, take 50% of data for modeling__

Since since cluster has more than 15 mil rows, we decided to randomize and take 50% of the data to run the model. 

In [22]:
als_df_cluster_2, als_df_cluster_2_remove = als_df.filter(F.col('cluster')==2).randomSplit([0.5, 0.5])
als_df_cluster_2.count()

7592019

In [23]:
predictions_2, model_2 = als_model(als_df_cluster_2, cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

---
__Cluster 1: Random split, take 40% of data for modeling__

Since since cluster has more than 14 mil rows, we decided to randomize and take 40% of the data to run the model. 

In [11]:
als_df_cluster_1, als_df_cluster_1_remove = als_df.filter(F.col('cluster')==1).randomSplit([0.4, 0.6])
als_df_cluster_1.count()

5638523

In [16]:
predictions_1, model_1 = als_model(als_df.filter(als_df.cluster ==1), cols, split_ratio = 0.8, maxIter = 10, \
                        regParam=0.1,rank=8, nonnegative=True, coldStartStrategy="drop")

---

### d. Construct RMSE

In [35]:
# Run als_evaluation
rmse_list = []

rmse_list = als_evaluation(predictions_0,predictionCol,labelCol,metricName = 'rmse')
rmse_list = als_evaluation(predictions_3,predictionCol,labelCol,metricName = 'rmse')
rmse_list = als_evaluation(predictions_5,predictionCol,labelCol,metricName = 'rmse')
rmse_list = als_evaluation(predictions_4,predictionCol,labelCol,metricName = 'rmse')
rmse_list = als_evaluation(predictions_2,predictionCol,labelCol,metricName = 'rmse')
rmse_list = als_evaluation(predictions_1,predictionCol,labelCol,metricName = 'rmse')

In [34]:
rmse_list

[80.45918321924412,
 505.6112931975578,
 98.36938727241011,
 100.76209073056927,
 97.83191069836455,
 137.55398863226412]

In [35]:
# Return RMSE dataframe
cluster = ['cluster_0','cluster_3','cluster_5','cluster_4','cluster_2','cluster_1']
rmse_df = pd.DataFrame(zip(cluster, rmse_list)).rename(columns={0:'Cluster', 1:'RMSE'})
rmse_df

Unnamed: 0,Cluster,RMSE
0,cluster_0,80.459183
1,cluster_3,505.611293
2,cluster_5,98.369387
3,cluster_4,100.762091
4,cluster_2,97.831911
5,cluster_1,137.553989


### e. Show prediction

In [50]:
predictions_0.show(5)

+-----------+------+------------------+----------+
|steam_index|app_id|  playtime_forever|prediction|
+-----------+------+------------------+----------+
|     737997|  4900| 4.266666666666667|       0.0|
|    2964256|  4900|18.316666666666666| 11.176373|
|    2246385|  4900|1.5166666666666666| 6.2655296|
|     518761|  4900|24.983333333333334|       0.0|
|    2123889|  4900|              0.85|  2.247658|
+-----------+------+------------------+----------+
only showing top 5 rows



In [55]:
predictions_3.show(10)

+-----------+------+------------------+----------+
|steam_index|app_id|  playtime_forever|prediction|
+-----------+------+------------------+----------+
|     811324|  4900| 4.416666666666667|  3314.707|
|    2120068|  4900| 3.466666666666667| 2171.2124|
|    2148378|  4900|              27.0| 4592.3887|
|     335917|  4900|100.28333333333333| 5.5308027|
|    2375591|  4900|            1025.8|126.540184|
|    2143574|  4900|0.4666666666666667|  2755.672|
|     206781|  4900| 8.233333333333333| 814.20135|
|    2024246|  4900|               0.1|       0.0|
|    2746450|  4900|21.666666666666668| 1825.4503|
|     722988|  4900|1.0833333333333333|  11152.96|
+-----------+------+------------------+----------+
only showing top 10 rows



In [59]:
predictions_5.show(10)

+-----------+------+------------------+----------+
|steam_index|app_id|  playtime_forever|prediction|
+-----------+------+------------------+----------+
|    1339828|  4900| 7.266666666666667| 0.3008237|
|     812123|  4900|               0.6|  1.081293|
|    1949141|  4900|0.5166666666666667|  5.919717|
|      58991|  4900|0.3333333333333333|   6.27068|
|     356349|  4900|               1.7| 1.1037369|
|    2282249|  4900|1.4666666666666666| 1.1649313|
|    3122548|  4900|1.1666666666666667| 2.6306977|
|    1136188|  4900|0.6666666666666666| 0.8951329|
|    1231473|  4900|               5.3| 5.5917764|
|    1048586|  9900|20.216666666666665|  8.696039|
+-----------+------+------------------+----------+
only showing top 10 rows



In [159]:
predictions_4.show(10)

+-----------+------+--------------------+----------+
|steam_index|app_id|    playtime_forever|prediction|
+-----------+------+--------------------+----------+
|    2032309|  9900|   5.316666666666666|  158.2825|
|     856368|  9900|  117.78333333333333|  7.161015|
|    2623779|  9900|                0.15| 29.764618|
|     488689|  9900|                31.6|  64.76349|
|    1929938|  9900|               12.55| 27.684433|
|    3075929|  9900|  17.133333333333333|       0.0|
|    1526727|  9900|   7.016666666666667| 16.368141|
|    1793926|  9900|0.016666666666666666|  7.973361|
|    1917644|  9900|               194.1| 51.470474|
|    1288484|  9900|   67.73333333333333| 1.6231619|
+-----------+------+--------------------+----------+
only showing top 10 rows



In [51]:
predictions_2.show(10)

+-----------+------+-------------------+----------+
|steam_index|app_id|   playtime_forever|prediction|
+-----------+------+-------------------+----------+
|     562738|  4900|  3.033333333333333|  67.27994|
|    2091640|  4900|                5.1| 226.08443|
|    1652511|  4900|               0.95| 28.370527|
|    1912107|  4900|0.08333333333333333| 79.703156|
|    2831656|  4900|0.08333333333333333| 53.089333|
|     888100|  4900|               12.5| 242.00189|
|    2835976|  4900|  48.71666666666667| 1.5052088|
|    1352421|  4900| 0.5333333333333333| 0.7748907|
|    1572885|  4900|               1.65| 60.898346|
|     119194|  4900| 0.5333333333333333| 2956.1743|
+-----------+------+-------------------+----------+
only showing top 10 rows



In [52]:
predictions_1.show(10)

+-----------+------+-------------------+-----------+
|steam_index|app_id|   playtime_forever| prediction|
+-----------+------+-------------------+-----------+
|    2044692|  4900| 107.41666666666667|  27.471344|
|     551670|  4900|0.16666666666666666|   105.5918|
|    1994918|  4900|               0.15|0.033719238|
|     456250|  4900|                0.2|   79.89779|
|     538543|  4900|               0.35|  53.241707|
|    3106598|  4900|               3.55|   67.99861|
|    1138296|  4900|              354.1|  11.132031|
|    2078330|  4900|0.16666666666666666|  13.484777|
|    2444637|  4900|0.36666666666666664|        0.0|
|    3030788|  4900|  5.283333333333333|  47.547173|
+-----------+------+-------------------+-----------+
only showing top 10 rows



---

### e. Recommend top 10 games for each cluster

In [17]:
# Game: app_id + title + year
game_title = als_df.select('app_id', 'title', 'year').dropDuplicates()
game_title.count()

3466

In [18]:
game_title.show(2)

+------+--------------------+----+
|app_id|               title|year|
+------+--------------------+----+
| 15520|AaAaAA!!! - A Rec...|2009|
|  9200|                RAGE|2011|
+------+--------------------+----+
only showing top 2 rows



In [19]:
recommend_game = []

def recommend_top10_game(model):
    userRecs = model.recommendForAllUsers(10)
    
    for i in userRecs.toPandas().recommendations:
        for t in i:
            recommend_game.append(t[0])
            
    games = spark.createDataFrame(recommend_game, \
                                  IntegerType()).groupBy('value').count().orderBy('count', ascending = False)
    
    games = games.withColumnRenamed('value', 'app_id').join(game_title, 'app_id', how = 'left').orderBy('count', ascending = False)
    
    return games

__Cluster 0__

In [133]:
games_0 = recommend_top10_game(model_0)

In [134]:
games_0.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,47410,91822,Stronghold Kingdoms,2012
1,36620,75588,Forsaken World,2011
2,227400,63631,Darkfall Unholy Wars,2013
3,46770,49917,Making History II: The War of the World,2010
4,212200,47844,Mabinogi,2012
5,8500,45498,EVE Online,2010
6,4900,39649,Zen of Sudoku,2006
7,209710,38734,War of the Immortals,2012
8,212160,34961,Vindictus,2012
9,243580,29117,Maya LT (with Stingray),2014


__Cluster 1__

In [20]:
games_1 = recommend_top10_game(model_1)

In [21]:
games_1.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,30,814381,Day of Defeat,2003
1,40,814381,Deathmatch Classic,2001
2,20,814381,Team Fortress Classic,1999
3,10,814376,Counter-Strike,2000
4,50,649566,Half-Life: Opposing Force,1999
5,60,565005,Ricochet,2000
6,227400,475006,Darkfall Unholy Wars,2013
7,70,449038,Half-Life,1998
8,1313,424343,SiN Episodes: Emergence,2006
9,3483,424343,Peggle Extreme,2007


__Cluster 2__

In [57]:
games_2 = recommend_top10_game(model_2)
games_2.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,47410,173069,Stronghold Kingdoms,2012
1,257650,168503,Mosaico,2013
2,212200,146413,Mabinogi,2012
3,38020,107726,Mahjong Quest Collection,2009
4,60340,100746,Luxor: 5th Passage,2011
5,46770,96939,Making History II: The War of the World,2010
6,212240,92533,Atlantica Online,2012
7,36620,82707,Forsaken World,2011
8,243580,75849,Maya LT (with Stingray),2014
9,113400,62684,APB Reloaded,2011


In [25]:
games_2 = recommend_top10_game(model_2)
games_2.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,20,814733,Team Fortress Classic,1999
1,30,814733,Day of Defeat,2003
2,10,814728,Counter-Strike,2000
3,40,814703,Deathmatch Classic,2001
4,50,649835,Half-Life: Opposing Force,1999
5,60,565271,Ricochet,2000
6,227400,532520,Darkfall Unholy Wars,2013
7,70,449276,Half-Life,1998
8,3483,424460,Peggle Extreme,2007
9,1313,424460,SiN Episodes: Emergence,2006


__Cluster3__

In [132]:
games_3 = recommend_top10_game(model_3)
games_3.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,36620,66308,Forsaken World,2011
1,227400,60904,Darkfall Unholy Wars,2013
2,47410,59022,Stronghold Kingdoms,2012
3,46770,49917,Making History II: The War of the World,2010
4,8500,41169,EVE Online,2010
5,4900,39649,Zen of Sudoku,2006
6,209710,37523,War of the Immortals,2012
7,212200,36005,Mabinogi,2012
8,212160,34961,Vindictus,2012
9,34460,25960,Civilization IV: Beyond the Sword,2007


__Cluster4__

In [161]:
games_4 = recommend_top10_game(model_4)
games_4.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,20,564493,Team Fortress Classic,1999
1,30,564492,Day of Defeat,2003
2,40,564492,Deathmatch Classic,2001
3,10,564485,Counter-Strike,2000
4,50,446848,Half-Life: Opposing Force,1999
5,60,403343,Ricochet,2000
6,70,309773,Half-Life,1998
7,1313,281034,SiN Episodes: Emergence,2006
8,3483,281034,Peggle Extreme,2007
9,2545,280336,RIP - Trilogy???,2007


__Cluster 5__

In [128]:
games_5 = recommend_top10_game(model_5)
games_5.limit(15).toPandas()

Unnamed: 0,app_id,count,title,year
0,257650,1053,Mosaico,2013
1,22490,713,Fallout: New Vegas,2010
2,39120,569,RIFT,2011
3,113400,523,APB Reloaded,2011
4,227320,511,You Need a Budget 4,2012
5,570,509,Dota 2,2013
6,109600,487,Neverwinter,2013
7,39210,469,FINAL FANTASY?? XIV: A Realm Reborn???,2014
8,440,431,Team Fortress 2,2007
9,10190,415,Call of Duty??: Modern Warfare?? 2,2009


----

### Scraping data

Leave it there for now

In [78]:
# scraping data
scraping = spark.read.csv('/user/tamng/jwht/SteamData/Steam_Scraping.csv',inferSchema = True, header = True)

In [73]:
scraping = scraping.withColumn('release_date', \
                              (scraping.release_date).cast('timestamp'))

In [75]:
# Change format to timestamp
scraping = scraping.withColumn('YearReleased', \
                               year(F.col('release_date')))

In [76]:
scraping.printSchema()

root
 |-- game_url: string (nullable = true)
 |-- img_url: string (nullable = true)
 |-- app_id: integer (nullable = true)
 |-- game_name: string (nullable = true)
 |-- release_date: timestamp (nullable = true)
 |-- platforms: string (nullable = true)
 |-- reviews_summary: string (nullable = true)
 |-- original_price: string (nullable = true)
 |-- discount_rate: string (nullable = true)
 |-- discounted_price: string (nullable = true)
 |-- YearReleased: integer (nullable = true)



In [79]:
scraping.limit(5).toPandas()

Unnamed: 0,game_url,img_url,app_id,game_name,release_date,platforms,reviews_summary,original_price,discount_rate,discounted_price
0,https://store.steampowered.com/app/730/Counter...,https://steamcdn-a.akamaihd.net/steam/apps/730...,730,Counter-Strike: Global Offensive,21-Aug-12,"['Windows', 'Mac os', 'Linux']","Very Positive87% of the 4,285,604 user reviews...",Free to Play,,
1,https://store.steampowered.com/app/397540/Bord...,https://steamcdn-a.akamaihd.net/steam/apps/397...,397540,Borderlands 3,13-Mar-20,['Windows'],"Very Positive81% of the 26,362 user reviews fo...",$59.99,50%,$29.99
2,https://store.steampowered.com/app/582010/MONS...,https://steamcdn-a.akamaihd.net/steam/apps/582...,582010,MONSTER HUNTER: WORLD,9-Aug-18,['Windows'],"Very Positive82% of the 144,277 user reviews f...",$29.99,34%,$19.79
3,https://store.steampowered.com/app/306130/The_...,https://steamcdn-a.akamaihd.net/steam/apps/306...,306130,The Elder Scrolls® Online,22-May-17,"['Windows', 'Mac os']","Very Positive81% of the 58,008 user reviews fo...",$19.99,,
4,https://store.steampowered.com/app/359550/Tom_...,https://steamcdn-a.akamaihd.net/steam/apps/359...,359550,Tom Clancy's Rainbow Six® Siege,1-Dec-15,['Windows'],"Very Positive88% of the 552,650 user reviews f...",$19.99,,
