In [60]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, lit
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.window import Window
#machine learning
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.recommendation import ALS, ALSModel

from pyspark.sql.types import IntegerType
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
import pandas as pd
from time import time

In [2]:
# Start (or reuse) a local Spark context that uses all available cores.
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))
# SQL entry point used by the rest of the notebook to read CSVs and build DataFrames.
sql_c = SQLContext(sc)

In [4]:
# Game catalogue: read the CSV with its header row and discard the `_c0`
# column (presumably an unnamed index column from the export -- TODO confirm).
games = (
    sql_c.read
    .option('header', True)
    .csv('./all_games.csv')
    .drop('_c0')
)

In [5]:
# Preview the first five rows of the game catalogue.
games.show(5)

+---+--------------------+
| id|                name|
+---+--------------------+
| 10|      Counter-Strike|
| 20|Team Fortress Cla...|
| 30|       Day of Defeat|
| 40|  Deathmatch Classic|
| 50|Half-Life: Opposi...|
+---+--------------------+
only showing top 5 rows



In [100]:
# User ratings: keep only rows whose `time_f` is non-zero, discard the
# `_c0` and `time_f` helper columns, then remove any row containing a null.
ratings = (
    sql_c.read.csv('./user_ratings.csv', header=True)
    .filter(col('time_f') != 0)
    .drop('_c0', 'time_f')
    .dropna()
)

In [101]:
# Replace each distinct `user` value with a dense integer id (1, 2, 3, ...)
# assigned in `user` sort order, since ALS needs numeric user ids.
w = Window.orderBy("user")
username_remap = (
    ratings
    .select('user')
    .distinct()
    .select("user", F.row_number().over(w))
)
# Attach the numeric id to every rating row.
rdf = ratings.join(username_remap, username_remap.user == ratings.user)
# The row-number column is unaliased, so pick it up as the last column of the
# join result and expose it under the name `user`.
ratings = rdf.select(col(rdf.columns[-1]).alias('user'), 'game', 'rating')

In [102]:
# Preview the remapped ratings: numeric user id, game id, rating.
ratings.show(5)

+----+-----+------------------+
|user| game|            rating|
+----+-----+------------------+
|  56|  220| 0.251373588226222|
|  56|29800| 0.355938927193069|
|  56|  550|0.7221448294731989|
|  56|34270| 0.194557132271704|
|  56|42910|0.7853635740402285|
+----+-----+------------------+
only showing top 5 rows



# Train Test Split

In [103]:
# Cast every column to float, drop rows where a cast produced null, then
# split 80/20. A fixed seed keeps the split (and the RMSE figures reported
# below) comparable from one run to the next.
(training, test) = (
    ratings
    .select([col(c).cast('float') for c in ratings.columns])
    .na.drop()
    .randomSplit([0.8, 0.2], seed=42)
)

In [104]:
def fit_als(training,test,reg,rank=10,should_test=True,max_iter=5,seed=None):
    """Fit an explicit-feedback ALS model and optionally report its test RMSE.

    Parameters
    ----------
    training : DataFrame
        Training ratings with columns ``user``, ``game`` and ``rating``.
    test : DataFrame
        Held-out ratings with the same columns; only read when
        ``should_test`` is truthy.
    reg : float
        ALS regularization parameter (``regParam``).
    rank : int, default 10
        Number of latent factors.
    should_test : bool, default True
        When truthy, score ``test`` and print the RMSE together with ``reg``.
    max_iter : int, default 5
        Number of ALS iterations (``maxIter``).
    seed : int or None, default None
        Random seed for factor initialisation. ``None`` leaves Spark's
        default seed in place.

    Returns
    -------
    The fitted model returned by ``ALS.fit``.
    """
    als = ALS(
        maxIter=max_iter,
        regParam=reg,
        userCol="user", itemCol="game", ratingCol="rating",
        # Drop rows whose prediction is NaN (users/games unseen in training)
        # so the RMSE below is not poisoned by NaN.
        coldStartStrategy="drop",
        implicitPrefs=False,
        rank=rank,
    )
    # Only override the seed when one was requested.
    if seed is not None:
        als.setSeed(seed)

    model = als.fit(training)

    if should_test:
        # Evaluate the model by computing the RMSE on the test data
        predictions = model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")

        rmse = evaluator.evaluate(predictions)
        print("Root-mean-square error = " + str(rmse), "Regularization Parameter:", reg)
    return model

In [106]:
# Sweep the regularization parameter over 0.01 .. 0.09 at rank 5.
# np.arange accumulates floating-point drift (e.g. 0.060000000000000005), so
# round the grid to two decimals and pass plain Python floats to ALS.
for reg in np.round(np.arange(0.01,0.1,0.01), 2):
    fit_als(training,test,float(reg),rank=5)

Root-mean-square error = 0.1700715912343747 Regularization Parameter: 0.01
Root-mean-square error = 0.1672929437896569 Regularization Parameter: 0.02
Root-mean-square error = 0.16756545280318474 Regularization Parameter: 0.03
Root-mean-square error = 0.16981067519540283 Regularization Parameter: 0.04
Root-mean-square error = 0.17314498476084672 Regularization Parameter: 0.05
Root-mean-square error = 0.17762697445392028 Regularization Parameter: 0.060000000000000005
Root-mean-square error = 0.18309167253872816 Regularization Parameter: 0.06999999999999999
Root-mean-square error = 0.18868554910324853 Regularization Parameter: 0.08
Root-mean-square error = 0.19380142703294706 Regularization Parameter: 0.09


In [120]:
def recommend_games(game_list,ratings_df,game_df):
    """Recommend games for a synthetic user who likes the titles in ``game_list``.

    Each entry of ``game_list`` is matched against ``game_df.name`` with a SQL
    ``LIKE '%<entry>%'`` pattern, and the id of the first row Spark returns
    for that pattern is used (so a short fragment may resolve to an
    unintended title). A synthetic user who rates every resolved game ``1``
    is appended to ``ratings_df``, an ALS model is trained on the combined
    ratings, and that user's top 10 recommendations are looked up.

    Parameters
    ----------
    game_list : list of str
        Game names, or name fragments, the synthetic user likes.
    ratings_df : DataFrame
        Existing ratings with columns ``user``, ``game`` and ``rating``.
    game_df : DataFrame
        Game catalogue with columns ``id`` and ``name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``rating`` (the predicted rating), sorted by
        ``rating`` in descending order. May hold fewer than 10 rows because
        recommendations whose id is absent from ``game_df`` are dropped by
        the join.

    Raises
    ------
    ValueError
        If ``game_list`` is empty, if an entry matches no game, or if the
        model yields no recommendations for the synthetic user.
    """
    if not game_list:
        raise ValueError("game_list must contain at least one game name")

    rating_columns = ['user', 'game', 'rating']

    # Resolve every requested name to a game id.
    game_ids = []
    for game in game_list:
        match = (
            game_df
            .filter(game_df.name.like('%' + game + '%'))
            .select('id')
            .first()
        )
        if match is None:
            raise ValueError("No game found matching %r" % (game,))
        game_ids.append(match[0])

    # Synthetic user id; assumes no real user in ratings_df has this id --
    # TODO confirm against the size of the user remapping.
    user_id = 9999999

    my_ratings = [(user_id, game_id, 1) for game_id in game_ids]
    my_ratings_df = sql_c.createDataFrame(my_ratings, rating_columns)

    # union() matches columns by position, so pin the column order explicitly.
    training_with_my_ratings_df = ratings_df.select(rating_columns).union(my_ratings_df)

    als = ALS(
        maxIter=5,
        regParam=0.04,
        userCol="user", itemCol="game", ratingCol="rating",
        coldStartStrategy="drop",
        implicitPrefs=False,
        rank=10,
    )

    # Cast to numeric and drop rows that failed the cast. The columns come
    # from this function's own inputs rather than a notebook-level global.
    df = training_with_my_ratings_df.select(
        [col(c).cast('float') for c in rating_columns]
    ).na.drop()

    model = als.fit(df)

    target_user = df.filter(col('user') == user_id).select('user')
    top_row = (
        model.recommendForUserSubset(target_user, 10)
        .select('recommendations')
        .first()
    )
    if top_row is None:
        raise ValueError("No recommendations produced for the synthetic user")

    # The single cell holds a list of (game, rating) rows; turn it into a DataFrame.
    recommended_games = sql_c.createDataFrame(top_row[0])

    # A join does not guarantee row order, so sort by predicted rating explicitly.
    return (
        recommended_games
        .join(game_df, recommended_games.game == game_df.id)
        .select(['name', 'rating'])
        .orderBy(col('rating').desc())
        .toPandas()
    )

In [124]:
# Example: recommendations for a user who likes Counter-Strike and Rocket League.
# Names are substring-matched against the catalogue inside recommend_games.
recommend_games(['Counter-Strike','Rocket League'],ratings,games)

Unnamed: 0,name,rating
0,Major\Minor - EA,1.462579
1,Construct 2 Personal,1.387183
2,Football Manager 2015,1.321476
3,Football Manager 2016,1.283535
4,Counter-Strike: Global Offensive,1.278712
5,Football Manager 2013,1.275516
6,Football Manager 2012,1.264505
7,MLB 2K12,1.264489
8,Football Manager 2014,1.251777
9,MAGIX Movie Edit Pro 2013 Plus,1.206161


In [125]:
# Example: recommendations for a user who likes DOOM and Turok.
recommend_games(['DOOM','Turok',],ratings,games)

Unnamed: 0,name,rating
0,Construct 2 Personal,1.572344
1,Major\Minor - EA,1.480731
2,Counter-Strike: Global Offensive,1.401852
3,Maya LT,1.396167
4,Grand Theft Auto V,1.351077
5,Siralim 2,1.343995
6,Football Manager 2015,1.343987
7,Football Manager 2016,1.325262
8,MAGIX Movie Edit Pro 2013 Plus,1.322966
9,Don Bradman Cricket 14,1.314129


In [123]:
# Example: recommendations for a user who likes GTA and Skyrim.
# NOTE(review): 'GTA' is matched as a substring of the game name, so it may
# resolve to an unintended title -- confirm which game id it picks up.
recommend_games(['GTA','Elder Scrolls V: Skyrim',],ratings,games)

Unnamed: 0,name,rating
0,Hounds: The Last Hope,1.416453
1,Construct 2 Personal,1.395079
2,Square Arena,1.367385
3,Root Double -Before Crime * After Days- Xtend ...,1.360764
4,Getsuei Gakuen -kou-,1.359375
5,Campus Notes - forget me not.,1.358148
6,LostWinds 2: Winter of the Melodias,1.351925
7,The Beggar's Ride,1.351219
8,Borstal,1.343642
9,Empty Soul - S&S Edition,1.338934


In [None]:
#Next Steps: Go back and mess with implicit ratings, and check ideal distributions for ALS