# Hybrid Model using game tags

First only content based filtering

In [1]:
#get the data 
csvPath = "../Datasets/dataWithRatings.csv"

In [2]:
import pandas as pd
df = pd.read_csv (csvPath)


### Select a random user

In [5]:
#select a random user that has played more that minHours #JUST LIKE LAB1
minHours = 500
df['hours'] = df['hours'].str.replace(',', '.')
df['hours'] = df['hours'].astype(float)
users_ratings = df.groupby('user_id').hours.agg(['sum']).reindex(df.user_id).reset_index()
# users_ratings = df.groupby('user_id').total.agg(['sum']).reindex(df.idx).reset_index()
display(users_ratings)
selected = users_ratings['sum'] > minHours

selected_users = df.loc[selected]

random_selected = selected_users.sample() 
select_column_df = random_selected.reset_index()['user_id'] 
selected_user = select_column_df.iloc[0] 
print("Selected user: " + str(selected_user))

Unnamed: 0,user_id,sum
0,151603712,134.5
1,87445402,87.7
2,25096601,208.0
3,211925330,848.0
4,115396529,365.7
...,...,...
36252,154230723,923.1
36253,116564064,489.0
36254,135400225,1205.5
36255,135400225,1205.5


Selected user: 11373749


### Rated games by the user

In [6]:
def get_rated_games_user(selected_user):
    selected_user_ratings = df.loc[df['user_id'] == selected_user]
    # selected_user_ratings = selected_user_ratings.sort_values(by='hours', ascending=True)
#     print("Rated games: " + str(selected_user_ratings.shape[0]))
    # display(selected_user_ratings.head(10))
    rated_games_df = selected_user_ratings

    rated_games_df = rated_games_df[['name', 'game_description', 'Rating M1', 'Rating M2']]

#     display(rated_games_df.head(10))    
    return rated_games_df


### Unrated games by the user

In [7]:
def get_unrated_games_user(rated_games_df):
    diff = set(df.index) - set(rated_games_df.index)
    unrated_games_df = df.loc[diff]
    unrated_games_df = unrated_games_df[['name', 'game_description']]
    unrated_games_df = unrated_games_df.drop_duplicates(subset ="name") #Had to remove duplicates 
#     print("Unrated games: " + .str(unrated_games_df.shape[0]))
#     display(unrated_games_df.head(10))
    return unrated_games_df

## Content Based Filtering
#### Remove users who dont have at least 30 games rated

In [9]:
import numpy as np
users = np.array(selected_users['user_id'])
unique, counts = np.unique(users, return_counts=True)
main_users =list()
for i, val in enumerate(unique):
    if counts[i] >= 30:
        main_users.append(val)

#### Train KNN for each user using Rating M1 and Rating M2 seperately for all users

In [10]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None  # default='warn'
# initialise the vectorizer
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
# vectorizer = CountVectorizer()
full_predictions = pd.DataFrame()
# for each gamer in the selected user list (min gameplay time of 300
for gamer in main_users:
    rated_games_df = get_rated_games_user(gamer)
#     display(rated_games_df)
    unrated_games_df = get_unrated_games_user(rated_games_df)
    try:
        X = vectorizer.fit_transform(rated_games_df['game_description'])
        y = rated_games_df['Rating M1']
        indices = np.arange(X.shape[0])
        (
            X_train,
            X_test,
            y_train,
            y_test,
            indices_train,
            indices_test,
        ) = train_test_split(X, y, indices, test_size=0.3, random_state=101)
        neigh = KNeighborsRegressor(n_neighbors=10)
        neigh.fit(X_train, y_train)
    
        X_unrated = vectorizer.transform(unrated_games_df['game_description'].values.astype('U'))
    
        # print(X_unrated.shape)
        #predicting already existing rated values 
        y_pred1 = neigh.predict(X_test)
        #predicting unrated games
        y_pred1_unrated = neigh.predict(X_unrated)
#         print(y_pred1.shape)

        unrated_games_df['predicted_ratings_KNN M1'] = y_pred1_unrated
        tt = rated_games_df.iloc[indices_test]
        tt['y_test M1'] = y_pred1

        y2 = rated_games_df['Rating M2']
        indices2 = np.arange(X.shape[0])
        (
            X_train2,
            X_test2,
            y_train2,
            y_test2,
            indices_train2,
            indices_test2,
        ) = train_test_split(X, y2, indices2, test_size=0.3, random_state=101)
        neigh2 = KNeighborsRegressor(n_neighbors=10)
        neigh2.fit(X_train, y_train2)
    
    
        y_unrated2 = neigh2.predict(X_unrated)
        y_pred2 = neigh2.predict(X_test)

        unrated_games_df['predicted_ratings_KNN M2'] = y_unrated2
        unrated_games_df_KNN_ordering = unrated_games_df.sort_values(by='predicted_ratings_KNN M2', ascending=False)
    #     display(unrated_games_df_KNN_ordering.head(10))

        tt['y_test M2'] = y_pred2
        full_predictions = full_predictions.append(tt)
    except:
        continue
#     tt = tt.sort_values(by='y_test M2', ascending=False)
#     display(tt.head(10))

In [11]:
display(full_predictions)

Unnamed: 0,name,game_description,Rating M1,Rating M2,y_test M1,y_test M2
22578,The Walking Dead,About This Series When the world is ravaged b...,4,5,1.9,2.4
5444,Dota 2,About This Game The most-played game on Stea...,1,1,2.3,2.5
22670,The Testament of Sherlock Holmes,"About This Game YOU ARE SHERLOCK HOLMES, AND...",1,1,1.7,2.7
22983,Surgeon Simulator,About This Game Malpractice Makes Perfect. D...,1,1,1.7,1.7
22901,"Papers, Please",About This Game Congratulations. The October...,1,2,1.2,1.9
...,...,...,...,...,...,...
27067,The Forest,About This Game As the lone survivor of a pas...,5,5,2.6,3.0
28289,Cubic Castles,About This Game 3D Platforming Meets Open Wo...,1,1,2.6,2.8
16548,Outlast,About This Game Hell is an experiment you can...,2,5,2.3,2.0
26752,Killing Floor 2,"About This Game In KILLING FLOOR 2, players d...",1,1,1.8,2.2


In [12]:
from sklearn.metrics import mean_squared_error
rmse_M1 = mean_squared_error(tt['Rating M1'], tt['y_test M1'])
rmse_M2 = mean_squared_error(tt['Rating M2'], tt['y_test M2'])
print('RMSE for M1: ' + str(rmse_M1))
print('RMSE for M2: ' + str(rmse_M2))

RMSE for M1: 2.3615000000000004
RMSE for M2: 2.854


## Collaborative Filtering 
#### UserUser with Lenskit

In [29]:
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
ratings_df1 = df[['user_id','name','Rating M1']]
ratings_df1 = ratings_df1.rename(columns={'user_id':'user', 'name': 'item', 'Rating M1': 'rating'})
display(ratings_df1)
num_recs = 10  # Number of recommmendations to generate
user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(ratings_df1)

Unnamed: 0,user,item,rating
0,151603712,Fallout 4,5
1,87445402,Fallout 4,5
2,25096601,Fallout 4,1
3,211925330,Fallout 4,3
4,115396529,Fallout 4,1
...,...,...,...
36252,154230723,Khet 2.0,1
36253,116564064,SAMURAI WARRIORS 4-II,1
36254,135400225,Life is Hard,1
36255,135400225,Executive Assault,1


<lenskit.algorithms.ranking.TopN at 0x12228542c70>

In [30]:
selected_games_useruser1 = recsys.recommend(selected_user, 10) # generate 10 recommendations for the selected user 

display(selected_games_useruser1)

Unnamed: 0,item,score
0,Fallout 4,4.677644
1,DeathSpank,4.656558
2,Fairy Fencer F,4.642703
3,EVE Online,4.621423
4,Impire,4.360516
5,Heroes of Scene,4.227792
6,Holy Potatoes! A Weapon Shop?!,4.187592
7,Kerbal Space Program,4.181591
8,Grim Dawn,4.161434
9,Elite Dangerous,4.072839


In [45]:
temp1 = unrated_games_df_KNN_ordering[['name','predicted_ratings_KNN M1']]
temp1 = temp1.rename(columns={'name':'item'})
temp1 = pd.merge(selected_games_useruser1, temp1, on ='item') 
temp1['Weighted Score'] = 0.5*temp1['score']  +  0.5*temp1['predicted_ratings_KNN M1']
final_pred1 = temp1[['item','Weighted Score']]
display(final_pred1)


Unnamed: 0,item,Weighted Score
0,Fallout 4,3.938822
1,DeathSpank,4.028279
2,Fairy Fencer F,3.621351
3,EVE Online,4.210711
4,Impire,3.080258
5,Heroes of Scene,3.913896
6,Holy Potatoes! A Weapon Shop?!,3.893796
7,Kerbal Space Program,3.790795
8,Grim Dawn,3.980717
9,Elite Dangerous,3.43642
