# Hybrid Model using game tags

First only content based filtering

In [2]:
#get the data 
csvPath = "../Datasets/dataWithRatings.csv"

In [3]:
import pandas as pd
df = pd.read_csv (csvPath)

### Select a random user

In [4]:
#select a random user that has played more that minHours #JUST LIKE LAB1
minHours = 500
df['hours'] = df['hours'].str.replace(',', '.')
df['hours'] = df['hours'].astype(float)
users_ratings = df.groupby('user_id').hours.agg(['sum']).reindex(df.user_id).reset_index()
# users_ratings = df.groupby('user_id').total.agg(['sum']).reindex(df.idx).reset_index()
display(users_ratings)
selected = users_ratings['sum'] > minHours

selected_users = df.loc[selected]

random_selected = selected_users.sample() 
select_column_df = random_selected.reset_index()['user_id'] 
selected_user = select_column_df.iloc[0] 
print("Selected user: " + str(selected_user))

Unnamed: 0,user_id,sum
0,151603712,134.5
1,87445402,87.7
2,25096601,208.0
3,211925330,848.0
4,115396529,365.7
...,...,...
36252,154230723,923.1
36253,116564064,489.0
36254,135400225,1205.5
36255,135400225,1205.5


Selected user: 5990132


### Rated games by the user

In [5]:
def get_rated_games_user(selected_user):
    selected_user_ratings = df.loc[df['user_id'] == selected_user]
    # selected_user_ratings = selected_user_ratings.sort_values(by='hours', ascending=True)
#     print("Rated games: " + str(selected_user_ratings.shape[0]))
    # display(selected_user_ratings.head(10))
    rated_games_df = selected_user_ratings

    rated_games_df = rated_games_df[['name', 'game_description', 'Rating M1', 'Rating M2']]

#     display(rated_games_df.head(10))    
    return rated_games_df


### Unrated games by the user

In [6]:
def get_unrated_games_user(rated_games_df):
    diff = set(df.index) - set(rated_games_df.index)
    unrated_games_df = df.loc[diff]
    unrated_games_df = unrated_games_df[['name', 'game_description']]
    unrated_games_df = unrated_games_df.drop_duplicates(subset ="name") #Had to remove duplicates 
#     print("Unrated games: " + .str(unrated_games_df.shape[0]))
#     display(unrated_games_df.head(10))
    return unrated_games_df

## Content Based Filtering
#### Remove users who dont have at least 30 games rated

In [7]:
import numpy as np
users = np.array(selected_users['user_id'])
unique, counts = np.unique(users, return_counts=True)
main_users =list()
for i, val in enumerate(unique):
    if counts[i] >= 100:
        main_users.append(val)

#### Train KNN for each user using Rating M1 and Rating M2 seperately for all users

In [24]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import LogisticRegression

pd.options.mode.chained_assignment = None  # default='warn'
# initialise the vectorizer
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
# vectorizer = CountVectorizer()
full_predictions = pd.DataFrame()
# for each gamer in the selected user list (min gameplay time of 300)
for gamer in main_users:
    rated_games_df = get_rated_games_user(gamer)
#     display(rated_games_df)
    unrated_games_df = get_unrated_games_user(rated_games_df)
        
    try:
        X = vectorizer.fit_transform(rated_games_df['game_description'])
        y = rated_games_df['Rating M1']
        indices = np.arange(X.shape[0])
        (
            X_train,
            X_test,
            y_train,
            y_test,
            indices_train,
            indices_test,
        ) = train_test_split(X, y, indices, test_size=0.3, random_state=101)

        linReg = LinearRegression().fit(X_train, y_train)
        neigh = KNeighborsRegressor(n_neighbors=14).fit(X_train, y_train)
        nearCentroid = NearestCentroid().fit(X_train, y_train)
        lr = LogisticRegression().fit(X_train, y_train)
    
        X_unrated = vectorizer.transform(unrated_games_df['game_description'].values.astype('U'))

        # predicting already existing rated values 
        y_pred1_reg = linReg.predict(X_test)
        y_pred1_neigh = neigh.predict(X_test)
        y_pred1_nearCentroid = nearCentroid.predict(X_test)
        y_pred1_lr = lr.predict(X_test)


        # predicting unrated games
        y_pred1_unrated_reg = linReg.predict(X_unrated)
        y_pred1_unrated_neigh = neigh.predict(X_unrated)
        y_pred1_unrated_nearCentroid = nearCentroid.predict(X_unrated)
        y_pred1_unrated_lr = lr.predict(X_unrated)


        unrated_games_df['predicted_ratings_KNN M1'] = y_pred1_unrated_neigh
        unrated_games_df['predicted_ratings_Reg M1'] = y_pred1_unrated_reg
        unrated_games_df['predicted_ratings_nearCent M1'] = y_pred1_unrated_nearCentroid
        unrated_games_df['predicted_ratings_lr M1'] = y_pred1_unrated_lr


        tt = rated_games_df.iloc[indices_test]

        tt['y_test_KNN M1'] = y_pred1_neigh
        tt['y_test_Reg M1'] = y_pred1_reg
        tt['y_test_nearCent M1'] = y_pred1_nearCentroid
        tt['y_test_lr M1'] = y_pred1_lr


        #                                                   ''' Second Algorithms '''

        y2 = rated_games_df['Rating M2']
        indices2 = np.arange(X.shape[0])
        (
            X_train2,
            X_test2,
            y_train2,
            y_test2,
            indices_train2,
            indices_test2,
        ) = train_test_split(X, y2, indices2, test_size=0.3, random_state=101)

        linReg2 = LinearRegression().fit(X_train, y_train2)
        neigh2 = KNeighborsRegressor(n_neighbors=14).fit(X_train, y_train2)
        nearCentroid2 = NearestCentroid().fit(X_train, y_train2)
        lr2 = LogisticRegression().fit(X_train, y_train2)

        y_unrated2_Reg = linReg2.predict(X_unrated)
        y_pred2_Reg = linReg2.predict(X_test)
    
        y_unrated2_neigh = neigh2.predict(X_unrated)
        y_pred2_neigh = neigh2.predict(X_test)

        y_unrated2_nearCentroid = nearCentroid2.predict(X_unrated)
        y_pred2_nearCentroid = nearCentroid2.predict(X_test)

        y_unrated2_lr2 = lr2.predict(X_unrated)
        y_pred2_lr2 = lr2.predict(X_test)

        unrated_games_df['predicted_ratings_KNN M2'] = y_unrated2_neigh
        unrated_games_df['predicted_ratings_Reg M2'] = y_unrated2_Reg
        unrated_games_df['predicted_ratings_nearCent M2'] = y_unrated2_nearCentroid
        unrated_games_df['predicted_ratings_lr2 M2'] = y_unrated2_lr2

        
        unrated_games_df_KNN_ordering = unrated_games_df.sort_values(by='predicted_ratings_KNN M2', ascending=False)
        unrated_games_df_Reg_ordering = unrated_games_df.sort_values(by='predicted_ratings_Reg M2', ascending=False)
        unrated_games_df_nearCent_ordering = unrated_games_df.sort_values(by='predicted_ratings_nearCent M2', ascending=False)
        unrated_games_df_lr2_ordering = unrated_games_df.sort_values(by='predicted_ratings_lr2 M2', ascending=False)


    #     display(unrated_games_df_KNN_ordering.head(10))

        tt['y_test_KNN M2'] = y_pred2_neigh
        tt['y_test_Reg M2'] = y_pred2_Reg
        tt['y_test_nearCent M2'] = y_pred2_nearCentroid
        tt['y_test_lr2 M2'] = y_pred2_lr2

        full_predictions = full_predictions.append(tt)
        
    except:
        continue

In [27]:
display(full_predictions)

Unnamed: 0,name,game_description,Rating M1,Rating M2,y_test_KNN M1,y_test_Reg M1,y_test_nearCent M1,y_test_lr M1,y_test_KNN M2,y_test_Reg M2,y_test_nearCent M2,y_test_lr2 M2
26624,Stronghold HD,"About This Game The original castle sim, Stro...",1,2,1.571429,1.414380,1,1,2.571429,2.794980,5,5
15930,Race The Sun,About This Game You are a solar craft. The su...,1,2,1.642857,1.476465,1,1,3.428571,3.116645,5,5
21334,The Ball,About This Game The Ball is a first person ac...,1,1,1.642857,1.551172,1,1,3.142857,3.279148,5,5
32272,Screencheat,About This Game Screencheat is the ridiculous...,1,5,2.000000,1.552082,1,1,2.285714,3.143551,5,5
34374,Ironclad Tactics,About This Game Ironclad Tactics is a fast-p...,1,5,1.714286,1.634706,1,1,2.785714,3.341119,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
16260,Out There Somewhere,About This Game From the artists behind the s...,2,5,2.285714,2.616428,2,2,3.928571,4.221632,5,5
16649,Spelunky,About This Game Spelunky is a unique platform...,3,2,2.642857,2.666732,2,2,3.428571,4.044889,5,5
21421,Torchlight II,About This Game The award-winning Action RPG ...,5,3,2.714286,3.020324,2,2,2.714286,3.826765,5,5
35556,The Nightmare Cooperative,About This Game The Nightmare Cooperative is ...,2,5,2.500000,2.662275,2,2,3.928571,4.131349,5,5


In [29]:
from sklearn.metrics import mean_squared_error

rmse_M1_KNN = mean_squared_error(tt['Rating M1'], tt['y_test_KNN M1'])
rmse_M2_KNN = mean_squared_error(tt['Rating M2'], tt['y_test_KNN M2'])

rmse_M1_Reg = mean_squared_error(tt['Rating M1'], tt['y_test_Reg M1'])
rmse_M2_Reg = mean_squared_error(tt['Rating M2'], tt['y_test_Reg M2'])

rmse_M1_nearCent = mean_squared_error(tt['Rating M1'], tt['y_test_nearCent M1'])
rmse_M2_nearCent = mean_squared_error(tt['Rating M2'], tt['y_test_nearCent M2'])

rmse_M1_lr = mean_squared_error(tt['Rating M1'], tt['y_test_lr M1'])
rmse_M2_lr = mean_squared_error(tt['Rating M2'], tt['y_test_lr M2'])

print('RMSE for KNN M1: ' + str(rmse_M1_KNN))
print('RMSE for KNN M2: ' + str(rmse_M2_KNN))

print('RMSE for Reg M1: ' + str(rmse_M1_Reg))
print('RMSE for Reg M2: ' + str(rmse_M2_Reg))

print('RMSE for nearCent M1: ' + str(rmse_M1_nearCent))
print('RMSE for nearCent M2: ' + str(rmse_M2_nearCent))

print('RMSE for lr M1: ' + str(rmse_M1_lr))
print('RMSE for lr M2: ' + str(rmse_M2_lr))

RMSE for KNN M1: 1.4399955634427684
RMSE for KNN M2: 1.4894631765749777
RMSE for Reg M1: 1.4552917860446244
RMSE for Reg M2: 1.424254807970831
RMSE for nearCent M1: 1.673913043478261
RMSE for nearCent M2: 2.1739130434782608


## Collaborative Filtering 
#### UserUser with Lenskit

In [10]:
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
ratings_df1 = df[['user_id','name','Rating M1']]
ratings_df1 = ratings_df1.rename(columns={'user_id':'user', 'name': 'item', 'Rating M1': 'rating'})
display(ratings_df1)
num_recs = 10  # Number of recommmendations to generate
user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(ratings_df1)

ModuleNotFoundError: No module named 'lenskit'

In [None]:
selected_games_useruser1 = recsys.recommend(selected_user, 10) # generate 10 recommendations for the selected user 

display(selected_games_useruser1)

Unnamed: 0,item,score
0,Farming Simulator 2011,4.828475
1,Guild Wars,4.765201
2,EVE Online,4.731073
3,Broken Sword 5 - the Serpent's Curse,4.410861
4,Fallout 4,4.23826
5,Kingdom Rush,4.149742
6,Knights of Honor,4.087136
7,Evil Genius,4.070866
8,Software Inc.,3.919878
9,DRAGON BALL XENOVERSE,3.909945


In [None]:
temp1 = unrated_games_df_KNN_ordering[['name','predicted_ratings_KNN M1']]
temp1 = temp1.rename(columns={'name':'item'})
temp1 = pd.merge(selected_games_useruser1, temp1, on ='item') 
temp1['Weighted Score'] = 0.5*temp1['score']  +  0.5*temp1['predicted_ratings_KNN M1']
final_pred1 = temp1[['item','Weighted Score']]
display(final_pred1)


Unnamed: 0,item,Weighted Score
0,Farming Simulator 2011,3.564237
1,Guild Wars,3.7326
2,EVE Online,3.615536
3,Broken Sword 5 - the Serpent's Curse,3.80543
4,Fallout 4,3.51913
5,Kingdom Rush,3.374871
6,Knights of Honor,3.243568
7,Evil Genius,3.635433
8,Software Inc.,3.159939
9,DRAGON BALL XENOVERSE,3.354973
