In [58]:

import os
import zipfile
import json

# library for data processing
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# library to make the recommendation system model
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# library for evaluate the machine learning model
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

In [59]:
data = pd.read_csv("games_cleaned.csv")
data

Unnamed: 0,id,title,genre,release_year,popularity,developer,publisher,price_per_day,rating
0,1,Dead or Alive: Dimensions,Fighting,2011,8.2,Team Ninja,Ubisoft Annecy,8654,T
1,2,Yaiba: Ninja Gaiden Z,Action,2014,4.5,"Spark Unlimited, comcept",Tecmo Koei,6113,M
2,3,Dynasty Warriors Gundam,Action,2007,6.8,"Koei, Omega Force",Namco Bandai Games,9033,T
3,4,Kengo: Master of Bushido,Fighting,2000,8.4,Light Weight,Ubisoft,5044,M
4,5,Watch Dogs,Action,2014,6.4,Ubisoft Romania,Ubisoft,8578,M
...,...,...,...,...,...,...,...,...,...
995,996,Motocross Mania 3,Racing,2005,3.6,Deibus Studios,Take-Two Interactive,5103,T
996,997,Rocky Balboa,Sports,2007,6.6,Ubisoft,Ubisoft,8502,T
997,998,Lemony Snicket's A Series of Unfortunate Events,Platform,2004,8.0,Griptonite Games,Activision,6760,E
998,999,The Settlers: Rise of an Empire,Strategy,2007,7.7,Blue Byte,Ubisoft,6903,E10+


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   int64  
 1   title          1000 non-null   object 
 2   genre          1000 non-null   object 
 3   release_year   1000 non-null   int64  
 4   popularity     1000 non-null   float64
 5   developer      1000 non-null   object 
 6   publisher      1000 non-null   object 
 7   price_per_day  1000 non-null   int64  
 8   rating         1000 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 70.4+ KB


In [61]:
data.isnull().sum()

id               0
title            0
genre            0
release_year     0
popularity       0
developer        0
publisher        0
price_per_day    0
rating           0
dtype: int64

In [62]:
data = data.dropna()

In [63]:
data.isnull().sum()

id               0
title            0
genre            0
release_year     0
popularity       0
developer        0
publisher        0
price_per_day    0
rating           0
dtype: int64

In [64]:
data = data.reset_index(drop=True)
data

Unnamed: 0,id,title,genre,release_year,popularity,developer,publisher,price_per_day,rating
0,1,Dead or Alive: Dimensions,Fighting,2011,8.2,Team Ninja,Ubisoft Annecy,8654,T
1,2,Yaiba: Ninja Gaiden Z,Action,2014,4.5,"Spark Unlimited, comcept",Tecmo Koei,6113,M
2,3,Dynasty Warriors Gundam,Action,2007,6.8,"Koei, Omega Force",Namco Bandai Games,9033,T
3,4,Kengo: Master of Bushido,Fighting,2000,8.4,Light Weight,Ubisoft,5044,M
4,5,Watch Dogs,Action,2014,6.4,Ubisoft Romania,Ubisoft,8578,M
...,...,...,...,...,...,...,...,...,...
995,996,Motocross Mania 3,Racing,2005,3.6,Deibus Studios,Take-Two Interactive,5103,T
996,997,Rocky Balboa,Sports,2007,6.6,Ubisoft,Ubisoft,8502,T
997,998,Lemony Snicket's A Series of Unfortunate Events,Platform,2004,8.0,Griptonite Games,Activision,6760,E
998,999,The Settlers: Rise of an Empire,Strategy,2007,7.7,Blue Byte,Ubisoft,6903,E10+


In [65]:
# save game names on new dataframe
df_game_name = pd.DataFrame({'Game': data['title']}).reset_index(drop=True)
df_game_name.head()

Unnamed: 0,Game
0,Dead or Alive: Dimensions
1,Yaiba: Ninja Gaiden Z
2,Dynasty Warriors Gundam
3,Kengo: Master of Bushido
4,Watch Dogs


In [66]:
# use name column as index
data.set_index('title', inplace=True)
data.head()

Unnamed: 0_level_0,id,genre,release_year,popularity,developer,publisher,price_per_day,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Dead or Alive: Dimensions,1,Fighting,2011,8.2,Team Ninja,Ubisoft Annecy,8654,T
Yaiba: Ninja Gaiden Z,2,Action,2014,4.5,"Spark Unlimited, comcept",Tecmo Koei,6113,M
Dynasty Warriors Gundam,3,Action,2007,6.8,"Koei, Omega Force",Namco Bandai Games,9033,T
Kengo: Master of Bushido,4,Fighting,2000,8.4,Light Weight,Ubisoft,5044,M
Watch Dogs,5,Action,2014,6.4,Ubisoft Romania,Ubisoft,8578,M


In [67]:
# select all columns with datatype object
column_object = data.dtypes[data.dtypes == 'object'].keys()
column_object

one_hot_label = pd.get_dummies(data[column_object]).astype(int)
one_hot_label.head(3)

Unnamed: 0_level_0,genre_Action,genre_Adventure,genre_Fighting,genre_Misc,genre_Platform,genre_Puzzle,genre_Racing,genre_Role-Playing,genre_Shooter,genre_Simulation,...,publisher_Vivendi Games,publisher_Warner Bros. Interactive Entertainment,publisher_White Park Bay Software,publisher_Zoo Digital Publishing,publisher_Zoo Games,publisher_id Software,rating_E,rating_E10+,rating_M,rating_T
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dead or Alive: Dimensions,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Yaiba: Ninja Gaiden Z,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Dynasty Warriors Gundam,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
# delete column with data type object
data.drop(column_object,axis=1,inplace=True)
data.head()

Unnamed: 0_level_0,id,release_year,popularity,price_per_day
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dead or Alive: Dimensions,1,2011,8.2,8654
Yaiba: Ninja Gaiden Z,2,2014,4.5,6113
Dynasty Warriors Gundam,3,2007,6.8,9033
Kengo: Master of Bushido,4,2000,8.4,5044
Watch Dogs,5,2014,6.4,8578


In [69]:
# unify one-hot encoding data with whole data
data = pd.concat([data,one_hot_label],axis=1)
data.head()

Unnamed: 0_level_0,id,release_year,popularity,price_per_day,genre_Action,genre_Adventure,genre_Fighting,genre_Misc,genre_Platform,genre_Puzzle,...,publisher_Vivendi Games,publisher_Warner Bros. Interactive Entertainment,publisher_White Park Bay Software,publisher_Zoo Digital Publishing,publisher_Zoo Games,publisher_id Software,rating_E,rating_E10+,rating_M,rating_T
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dead or Alive: Dimensions,1,2011,8.2,8654,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Yaiba: Ninja Gaiden Z,2,2014,4.5,6113,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Dynasty Warriors Gundam,3,2007,6.8,9033,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Kengo: Master of Bushido,4,2000,8.4,5044,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Watch Dogs,5,2014,6.4,8578,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [70]:
column_numeric = list(data.dtypes[data.dtypes == 'float64'].keys())
column_numeric

scaler = MinMaxScaler()
scaled = scaler.fit_transform(data[column_numeric])
i=0
for column in column_numeric:
    data[column] = scaled[:,i]
    i += 1
data.head()

Unnamed: 0_level_0,id,release_year,popularity,price_per_day,genre_Action,genre_Adventure,genre_Fighting,genre_Misc,genre_Platform,genre_Puzzle,...,publisher_Vivendi Games,publisher_Warner Bros. Interactive Entertainment,publisher_White Park Bay Software,publisher_Zoo Digital Publishing,publisher_Zoo Games,publisher_id Software,rating_E,rating_E10+,rating_M,rating_T
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dead or Alive: Dimensions,1,2011,0.844444,8654,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Yaiba: Ninja Gaiden Z,2,2014,0.433333,6113,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Dynasty Warriors Gundam,3,2007,0.688889,9033,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Kengo: Master of Bushido,4,2000,0.866667,5044,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Watch Dogs,5,2014,0.644444,8578,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [71]:
data.to_csv("games_encoded.csv")

In [72]:
model = NearestNeighbors(metric='euclidean')

# Fit model to the data
model.fit(data)

In [73]:
# function to get the game recommendation
def GameRecommend(gamename:str, recommended_games:int=4):
  print(f'If user like playing Game: \n{gamename[0]}\n5 Game that the user might like to play:')
  distances, neighbors = model.kneighbors(data.loc[gamename],n_neighbors=recommended_games)
  similar_game = []
  for gamename in df_game_name.loc[neighbors[0][:]].values:
    similar_game.append(gamename[0])
  similar_distance = []
  for distance in distances[0]:
    similar_distance.append(f"{round(100-distance, 2)}%")
  return pd.DataFrame(data = {"Game" : similar_game[1:], "Similarity" : similar_distance[1:]})

In [74]:
GameRecommend(df_game_name.loc[110])

If user like playing Game: 
The Evil Within
5 Game that the user might like to play:


  print(f'If user like playing Game: \n{gamename[0]}\n5 Game that the user might like to play:')


Unnamed: 0,Game,Similarity
0,Hello Kitty: Roller Rescue,56.78%
1,The Hobbit,55.34%
2,FIFA Soccer 10,54.69%
