In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
# Read the game table and the recommendation table from CSV files in the working directory
games_df, recommendation_df = (
    pd.read_csv(csv_name) for csv_name in ('games.csv', 'recommendations.csv')
)

# Show the first rows of each table as a quick sanity check
print(f'Preview of Games DataFrame:\n{games_df.head()}\n')
print(f'Preview of Recommendations DataFrame:\n{recommendation_df.head()}\n')

Preview of Games DataFrame:
   app_id                              title date_release   win    mac  linux  \
0   13500  Prince of Persia: Warrior Within™   2008-11-21  True  False  False   
1   22364            BRINK: Agents of Change   2011-08-03  True  False  False   
2  113020       Monaco: What's Yours Is Mine   2013-04-24  True   True   True   
3  226560                 Escape Dead Island   2014-11-18  True  False  False   
4  249050            Dungeon of the ENDLESS™   2014-10-27  True   True  False   

          rating  positive_ratio  user_reviews  price_final  price_original  \
0  Very Positive              84          2199         9.99            9.99   
1       Positive              85            21         2.99            2.99   
2  Very Positive              92          3722        14.99           14.99   
3          Mixed              61           873        14.99           14.99   
4  Very Positive              88          8784        11.99           11.99   

   discoun

In [3]:
# Columns kept from each table; 'app_id' appears in both and serves as the join key
game_columns = ['app_id', 'title', 'rating', 'positive_ratio', 'user_reviews']
rec_columns = ['app_id', 'hours']
games = games_df[game_columns]
rec = recommendation_df[rec_columns]

# Join the two selections on 'app_id' (merge's default join type) and preview the result
df = games.merge(rec, on='app_id')
print(df.head())

   app_id                              title         rating  positive_ratio  \
0   13500  Prince of Persia: Warrior Within™  Very Positive              84   
1   13500  Prince of Persia: Warrior Within™  Very Positive              84   
2   13500  Prince of Persia: Warrior Within™  Very Positive              84   
3   13500  Prince of Persia: Warrior Within™  Very Positive              84   
4   13500  Prince of Persia: Warrior Within™  Very Positive              84   

   user_reviews  hours  
0          2199    0.4  
1          2199   15.2  
2          2199   19.4  
3          2199   10.8  
4          2199   10.9  


In [4]:
# Column groups used for the comparisons (lists, so they can be concatenated with +)
field1 = ['positive_ratio']    # user feedback ratio
field2 = ['user_reviews']      # number of user reviews
field3 = ['hours']             # hours played

numeric_fields = field1 + field2 + field3

# Coerce every comparison column to a numeric dtype; values that cannot be parsed become NaN
df[numeric_fields] = df[numeric_fields].apply(pd.to_numeric, errors='coerce')

# Discard rows missing any comparison value, then renumber the index from 0
df = df.dropna(subset=numeric_fields).reset_index(drop=True)

In [5]:
# One single-item list per row ([[title], ...]); later cells read entries as titles[i][0]
titles = df[['title']].values.tolist()

target_title = "Counter-Strike: Global Offensive"

# Index label of the first row whose title matches exactly
# (an IndexError is raised here when no row matches)
target_game_index = df.index[df['title'] == target_title][0]

# Comparison values of that row as floats, labelled by column name
comparison_fields = field1 + field2 + field3
target_game = df.loc[target_game_index, comparison_fields].astype(float)



In [6]:
# Single-row queries holding the target's value for each metric, selected by column label
positive_ratio_target = [target_game[field1]]
user_reviews_target = [target_game[field2]]
hours_target = [target_game[field3]]

# Distance from every row to the target for each metric; the result has one
# column per query row, so column 0 is the only one
positive_ratio_distances = euclidean_distances(df[field1], positive_ratio_target)[:, 0]
user_reviews_distances = euclidean_distances(df[field2], user_reviews_target)[:, 0]
hours_distances = euclidean_distances(df[field3], hours_target)[:, 0]

# Pair each row's index label with its distance so the rows can be ranked later
positive_ratio_query_distances = list(zip(df.index, positive_ratio_distances))
user_reviews_query_distances = list(zip(df.index, user_reviews_distances))
hours_query_distances = list(zip(df.index, hours_distances))

In [7]:
def print_most_similar(heading, query_distances, seen_titles, limit=10):
    """Print the closest games for one metric, listing each title at most once.

    Args:
        heading: Text printed before the ranking.
        query_distances: Iterable of (row index, distance) pairs; the row index
            is used to look the title up in the notebook-level ``titles`` list.
        seen_titles: Set used to remember which titles were already printed.
            It is emptied first so every ranking starts fresh.
        limit: Maximum number of distinct titles to print.
    """
    seen_titles.clear()
    print(heading)
    # sorted() is stable, so rows with equal distance keep their original order
    for index, distance in sorted(query_distances, key=lambda pair: pair[1]):
        if len(seen_titles) >= limit:
            break
        title = titles[index][0]
        if title not in seen_titles:
            print(f"{title} - Distance: {distance}")
            seen_titles.add(title)


# Tracks titles already printed (no duplicate titles); kept at notebook level
# because a later cell clears and reuses it
unique_games_set = set()

# Print most similar games by positive ratio
print_most_similar("\nMost Similar Games by Positive Ratio:",
                   positive_ratio_query_distances, unique_games_set)

# Print most similar games by user reviews
print_most_similar("\nMost Similar Games by User Reviews:",
                   user_reviews_query_distances, unique_games_set)

# Print most similar games by hours played.
# The merged frame repeats a game once per recommendation row, so this ranking
# needs the same unique-title filtering as the two above; previously it took the
# first 10 sorted rows and could list one game several times.
print_most_similar("\nMost Similar Games by Hours Played:",
                   hours_query_distances, unique_games_set)


Most Similar Games by Positive Ratio:
Dungeon of the ENDLESS™ - Distance: 0.0
Osmos - Distance: 0.0
Angvik - Distance: 0.0
Eradicator - Distance: 0.0
Super Mega Baseball 2 - Distance: 0.0
Archon Classic - Distance: 0.0
Drox Operative - Distance: 0.0
Sepia Tears - Distance: 0.0
Demon Hunter 3: Revelation - Distance: 0.0
Old Man's Journey - Distance: 0.0

Most Similar Games by User Reviews:
Counter-Strike: Global Offensive - Distance: 0.0
PUBG: BATTLEGROUNDS - Distance: 5277234.0
Dota 2 - Distance: 5448832.0
Grand Theft Auto V - Distance: 6010338.0
Tom Clancy's Rainbow Six® Siege - Distance: 6501148.0
Team Fortress 2 - Distance: 6508641.0
Terraria - Distance: 6551047.0
Garry's Mod - Distance: 6640727.0
Rust - Distance: 6707792.0
Apex Legends™ - Distance: 6781278.0

Most Similar Games by Hours Played:
Osmos - Distance: 0.0
Barony - Distance: 0.0
Motorsport Manager - Distance: 0.0
THE KING OF FIGHTERS '98 ULTIMATE MATCH FINAL EDITION - Distance: 0.0
Bean Battles - Distance: 0.0
Surviving 

In [8]:
# Repeat the user-review comparison with a second reference title
target_title = "The Binding of Isaac: Rebirth"
target_game_index = df.index[df['title'] == target_title][0]
comparison_fields = field1 + field2 + field3
target_game = df.loc[target_game_index, comparison_fields].astype(float)

# Distance between every row's review count and the new target's review count
user_reviews_target = [target_game[field2]]
user_reviews_distances = euclidean_distances(df[field2], user_reviews_target)[:, 0]
user_reviews_query_distances = list(zip(df.index, user_reviews_distances))

# Forget titles printed by earlier rankings
unique_games_set.clear()

# Walk rows from nearest to farthest, printing each title the first time it appears
print("\nMost Similar Games by User Reviews:")
for index, distance in sorted(user_reviews_query_distances, key=lambda pair: pair[1]):
    already_printed = titles[index][0] in unique_games_set
    if not already_printed:
        print(f"{titles[index][0]} - Distance: {distance}")
        unique_games_set.add(titles[index][0])
    if len(unique_games_set) >= 10:
        break  # stop once 10 distinct titles have been printed


Most Similar Games by User Reviews:
The Binding of Isaac: Rebirth - Distance: 0.0
New World - Distance: 3470.0
DARK SOULS™ III - Distance: 4592.0
Subnautica - Distance: 5478.0
Monster Hunter: World - Distance: 7615.0
Hades - Distance: 11548.0
Geometry Dash - Distance: 13266.0
Bloons TD 6 - Distance: 14123.0
Raft - Distance: 14484.0
No Man's Sky - Distance: 15844.0
