## Steam Game Review Analysis

In [1]:
from pyspark.sql import SparkSession, functions, types, Row
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Create the Spark session for this notebook (or reuse one that is already running)
# and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('Steam Games Analysis')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/30 22:46:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/30 22:46:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/11/30 22:46:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Steam Game Content-Based Recommender

In [3]:
# pip install parquet
import parquet

In [4]:
# pip install pyarrow

In [11]:
# Load the Parquet data stored at 'cleaned_steam_games' into a pandas DataFrame (pyarrow engine).
df_recommend = pd.read_parquet('cleaned_steam_games', engine='pyarrow')

In [14]:
# Keep only the columns the recommender needs, blank out missing values, then
# normalise the two multi-valued fields into plain space-separated text:
#   * developers   - each entry is iterated and its items joined with spaces
#                    (presumably a list/array of names per game; '' after fillna joins to '')
#   * popular_tags - commas replaced by spaces
df_rec = (
    df_recommend[['game_id', 'name', 'desc_snippet', 'popular_tags', 'developers', 'game_description']]
    .fillna('')
    .assign(
        developers=lambda frame: frame["developers"].apply(
            lambda devs: " ".join(str(dev) for dev in devs)
        ),
        popular_tags=lambda frame: frame['popular_tags'].str.replace(',', ' '),
    )
)

In [18]:
# Read the comma-separated values in game_ids.txt (no header row) and transpose
# the result, so that values which shared a line in the file end up stacked in
# column 0, one per row.
game_ids = pd.read_csv("game_ids.txt", sep=",", header=None).T

In [17]:
# Cast game_id to int and keep only the games whose id appears in game_ids[0].
# .assign() returns a fresh frame, so nothing is ever written into a slice of another
# DataFrame: re-running this cell on an already-filtered df_rec no longer triggers
# pandas' SettingWithCopyWarning.
df_rec = df_rec.assign(game_id=df_rec["game_id"].astype(int))
df_rec = (
    df_rec.loc[df_rec["game_id"].isin(game_ids[0])]
    # replace the index with the default 0..n-1 index
    .reset_index(drop=True)
)
df_rec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rec.game_id = df_rec.game_id.astype(int)


Unnamed: 0,game_id,name,desc_snippet,popular_tags,developers,game_description
0,218620,PAYDAY 2,"PAYDAY 2 is an action-packed, four-player co-o...",Co-op Action FPS Heist Online Co-Op Stealth Mu...,OVERKILL - a Starbreeze Studio.,""" About This Game PAYDAY 2: VR is now availab..."
1,613100,House Flipper,House Flipper is a unique chance to become a o...,Simulation Building Singleplayer Realistic Des...,Empyrean,""" About This Game House Flipper is a unique c..."
2,753650,Due Process,Due Process is a tactical FPS about planning a...,Action Tactical FPS Multiplayer Procedural Gen...,Giant Enemy Crab,""" About This Game Due Process is a tactical F..."
3,255710,Cities: Skylines,Cities: Skylines is a modern take on the class...,City Builder Simulation Building Management St...,Colossal Order Ltd.,About This Game Cities: Skylines is a modern...
4,606280,Darksiders III,Return to an apocalyptic Earth in Darksiders I...,Hack and Slash Action Female Protagonist Adven...,Gunfire Games,About This Game Return to an apocalyptic Ear...
...,...,...,...,...,...,...
276,250320,The Wolf Among Us,From the makers of the 2012 Game of the Year: ...,Adventure Detective Story Rich Episodic Point ...,Telltale Games,About This Game From the makers of the 2012 G...
277,239140,Dying Light,First-person action survival game set in a pos...,Zombies Survival Open World Parkour Co-op Mult...,Techland,About This Game From the creators of hit titl...
278,489830,The Elder Scrolls V: Skyrim Special Edition,Winner of more than 200 Game of the Year Award...,Open World RPG Adventure Singleplayer Fantasy ...,Bethesda Game Studios,About This Game Winner of more than 200 Game ...
279,292730,Call of Duty®: Infinite Warfare,Infinite Warfare delivers three unique game mo...,Action FPS Multiplayer Futuristic Zombies Spac...,Infinity Ward,About This Game Includes the Terminal Bonus ...


In [20]:
# combine features
def combined_features(row):
    """Merge a game's text fields into one space-separated string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing string values
            for 'desc_snippet', 'game_description', 'popular_tags' and
            'developers'.

    Returns:
        The four values joined by single spaces, in the order listed above.
    """
    text_columns = ('desc_snippet', 'game_description', 'popular_tags', 'developers')
    return " ".join(row[column] for column in text_columns)

# Apply combined_features to every row (axis=1) and store the result in a new
# 'combined_features' column of df_rec.
df_rec['combined_features'] = df_rec.apply(combined_features,axis = 1)

In [50]:
# Save df_rec to 'df_rec.parquet' (pyarrow engine)
df_rec.to_parquet('df_rec.parquet', engine='pyarrow')
# Read df_rec.parquet straight back, so the cells below work from the persisted copy
df_rec = pd.read_parquet('df_rec.parquet', engine='pyarrow')

23/12/01 05:06:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1063718 ms exceeds timeout 120000 ms
23/12/01 05:06:38 WARN SparkContext: Killing executors is not supported by current scheduler.
23/12/01 05:22:11 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:641)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1111)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:244)
	at s

In [21]:
# Transform the raw combined text of each game into a numerical representation
# of the importance of each word (TF-IDF).

# stop_words='english' removes common English stop words; fit_transform learns the
# vocabulary from df_rec['combined_features'] and returns the weighted term matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_rec['combined_features'])

In [22]:
# Cosine similarity matrix:
# pairwise similarity between the texts, computed from the rows of tfidf_matrix
# (called with a single argument, so each row is compared against every row).
cosine_sim = cosine_similarity(tfidf_matrix)

In [23]:
# Persist the similarity matrix to disk. The context manager guarantees the file
# handle is flushed and closed (the bare open(...) used before was never closed).
with open('cosine_sim.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)

In [24]:
# Reload the pickled similarity matrix from 'cosine_sim.pkl'.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open('cosine_sim.pkl', 'rb') as file:
    # Deserialize the file's contents into cosine_sim_loaded
    cosine_sim_loaded = pickle.load(file)

In [47]:
def steam_game_recommender(game_id, top_n=10):
    """Return the games most similar to ``game_id``, best match first.

    Similarities are read from the module-level ``cosine_sim_loaded`` matrix,
    whose row/column ``i`` is taken to correspond to the ``i``-th row of the
    module-level ``df_rec`` frame.

    Args:
        game_id: Value to look up in ``df_rec['game_id']``. If it occurs more
            than once, the first occurrence is used.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A list of dicts with keys ``'game_id'``, ``'rank'`` (1 = most similar),
        ``'score'`` and ``'game_name'``. ``score`` is ``(similarity + 1) / 2 * 100``,
        i.e. a similarity in [-1, 1] rescaled onto [0, 100].
        NOTE(review): TF-IDF cosine similarities are presumably never negative,
        so scores would fall in 50-100 - confirm this scaling is intended.

    Raises:
        IndexError: If ``game_id`` is not present in ``df_rec``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Locate the queried game by row *position*, because the similarity matrix is
    # indexed positionally regardless of what labels df_rec's index carries.
    positions = (df_rec["game_id"] == game_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"game_id {game_id!r} is not present in df_rec")
    game_index = positions[0]

    # Pull the lookup columns out once instead of masking the frame per result.
    all_ids = df_rec["game_id"].to_numpy()
    all_names = df_rec["name"].to_numpy()

    # Rank every *other* game by similarity, highest first. The queried game is
    # excluded explicitly by position rather than by assuming it sorts first,
    # which breaks when another game ties with it at the top. sorted() is
    # stable, so tied games keep their row order.
    ranked = sorted(
        (
            (position, similarity)
            for position, similarity in enumerate(cosine_sim_loaded[game_index])
            if position != game_index
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    return [
        {
            'game_id': all_ids[position],
            'rank': rank,
            'score': (similarity + 1) / 2 * 100,
            'game_name': all_names[position],
        }
        for rank, (position, similarity) in enumerate(ranked, start=1)
    ]

In [48]:
# Example call: recommendations for the game whose game_id is 218620
steam_game_recommender(218620)

[{'game_id': 250320,
  'rank': 1,
  'score': 55.09333365360052,
  'game_name': 'The Wolf Among Us'},
 {'game_id': 537800,
  'rank': 2,
  'score': 54.707989223976725,
  'game_name': 'Bomber Crew'},
 {'game_id': 760060,
  'rank': 3,
  'score': 53.97490081191023,
  'game_name': 'Mutant Year Zero: Road to Eden'},
 {'game_id': 834910,
  'rank': 4,
  'score': 53.9724464926415,
  'game_name': 'ATLAS'},
 {'game_id': 626690,
  'rank': 5,
  'score': 53.68594290749332,
  'game_name': 'Sword Art Online: Fatal Bullet'},
 {'game_id': 206190,
  'rank': 6,
  'score': 53.48707919159003,
  'game_name': 'Gunpoint'},
 {'game_id': 312530,
  'rank': 7,
  'score': 53.475029795722875,
  'game_name': 'Duck Game'},
 {'game_id': 312660,
  'rank': 8,
  'score': 53.32378194220871,
  'game_name': 'Sniper Elite 4'},
 {'game_id': 620,
  'rank': 9,
  'score': 53.19991726342225,
  'game_name': 'Portal 2'},
 {'game_id': 646910,
  'rank': 10,
  'score': 53.13581075483956,
  'game_name': 'The Crew™ 2'}]