In [3]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
merged_df = pd.read_csv("../datasets/merged_steam_games_.csv")

# Select from data
recommend_colums = ["app_id", "user_id", "is_recommended"]
recommend_df = merged_df[recommend_colums]
recommend_df.dropna(inplace=True)
print(recommend_df.head(10))

   app_id   user_id  is_recommended
0  552520   7606333               1
1  552520  11805207               1
2  552520  10684038               1
3  552520    751598               1
4  552520   5432359               1
5  552520  10910665               1
6  552520   6160210               1
7  552520   6235310               1
8  552520    335223               1
9  552520   4686481               0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommend_df.dropna(inplace=True)


In [5]:
# Columns representing content characteristics
content_columns = merged_df.columns[31:58]
content_df = merged_df[content_columns]
content_df.dropna(inplace=True)
print(content_df.head(10))

   Action  Utilities  Animation & Modeling  Photo Editing  Education  Sports  \
0       1          0                     0              0          0       0   
1       1          0                     0              0          0       0   
2       1          0                     0              0          0       0   
3       1          0                     0              0          0       0   
4       1          0                     0              0          0       0   
5       1          0                     0              0          0       0   
6       1          0                     0              0          0       0   
7       1          0                     0              0          0       0   
8       1          0                     0              0          0       0   
9       1          0                     0              0          0       0   

   Audio Production  Casual  Web Publishing  Accounting  ...  Tutorial  RPG  \
0                 0       0             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df.dropna(inplace=True)


In [6]:
scaler = StandardScaler()
content_df = pd.DataFrame(scaler.fit_transform(content_df), columns=content_columns)


In [22]:
print(content_df.head(10))

     Action  Utilities  Animation & Modeling  Photo Editing  Education  \
0  0.756176  -0.173836             -0.173836            0.0        0.0   
1  0.756176  -0.173836             -0.173836            0.0        0.0   
2  0.756176  -0.173836             -0.173836            0.0        0.0   
3  0.756176  -0.173836             -0.173836            0.0        0.0   
4  0.756176  -0.173836             -0.173836            0.0        0.0   
5  0.756176  -0.173836             -0.173836            0.0        0.0   
6  0.756176  -0.173836             -0.173836            0.0        0.0   
7  0.756176  -0.173836             -0.173836            0.0        0.0   
8  0.756176  -0.173836             -0.173836            0.0        0.0   
9  0.756176  -0.173836             -0.173836            0.0        0.0   

    Sports  Audio Production   Casual  Web Publishing  Accounting  ...  \
0 -0.13021               0.0 -0.26689             0.0         0.0  ...   
1 -0.13021               0.0 -0.26689

In [13]:
print(merged_df.iloc[:, 31:58].columns)

Index(['Action', 'Utilities', 'Animation & Modeling', 'Photo Editing',
       'Education', 'Sports', 'Audio Production', 'Casual', 'Web Publishing',
       'Accounting', 'Documentary', 'Sexual Content', 'Adventure',
       'Video Production', 'Design & Illustration', 'Racing', 'Gore',
       'Tutorial', 'RPG', 'Indie', 'Software Training', 'Simulation',
       'Game Development', 'Massively Multiplayer', 'Early Access', 'Nudity',
       'Strategy'],
      dtype='object')


In [19]:
def Recommend_Games(user_id):
    user_df = merged_df[recommend_df["user_id"] == user_id]

    if user_df.empty:
        return []

    # Vector representing the user's preference
    user_vector = user_df.iloc[:, 31:58].mean().values.reshape(1, -1)

    sim_scores = cosine_similarity(user_vector, content_df)

    rc_games = sim_scores.argsort()[0][::-1]
    unique_rc_games = set(merged_df["title"].iloc[rc_games])

    return list(unique_rc_games)[:10]

In [21]:
user_id = 7606333
recommendations = Recommend_Games(user_id)
print(f"10 Games Recommend to {user_id} : \n{recommendations[:5]}")
print(recommendations[5:])

10 Games Recommend to 7606333 : 
['Mount & Blade: Warband', 'Left 4 Dead 2', 'The Forest', 'Bloons TD 6', 'ARK: Survival Evolved']
['World of Tanks Blitz', 'Project Zomboid', 'War Thunder', 'Team Fortress 2', 'Raft']


In [9]:
user_game_count = pd.DataFrame(merged_df.groupby("user_id")["app_id"].count())
# print(user_game_count[user_game_count['user_id'] > 10].shape)

over_10_users = user_game_count[user_game_count["app_id"] > 10].index
under_10_users = user_game_count[user_game_count["app_id"] <= 10].index
print(over_10_users)

train_X = []
test_X = []

for user in merged_df["user_id"].unique():
    X = merged_df[merged_df["user_id"] == user]

    if len(X) <= 10:
        train_X.append(X)
        continue

    train, test = train_test_split(X, test_size=0.2)
    train_X.append(train)
    test_X.append(test)

train_X = pd.concat(train_X)
test_X = pd.concat(test_X)

print(train_X.shape)
print(test_X.shape)

Int64Index([    1183,     2765,     4446,    15089,    16448,    48843,
               51559,    73191,    73274,    83283,
            ...
            13703989, 13708103, 13714056, 13715092, 13718408, 13720291,
            13735884, 13737779, 13755432, 13781520],
           dtype='int64', name='user_id', length=1417)
(41908, 59)
(4457, 59)


In [10]:
print(train_X.shape)
print(test_X.shape)

all_precisions = []
all_recalls = []
hitCount = 0

for user in over_10_users:
    user_test_X = test_X[test_X["user_id"] == user]
    actual_Y = pd.DataFrame(user_test_X[user_test_X["is_recommended"] == 1])

    predict_Y = pd.DataFrame(Recommend_Games(user), columns=["title"])

    if not actual_Y.empty and not predict_Y.empty:
        hit = len(set(actual_Y["title"]) & set(predict_Y["title"]))
        if hit > 0:
            hitCount += 1
        precision = hit / len(predict_Y["title"])
        recall = hit / len(actual_Y["title"])
        all_precisions.append(precision)
        all_recalls.append(recall)


average_precision = sum(all_precisions) / len(all_precisions)
average_recall = sum(all_recalls) / len(all_recalls)
HitRate = hitCount / len(over_10_users)

print("average_precision:", average_precision)
print("average_recall:", average_recall)
print("HitRate:", HitRate)


(41908, 59)
(4457, 59)
average_precision: 0.04854712969525188
average_recall: 0.168040228139449
HitRate: 0.4050811573747354
