In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Get data
def get_steam_data(file_path:str) -> pd.DataFrame:
  """Read the headerless Steam CSV at ``file_path`` into a DataFrame.

  Only the first four columns of the file are loaded; they are labelled
  ``user_id``, ``item_id``, ``behaviour`` and ``hours``, in that order.
  """
  steam_columns = ('user_id', 'item_id', 'behaviour', 'hours')
  return pd.read_csv(
      file_path,
      header=None,
      usecols=range(len(steam_columns)),
      names=list(steam_columns),
  )

# Função para capturar avaliações implícitas
def get_ratings(df: pd.DataFrame) -> pd.DataFrame:
  """Derive an implicit rating for every (user, game) pair from play time.

  Only rows whose ``behaviour`` equals ``"play"`` are used. The rating of a
  pair is the hours that user played that game divided by the total hours the
  same user played across all games.

  Returns a new DataFrame with the columns ``user_id``, ``item_id`` and
  ``rating``; the input frame is not modified.
  """
  play_rows = df.query('behaviour == "play"')[['user_id', 'item_id', 'hours']]

  # Denominator: total hours played by each user, summed over the raw play rows.
  hours_per_user = (
      play_rows
      .groupby(['user_id'])['hours']
      .sum()
      .reset_index()
      .rename({'hours': 'total_user_hours'}, axis=1)
  )

  # Numerator: hours played per (user, game) pair.
  hours_per_pair = (
      play_rows
      .groupby(['user_id', 'item_id'])['hours']
      .sum()
      .reset_index()
  )

  shares = hours_per_pair.merge(hours_per_user, on='user_id')
  shares['rating'] = shares['hours'] / shares['total_user_hours']

  return shares.drop(columns=['hours', 'total_user_hours'])


# Classe genérica para recomendação
class ItemBasedRecommender:
  """Item-to-item recommender built on cosine similarity between items.

  Each item is represented by the vector of scores it received from every
  user; two items are similar when those vectors point in the same direction.

  Parameters
  ----------
  data : pd.DataFrame
      Long-format table with one row per (user, item) score. It is copied, so
      the caller's frame is never modified.
  item_col, user_col, score_col : str
      Names of the columns in ``data`` holding the item identifier, the user
      identifier and the score.
  aggfunc : callable or str, default ``np.mean``
      Aggregation applied to the scores of each item to build ``scores_``.

  Attributes set by ``fit``
  -------------------------
  item_sample_ : list
      Items kept when ``sample_size`` was given (only set in that case).
  scores_ : pd.DataFrame
      Per-item aggregated score and score count, sorted by count (descending).
  n_most_popular_ : pd.Index
      Items with the most rows in the fitted data; used as fallback.
  data_pivot_ : pd.DataFrame
      Item x user score matrix with missing entries filled with 0.
  sim_matrix_ : pd.DataFrame
      Item x item cosine similarity matrix.
  """

  def __init__(self, data, item_col, user_col, score_col, aggfunc=np.mean):
    self.data = data.copy()
    self.item_col = item_col
    self.user_col = user_col
    self.score_col = score_col
    self.aggfunc = aggfunc

  def fit(self, sample_size=None, normalize=False, n_most_popular=10):
    """Compute the score table, popularity fallback and similarity matrix.

    Parameters
    ----------
    sample_size : int, optional
        When given, only the ``sample_size`` items with the most distinct
        users are used for this fit.
    normalize : bool, default False
        When True, subtract each user's mean score from that user's scores
        before filling missing entries with 0.
    n_most_popular : int, default 10
        Number of most frequent items stored in ``n_most_popular_``.

    Returns
    -------
    ItemBasedRecommender
        ``self``, to allow chaining.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when the data holds more than one row
        for the same (item, user) pair.
    """
    # Work on a local view: ``self.data`` must stay intact so that fitting
    # again with a different ``sample_size`` starts from the full data set.
    data = self.data

    if sample_size is not None:
      self.item_sample_ = (
          data.groupby(self.item_col)[self.user_col]
          .nunique()
          .sort_values(ascending=False)
          .head(sample_size)
          .index.tolist()
      )
      data = data[data[self.item_col].isin(self.item_sample_)]

    # String aggregations such as 'sum' have no ``__name__``; fall back to the
    # string itself when building the column label.
    agg_label = getattr(self.aggfunc, '__name__', str(self.aggfunc))
    count_label = f'{self.score_col}_count'
    self.scores_ = data.groupby(self.item_col).agg(**{
        f'{self.score_col}_{agg_label}': (self.score_col, self.aggfunc),
        # Count the configured score column (not a hard-coded 'rating').
        count_label: (self.score_col, 'count'),
        }).sort_values(count_label, ascending=False)

    self.n_most_popular_ = data[self.item_col].value_counts().nlargest(n_most_popular).index

    self.data_pivot_ = data.pivot(index=self.item_col, columns=self.user_col, values=self.score_col)
    if normalize:
      # Columns are users, so the mean over axis=0 is each user's average score.
      avg_ratings = self.data_pivot_.mean(axis=0)
      self.data_pivot_ = self.data_pivot_.sub(avg_ratings, axis=1).fillna(0)
    else:
      self.data_pivot_ = self.data_pivot_.fillna(0)

    similarities = cosine_similarity(self.data_pivot_)
    self.sim_matrix_ = pd.DataFrame(
        similarities,
        index=self.data_pivot_.index,
        columns=self.data_pivot_.index,
    )
    return self

  def recommend(self, target_item, max_recommendations=None):
    """Return the items most similar to ``target_item``.

    Parameters
    ----------
    target_item : hashable
        Label of the item to find neighbours for.
    max_recommendations : int, optional
        Maximum number of items returned; ``None`` returns every other item.

    Returns
    -------
    pd.Series or pd.Index
        Similarity scores indexed by item, sorted descending, with
        ``target_item`` itself removed. If ``target_item`` is not in the
        similarity matrix a notice is printed and ``n_most_popular_`` (a
        ``pd.Index``) is returned instead.
    """
    try:
      similarities = self.sim_matrix_.loc[target_item]
    except KeyError:
      print(f'\033[1m{target_item}\033[0;0m is not included in the recommendation matrix. '
            f'Returning top {len(self.n_most_popular_)} items:\n')
      return self.n_most_popular_

    ranked = similarities.drop(target_item).sort_values(ascending=False)
    if max_recommendations is None:
      return ranked
    return ranked.head(max_recommendations)

  def fit_recommend(self, target_item, max_recommendations=None, **fit_params):
    """Fit the model and immediately recommend for ``target_item``.

    ``max_recommendations`` is forwarded to ``recommend`` and any additional
    keyword arguments are forwarded to ``fit``.
    """
    return self.fit(**fit_params).recommend(target_item, max_recommendations)


In [5]:
# Load the raw Steam interaction log (user_id, item_id, behaviour, hours) and preview it.
df = get_steam_data('../data/steam-200k.csv')
df

Unnamed: 0,user_id,item_id,behaviour,hours
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0
...,...,...,...,...
199995,128470551,Titan Souls,play,1.5
199996,128470551,Grand Theft Auto Vice City,purchase,1.0
199997,128470551,Grand Theft Auto Vice City,play,1.5
199998,128470551,RUSH,purchase,1.0


In [6]:
# Convert play time into implicit ratings: each game's share of the user's total played hours.
df_ratings = get_ratings(df)
df_ratings

Unnamed: 0,user_id,item_id,rating
0,5250,Alien Swarm,0.021729
1,5250,Cities Skylines,0.638581
2,5250,Deus Ex Human Revolution,0.274945
3,5250,Dota 2,0.000887
4,5250,Portal 2,0.060310
...,...,...,...
70472,309434439,Dota 2,1.000000
70473,309554670,Mitos.is The Game,1.000000
70474,309626088,Age of Empires II HD Edition,1.000000
70475,309824202,Dota 2,1.000000


In [8]:
# Configure the recommender on the implicit ratings; aggfunc=np.sum makes
# scores_ report the summed rating per item.
recommender = ItemBasedRecommender(
    data=df_ratings,
    item_col='item_id',
    user_col='user_id',
    score_col='rating',
    aggfunc=np.sum
)

In [9]:
# Show which columns the recommender was configured with (items, users, scores).
print('Coluna que identifica os itens:', recommender.item_col)
print('Coluna que identifica os usuários:', recommender.user_col)
print('Coluna que identifica as avaliações:', recommender.score_col)

Coluna que identifica os itens: item_id
Coluna que identifica os usuários: user_id
Coluna que identifica as avaliações: rating


In [10]:
# Fit with the defaults (no item sampling, no normalization); fit() returns the recommender itself.
recommender.fit()

<__main__.ItemBasedRecommender at 0x7fafb60b73d0>

In [11]:
# Items with the most rating rows; recommend() falls back to these for unknown items.
recommender.n_most_popular_

Index(['Dota 2', 'Team Fortress 2', 'Counter-Strike Global Offensive',
       'Unturned', 'Left 4 Dead 2', 'Counter-Strike Source',
       'The Elder Scrolls V Skyrim', 'Garry's Mod', 'Counter-Strike',
       'Sid Meier's Civilization V'],
      dtype='object')

In [12]:
# Per-item aggregated rating and number of ratings, sorted by number of ratings (descending).
recommender.scores_

Unnamed: 0_level_0,rating_sum,rating_count
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
Dota 2,3904.304091,4841
Team Fortress 2,990.560459,2323
Counter-Strike Global Offensive,541.811807,1377
Unturned,321.379284,1069
Left 4 Dead 2,103.398419,801
...,...,...
Starion Tactics,0.000786,1
Gateways,0.000081,1
Community College Hero Trial by Fire,0.005879,1
Starscape,0.001837,1


In [13]:
# Item x user rating matrix; (item, user) pairs without a rating are filled with 0.
recommender.data_pivot_

user_id,5250,76767,86540,144736,181212,229911,298950,381543,547685,554278,...,309228590,309255941,309262440,309265377,309404240,309434439,309554670,309626088,309824202,309903146
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Second Ninja,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rymdkapsel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00039,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Item x item cosine similarity computed from the rows of data_pivot_.
recommender.sim_matrix_

item_id,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007 Legends,1.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
0RBITALIS,0.0,1.000000,0.000000e+00,0.077751,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,2.367356e-05,0.000000
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),0.0,0.000000,1.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,2.948747e-07,0.000000e+00,0.000000e+00,0.000000
10 Second Ninja,0.0,0.077751,0.000000e+00,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
10000000,0.0,0.000000,0.000000e+00,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rymdkapsel,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,1.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000
sZone-Online,0.0,0.000000,2.948747e-07,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000e+00,9.935406e-06,4.747041e-05,0.000000
the static speaks my name,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,9.935406e-06,1.000000e+00,6.734307e-07,0.000000
theHunter,0.0,0.000024,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000005,0.0,4.747041e-05,6.734307e-07,1.000000e+00,0.000138


Recomendação baseada na seguinte ideia: pessoas que jogaram muitas horas do jogo escolhido também jogaram muitas horas dos jogos listados abaixo.

In [15]:
# Up to ten items most similar to 'Batman Arkham City', most similar first.
recommender.recommend('Batman Arkham City', 10)

item_id
Star Wars The Clone Wars Republic Heroes    0.708706
Chessmaster                                 0.708706
EDGE                                        0.701947
Doctor Who The Eternity Clock               0.373133
The Sims(TM) Medieval                       0.265893
Magic The Gathering  Tactics                0.261532
Crazy Taxi                                  0.239978
Dragon The Game                             0.200712
Buccaneer The Pursuit of Infamy             0.190686
Blood Bowl Dark Elves Edition               0.190686
Name: Batman Arkham City, dtype: float64