# Carga de datos
Cargo los datos de las reviews, la metadata de todos los productos y los ratings de los usuarios.


In [219]:
import pandas as pd

meta = pd.read_csv('./Data/steam.csv')


ratings = pd.read_csv('./Data/steam-200k.csv', header=None, names=['user_id', 'name', 'purchase', 'hours','0'])

ratings.head(10)

Unnamed: 0,user_id,name,purchase,hours,0
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0
5,151603712,Spore,play,14.9,0
6,151603712,Fallout New Vegas,purchase,1.0,0
7,151603712,Fallout New Vegas,play,12.1,0
8,151603712,Left 4 Dead 2,purchase,1.0,0
9,151603712,Left 4 Dead 2,play,8.9,0


In [41]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   200000 non-null  int64  
 1   name      200000 non-null  object 
 2   purchase  200000 non-null  object 
 3   hours     200000 non-null  float64
 4   0         200000 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 7.6+ MB


In [42]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   appid             27075 non-null  int64  
 1   name              27075 non-null  object 
 2   release_date      27075 non-null  object 
 3   english           27075 non-null  int64  
 4   developer         27074 non-null  object 
 5   publisher         27061 non-null  object 
 6   platforms         27075 non-null  object 
 7   required_age      27075 non-null  int64  
 8   categories        27075 non-null  object 
 9   genres            27075 non-null  object 
 10  steamspy_tags     27075 non-null  object 
 11  achievements      27075 non-null  int64  
 12  positive_ratings  27075 non-null  int64  
 13  negative_ratings  27075 non-null  int64  
 14  average_playtime  27075 non-null  int64  
 15  median_playtime   27075 non-null  int64  
 16  owners            27075 non-null  object

In [220]:
# Divide las categorías en la columna 'genres'
meta['genres'] = meta['genres'].str.split(';')

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

genres_expanded = pd.DataFrame(mlb.fit_transform(meta['genres']),columns=mlb.classes_, index=meta.index)

# Concatena las nuevas columnas binarias al DataFrame original
meta = pd.concat([meta, genres_expanded], axis=1)



In [75]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   appid                  27075 non-null  int64  
 1   name                   27075 non-null  object 
 2   release_date           27075 non-null  object 
 3   english                27075 non-null  int64  
 4   developer              27074 non-null  object 
 5   publisher              27061 non-null  object 
 6   platforms              27075 non-null  object 
 7   required_age           27075 non-null  int64  
 8   categories             27075 non-null  object 
 9   genres                 27075 non-null  object 
 10  steamspy_tags          27075 non-null  object 
 11  achievements           27075 non-null  int64  
 12  positive_ratings       27075 non-null  int64  
 13  negative_ratings       27075 non-null  int64  
 14  average_playtime       27075 non-null  int64  
 15  me

# Filtro basado en memoria

Usando una matriz de similitud podemos saber la similitud de un titulo con otro y los más similares basados en una serie de características. El mayor problema es la memoria y la escalabilidad. 

In [8]:
%pip install gower

Note: you may need to restart the kernel to use updated packages.


In [221]:
from sklearn.preprocessing import MinMaxScaler

# Variables numéricas para escalar
numeric_cols_to_scale = ['positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime']

# Aplicar Min-Max scaling a las variables numéricas seleccionadas
scaler = MinMaxScaler()
meta[numeric_cols_to_scale] = scaler.fit_transform(meta[numeric_cols_to_scale])


In [301]:
from gower import gower_matrix

# Lista de columnas categóricas
categorical_columns = ['name','Action', 'Adventure','Casual','Early Access',
                       'Free to Play', 'Gore', 'Indie','Massively Multiplayer', 'RPG', 'Racing',
                       'Simulation', 'Sports','Strategy', 'Violent']

# Lista de columnas numéricas
numeric_columns = [ 'positive_ratings', 'negative_ratings','price', 'owners']

# Seleccionar solo las columnas que vas a utilizar
metadata_selected = meta[categorical_columns + numeric_columns]

# Ordenar el DataFrame por la columna 'owners' en orden descendente
metadata_selected = metadata_selected.sort_values(by='positive_ratings', ascending=False)

# Tomar los primeros n valores (por ejemplo, los 10 juegos más populares)
n = 5000
metadata_selected = metadata_selected.head(n)

# Calcular la matriz de distancias de Gower
gower_dist_matrix = gower_matrix(metadata_selected.drop('name',axis=1))

# Convertir la matriz de distancias a un DataFrame
gower_dist_df = pd.DataFrame(gower_dist_matrix, columns=metadata_selected.index, index=metadata_selected.index)

# Mostrar el DataFrame de distancias
print(gower_dist_df)


          25        22        19        12836     121       2478      1467   \
25     0.000000  0.178211  0.142283  0.241202  0.374820  0.263582  0.260659   
22     0.178211  0.000000  0.130738  0.344578  0.363276  0.252037  0.249114   
19     0.142283  0.130738  0.000000  0.294042  0.288093  0.200869  0.179061   
12836  0.241202  0.344578  0.294042  0.000000  0.404420  0.155751  0.289527   
121    0.374820  0.363276  0.288093  0.404420  0.000000  0.250125  0.228317   
...         ...       ...       ...       ...       ...       ...       ...   
18923  0.379219  0.367675  0.292492  0.419039  0.292397  0.318844  0.296179   
7056   0.445729  0.323074  0.359002  0.463632  0.348687  0.363436  0.240626   
3676   0.386732  0.375188  0.300005  0.411507  0.178579  0.311311  0.181629   
12641  0.268109  0.256564  0.181382  0.307929  0.403509  0.207734  0.185068   
15514  0.498363  0.486818  0.411636  0.411003  0.401321  0.310807  0.404371   

          3362      1120      21     ...     18560 

In [None]:
%pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [126]:
def recommend_games(game_input, n=5):

    #Le paso el nombre del juego que tiene que si o si estar en meta (mi universo de juegos) y el n de recomendaciones. Default 4 

    try:
        # Buscar juegos cuyos nombres contengan la cadena de búsqueda en metadata_selected
        matching_games = metadata_selected[metadata_selected['name'].str.contains(game_input, case=False)].copy()

        if not matching_games.empty:
            # Si el juego está en metadata_selected, obtener su índice
            game_index = matching_games.index[0]
            #print(game_index, metadata_selected[metadata_selected['name']==game_input],matching_games)

            # Obtener los juegos más similares
            similarity_score = list(enumerate(gower_dist_matrix[game_index]))
            similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=False)
            similarity_score = similarity_score[1:n]

            # Retornar los nombres de los juegos utilizando el DataFrame meta
            game_indices = [i[0] for i in similarity_score]
            return meta[['name', 'price']].iloc[game_indices]

        else:
            print("Realizando una busqueda más amplia. Espere por favor.")
            # Si el juego no está en metadata_selected, agregarlo a metadata_selected_2
            metadata_selected_2 = metadata_selected.copy()
            metadata_selected_2 = metadata_selected_2.append(meta[meta['name'].str.contains(game_input, case=False)])
            metadata_selected_2.reset_index(drop=True, inplace=True)

            # Recalcular la matriz de distancias de Gower
            gower_dist_matrix_2 = gower_matrix(metadata_selected_2.drop('name',axis=1).head(1000))

            # Obtener el índice del juego buscado en metadata_selected_2
            game_index_2 = metadata_selected_2[metadata_selected_2['name'].str.contains(game_input, case=False)].index[0]

            # Obtener los juegos más similares en metadata_selected_2
            similarity_score_2 = list(enumerate(gower_dist_matrix_2[game_index_2]))
            similarity_score_2 = sorted(similarity_score_2, key=lambda x: x[1], reverse=False)
            similarity_score_2 = similarity_score_2[1:n]

            # Retornar los nombres de los juegos utilizando el DataFrame meta
            game_indices_2 = [i[0] for i in similarity_score_2]
            return meta[['name', 'price']].iloc[game_indices_2]

    except IndexError:
        # Si ocurre un IndexError, imprime un mensaje y devuelve None o un DataFrame vacío
        print(f"No se encontraron juegos que contengan '{game_input}'.")
        return None


In [134]:
recommend_games('Counter-Strike',25)

Unnamed: 0,name,price
219,BioShock Infinite,19.99
68,Call of Duty: United Offensive,14.99
107,Sniper Elite,5.59
120,Civilization IV®: Warlords,2.99
75,X: Beyond the Frontier,3.49
189,X-Blades,6.99
103,Plants vs. Zombies GOTY Edition,4.25
2317,Even the Ocean,10.99
153,Thrillville®: Off the Rails™,7.19
98,Amazing Adventures Around the World,4.25


In [128]:
recommend_games("Spore", 10)

Unnamed: 0,name,price
526,Eschalon: Book II,4.79
518,Medal of Honor: Airborne,9.99
653,Chronicles of Mystery: The Scorpio Ritual,3.99
1052,Hydrophobia: Prophecy,3.99
899,Hotel Dash™ Suite Success™,4.99
700,Fitness Dash™,4.99
2175,Valiant Hearts: The Great War™ / Soldats Incon...,12.49
1376,Starbound,11.99
7081,Tank: M1A1 Abrams Battle Simulation,4.99


# Sistema de recomendacion filtros colaborativos

Hay que analizar los df y crear mi matriz de item-usuario para poder crear el algoritmo de filtro colaborativo.

In [222]:
ratings = ratings[ratings['purchase']!= 'purchase']

Como no tengo metadata de todos los juegos en _ratings_, uno con _meta_ por nombre (estan bien escritos) y asi puedo saber genero y mas cosas y matengo los id de juego que cree antes.

In [223]:
# Puedes crear un diccionario que asigne un ID único a cada nombre único
name_id_dict = {name: idx + 1 for idx, name in enumerate(ratings['name'].unique())}

# Agrega una nueva columna "game_id" al DataFrame usando el diccionario
ratings['game_id'] = ratings['name'].map(name_id_dict)

ratings_meta = pd.merge(ratings,meta, on='name')

#metadata = ratings_meta['game_id', 'name', 'release_date  ', 'developer  ', 'price', 'positive_ratings']#.groupby('game_id')

In [228]:
ratings_meta.columns

Index(['user_id', 'name', 'purchase', 'hours', '0', 'game_id', 'appid',
       'release_date', 'english', 'developer', 'publisher', 'platforms',
       'required_age', 'categories', 'genres', 'steamspy_tags', 'achievements',
       'positive_ratings', 'negative_ratings', 'average_playtime',
       'median_playtime', 'owners', 'price', 'Accounting', 'Action',
       'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual',
       'Design & Illustration', 'Documentary', 'Early Access', 'Education',
       'Free to Play', 'Game Development', 'Gore', 'Indie',
       'Massively Multiplayer', 'Nudity', 'Photo Editing', 'RPG', 'Racing',
       'Sexual Content', 'Simulation', 'Software Training', 'Sports',
       'Strategy', 'Tutorial', 'Utilities', 'Video Production', 'Violent',
       'Web Publishing'],
      dtype='object')

In [241]:
metadata=ratings_meta.loc[:,['game_id', 'name','release_date', 'developer', 'price', 'positive_ratings']]

In [245]:


# Identificar juegos que aparecen exactamente una vez por game_id
metadata = metadata[~metadata.duplicated(subset=['game_id'], keep=False)].copy()



In [169]:
from surprise  import Dataset
from surprise  import Reader
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Ajusta el escalador a los datos y transforma la columna 'hours'
ratings['hours'] = scaler.fit_transform(ratings[['hours']])

reader = Reader(rating_scale=(min_value, max_value))
data = Dataset.load_from_df(ratings[['user_id', 'game_id', 'hours']], reader)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['hours'] = scaler.fit_transform(ratings[['hours']])


In [108]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [170]:
from surprise import SVD
from surprise.model_selection import cross_validate

svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0243  0.9202  1.0051  0.9832  0.0452  
MAE (testset)     0.4196  0.4175  0.4085  0.4152  0.0048  
Fit time          0.41    0.38    0.41    0.40    0.01    
Test time         0.22    0.18    0.18    0.19    0.02    


{'test_rmse': array([1.02425505, 0.9201959 , 1.00509591]),
 'test_mae': array([0.41957354, 0.41751739, 0.40845188]),
 'fit_time': (0.41190242767333984, 0.38234758377075195, 0.4118995666503906),
 'test_time': (0.21741437911987305, 0.17557287216186523, 0.18453192710876465)}

In [171]:
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20fdb5445b0>

In [172]:
trainset.n_users

11350

In [160]:
print(name_id_dict.get("The Elder Scrolls V Skyrim"))

1


In [175]:

pred = svd.predict(uid=128470551, iid=name_id_dict.get("The Elder Scrolls V Skyrim"))

# Utiliza inverse_transform para obtener las horas jugadas en la escala original
horas_jugadas_scaled = pred.est
horas_jugadas_original_scale = scaler.inverse_transform([[horas_jugadas_scaled]])

print(horas_jugadas_original_scale)


[[1.04425347]]


In [176]:
ratings[ratings['user_id']==128470551]

Unnamed: 0,user_id,name,purchase,hours,0,game_id
199969,128470551,The Binding of Isaac Rebirth,play,1.055763,0,501
199971,128470551,Path of Exile,play,-0.029992,0,7
199973,128470551,Arma 2 DayZ Mod,play,-0.117201,0,280
199975,128470551,Antichamber,play,-0.139875,0,293
199977,128470551,Risk of Rain,play,-0.14598,0,247
199979,128470551,OlliOlli,play,-0.166038,0,2133
199981,128470551,Hammerwatch,play,-0.173451,0,393
199983,128470551,Torchlight II,play,-0.200485,0,657
199985,128470551,Nether,play,-0.200922,0,1994
199987,128470551,Rogue Legacy,play,-0.201794,0,129


In [287]:
import difflib
import random

def get_game_id(game_title, name_id_dict):
    closest_title = difflib.get_close_matches(game_title, name_id_dict.keys(), n=1)
    if closest_title:
        return name_id_dict[closest_title[0]], closest_title[0]
    else:
        print(f"El juego {game_title} no fue reconocido. Perdón.")
        return None, None

def predict_review(user_id, game_title, model, name_id_dict):
    game_id, _ = get_game_id(game_title, name_id_dict)
    if game_id is not None:
        review_prediction = model.predict(uid=user_id, iid=game_id)
        return review_prediction.est
    else:
        return None

def generate_recommendation(user_id, model, name_id_dict, thresh=1):
    game_titles = list(name_id_dict.keys())
    random.shuffle(game_titles)

    recomenation = []

    for game_title in game_titles:
        rating = predict_review(user_id, game_title, model, name_id_dict)
        if rating is not None and rating >= thresh:
            recomenation.append(game_title)
    print(recomenation)

In [288]:
rec = generate_recommendation(128470551, svd,name_id_dict)

['Rescue Team 5', 'Space Empires V', 'Mount & Blade', 'The Treasures of Montezuma 4']


# Deploy

Haremos el deploy con Streamlit

In [None]:
#%pip install pipreqs

In [296]:
#!pipreqs .

Traceback (most recent call last):
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\lib\site-packages\requests\compat.py", line 11, in <module>
    import chardet
ModuleNotFoundError: No module named 'chardet'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Usuario\miniconda3\envs\itba_apa_env\Scripts\pipreqs.exe\__main__.py", line 4, in <module>
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\lib\site-packages\pipreqs\pipreqs.py", line 47, in <module>
    import requests
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\lib\site-packages\requests\__init__.py", line 45, in <module>
  File "c:\Users\Usuario\miniconda3\envs\itba_apa_env\

In [None]:
#%pip install streamlit

In [None]:
#%pip install flask


In [297]:
#%pip install pickle

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


In [299]:
import pickle
with open('./Data/gower.pkl', 'wb') as f_gower:
    pickle.dump(gower_dist_df,f_gower)
with open('./Data/metadata.pkl', 'wb') as f_metadata:
    pickle.dump(metadata_selected,f_metadata)

In [None]:
from flask import Flask

app = Flask('recsys_gower')

