In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [2]:
# Cargamos el dataframe
df_model = pd.read_parquet('https://github.com/xaviac/storage__PI_MLOp/raw/main/data/clean/sg_explode.parquet.gz')

In [3]:
df_model

Unnamed: 0,genres,title,price,id,developer,release_year
0,Action,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
1,Casual,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
2,Indie,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
3,Simulation,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
4,Strategy,Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
...,...,...,...,...,...,...
85899,Adventure,Maze Run VR,4.99,681550,Unknown,2015
85900,Indie,Maze Run VR,4.99,681550,Unknown,2015
85901,Action,Maze Run VR,4.99,681550,Unknown,2015
85902,Simulation,Maze Run VR,4.99,681550,Unknown,2015


In [4]:
# Seleccionamos las columnas que nos interesan
df_model = df_model[['id', 'title', 'genres', 'developer']]

In [5]:
# Combinamos caracteristicas en una nueva columna llamada features
df_model['features'] = df_model['genres'] + ' ' + df_model['developer']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['features'] = df_model['genres'] + ' ' + df_model['developer']


In [6]:
df_model

Unnamed: 0,id,title,genres,developer,features
0,761140,Lost Summoner Kitty,Action,Kotoshiro,Action Kotoshiro
1,761140,Lost Summoner Kitty,Casual,Kotoshiro,Casual Kotoshiro
2,761140,Lost Summoner Kitty,Indie,Kotoshiro,Indie Kotoshiro
3,761140,Lost Summoner Kitty,Simulation,Kotoshiro,Simulation Kotoshiro
4,761140,Lost Summoner Kitty,Strategy,Kotoshiro,Strategy Kotoshiro
...,...,...,...,...,...
85899,681550,Maze Run VR,Adventure,Unknown,Adventure Unknown
85900,681550,Maze Run VR,Indie,Unknown,Indie Unknown
85901,681550,Maze Run VR,Action,Unknown,Action Unknown
85902,681550,Maze Run VR,Simulation,Unknown,Simulation Unknown


In [7]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85904 entries, 0 to 85903
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         85904 non-null  int32 
 1   title      85904 non-null  object
 2   genres     85904 non-null  object
 3   developer  85904 non-null  object
 4   features   85904 non-null  object
dtypes: int32(1), object(4)
memory usage: 2.9+ MB


In [8]:
# Agrupamos por 'id'
df_model = df_model.groupby('id').first().reset_index()

In [9]:
df_model

Unnamed: 0,id,title,genres,developer,features
0,10,Counter-Strike,Action,Valve,Action Valve
1,20,Team Fortress Classic,Action,Valve,Action Valve
2,30,Day of Defeat,Action,Valve,Action Valve
3,40,Deathmatch Classic,Action,Valve,Action Valve
4,50,Half-Life: Opposing Force,Action,Gearbox Software,Action Gearbox Software
...,...,...,...,...,...
32127,2028055,Tom Clancy's Ghost Recon Future Soldier - Seas...,Action,"Ubisoft Paris,Red Storm Entertainment","Action Ubisoft Paris,Red Storm Entertainment"
32128,2028056,Worms Revolution Season Pass,Strategy,Team17 Digital Ltd.,Strategy Team17 Digital Ltd.
32129,2028062,Call of Duty®: Black Ops II Season Pass,Action,Treyarch,Action Treyarch
32130,2028103,Assassin’s Creed® III Season Pass,Action,Ubisoft Montreal,Action Ubisoft Montreal


In [10]:
# Guardamos el dataframe en un fichero csv
df_model.to_csv('../data/model/df_model.csv.gz', index=False, compression='gzip', encoding='utf-8')

# Guardamos el dataframe en un fichero parquet
df_model.to_parquet('../data/model/df_model.parquet.gz', index=False, compression='gzip')

In [11]:
# Nos quedamos con una muestra del 50% de los datos para entrenar el modelo
df_model_fit = df_model.sample(frac=0.5, random_state=42)

In [12]:
df_model_fit

Unnamed: 0,id,title,genres,developer,features
12704,398170,Evochron Legacy,Indie,StarWraith 3D Games LLC,Indie StarWraith 3D Games LLC
29815,711140,Forsaken Generation,RPG,MSRX22,RPG MSRX22
6531,301920,War of the Human Tanks - ALTeR,Adventure,Yakiniku Banzai,Adventure Yakiniku Banzai
25248,616350,Guns of Icarus Alliance Costume Pack,Action,Muse Games,Action Muse Games
20614,534600,Steam Dev Days,Game Development,Unknown,Game Development Unknown
...,...,...,...,...,...
9869,356560,East Tower - Kurenai,Adventure,roseVeRte,Adventure roseVeRte
27052,653360,Schoolgirl Strikers,Anime,Unknown,Anime Unknown
3685,238280,Legend of Dungeon,Rogue-like,Unknown,Rogue-like Unknown
8359,334740,Warhammer Quest - Deluxe Pack items,Strategy,"Twistplay,Rodeo Games","Strategy Twistplay,Rodeo Games"


In [13]:
df_model_fit.reset_index(drop=True, inplace=True)

In [14]:
df_model_fit['title'].value_counts()

title
Puzzle Blocks                                                              2
Rumpus                                                                     2
Parasite                                                                   2
Dark Matter                                                                2
Borealis                                                                   2
                                                                          ..
Rocksmith® 2014 – Shinedown - “Enemies”                                    1
NOBUNAGA'S AMBITION: Tenshouki WPK HD Version - GAMECITYオンラインユーザー登録シリアル    1
Cube Racer                                                                 1
THE AWKWARD STEVE DUOLOGY                                                  1
Fantasy Grounds - Heroic Characters 15 (Token Pack)                        1
Name: count, Length: 16059, dtype: int64

In [16]:
# Guardamos el dataframe en un fichero parquet
df_model_fit.to_parquet('../data/model/df_model_fit.parquet.gz', compression='gzip')

# Guardamos el dataframe en un fichero csv
df_model_fit.to_csv('../data/model/df_model_fit.csv.gz', compression='gzip', index=False, encoding='utf-8')

In [17]:
# Entrenamos el modelo
CV = CountVectorizer()
count_matrix = CV.fit_transform(df_model_fit['features'])


In [18]:
 
cosine_similarity = cosine_similarity(count_matrix)

In [19]:
# Guardamos el modelo en un fichero pkl sin comprimir
joblib.dump(cosine_similarity, '../model/cosine_similarity.pkl', compress=9)

['../model/cosine_similarity.pkl']

In [20]:
#Cargar el modelo entrenado desde el archivo pickle
with open('../model/cosine_similarity.pkl', 'rb') as file:
    modelo = joblib.load(file)

df_model_fit = pd.read_parquet('https://github.com/xaviac/storage__PI_MLOp/raw/main/data/model/df_model_fit.parquet.gz')

In [21]:
def recomendacion_juego(item_id: int):
    
    if item_id not in df_model_fit['id'].tolist():
       return {'Respuesta':'No se encontraron resultados para el item_id: {}'.format(item_id)}

    def get_recommendations(idx, cosine_sim=modelo):
       idx = df_model_fit[df_model_fit['id'] == item_id].index[0]
       sim_scores = list(enumerate(cosine_sim[idx]))
       sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
       sim_scores = sim_scores[1:6]
       game_indices = [i[0] for i in sim_scores]
       return df_model_fit['title'].iloc[game_indices].tolist()

    #Obtener el índice del item_id

    recommendations = get_recommendations(item_id)
    return {"Recomendaciones": recommendations}

In [22]:
recomendacion_juego(617470)

{'Recomendaciones': ['Fantasy Grounds - Lost Library of Thoth (5E)',
  "Fantasy Grounds - Kith'takharos: Seal the Rift (3.5E/PFRPG)",
  'Fantasy Grounds - Dungeonlands: Consort of the Lich Queen (Savage Worlds)',
  'Fantasy Grounds - The Book of the Righteous (5E)',
  'Fantasy Grounds - Iron Heroes Counter Collection (Token Pack)']}