## Modelo de Machine Learning

Importamos librerías

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

Importamos el csv de games ya organizado

In [2]:
games = pd.read_csv('datasets/games_cleaned.csv')

In [3]:
games.head()

Unnamed: 0,publisher,genres,app_name,tags,discount_price,specs,price,early_access,id,developer,metascore,year
0,Kotoshiro,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...",4.49,['Single-player'],4.99,False,761140.0,Kotoshiro,,2018
1,彼岸领域,"['Action', 'Adventure', 'Casual']",弹炸人2222,"['Action', 'Adventure', 'Casual']",0.83,['Single-player'],0.99,False,767400.0,彼岸领域,,2017
2,Stegalosaurus Game Development,"['Action', 'Adventure', 'Casual', 'Indie', 'RPG']",Army of Tentacles: (Not) A Cthulhu Dating Sim:...,"['Action', 'Adventure', 'RPG', 'Indie', 'Casual']",8.79,"['Single-player', 'Steam Achievements']",10.99,False,770380.0,Stegalosaurus Game Development,,2018
3,Apillo,"['Adventure', 'Casual', 'Indie', 'Simulation',...",The first thrust of God - All Aircrafts,"['Strategy', 'Adventure', 'Indie', 'Casual', '...",1.59,"['Single-player', 'Downloadable Content', 'Ste...",1.99,False,773690.0,Apillo,,2018
4,Tero Lunkka,"['Action', 'Adventure', 'Indie']",The Warrior Of Treasures,"['Action', 'Adventure', 'Indie']",0.59,['Single-player'],0.99,False,768060.0,Tero Lunkka,,2018


Previamente se creó otro DataFrame con las columnas para 5 géneros por juego para que el modelo las pueda codificar

In [4]:
games_ml = pd.read_csv('datasets/dataset_modelo_ml.csv')

In [5]:
games_ml.head()

Unnamed: 0,publisher,app_name,discount_price,price,early_access,id,developer,metascore,year,genre1,genre2,genre3,genre4,genre5
0,Kotoshiro,Lost Summoner Kitty,4.49,4.99,False,761140,Kotoshiro,,2018,Action,Casual,Indie,Simulation,Strategy
1,彼岸领域,弹炸人2222,0.83,0.99,False,767400,彼岸领域,,2017,Action,Adventure,Casual,,
2,Stegalosaurus Game Development,Army of Tentacles: (Not) A Cthulhu Dating Sim:...,8.79,10.99,False,770380,Stegalosaurus Game Development,,2018,Action,Adventure,Casual,Indie,RPG
3,Apillo,The first thrust of God - All Aircrafts,1.59,1.99,False,773690,Apillo,,2018,Adventure,Casual,Indie,Simulation,Strategy
4,Tero Lunkka,The Warrior Of Treasures,0.59,0.99,False,768060,Tero Lunkka,,2018,Action,Adventure,Indie,,


Ahora, de este DataFrame quitaremos las columnas [publisher, app_name, developer, metascore] para que el modelo pueda entrenarse

In [6]:
games_ml.drop(columns=['publisher', 'app_name', 'developer', 'metascore'], inplace=True)

In [7]:
games_ml.head()

Unnamed: 0,discount_price,price,early_access,id,year,genre1,genre2,genre3,genre4,genre5
0,4.49,4.99,False,761140,2018,Action,Casual,Indie,Simulation,Strategy
1,0.83,0.99,False,767400,2017,Action,Adventure,Casual,,
2,8.79,10.99,False,770380,2018,Action,Adventure,Casual,Indie,RPG
3,1.59,1.99,False,773690,2018,Adventure,Casual,Indie,Simulation,Strategy
4,0.59,0.99,False,768060,2018,Action,Adventure,Indie,,


Preprocesamos los datos para que queden codificados antes de enviarlos al modelo

In [8]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False, drop='first')

categorical_columns = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5']

encoded_categories = encoder.fit_transform(games_ml[categorical_columns])

encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_columns))

games_encoded = pd.concat([games_ml, encoded_df], axis=1)

games_encoded.drop(categorical_columns, axis=1, inplace=True)

Vemos como se codificó la información

In [9]:
games_encoded.head()

Unnamed: 0,discount_price,price,early_access,id,year,genre1_Adventure,genre1_Casual,genre1_Free to Play,genre1_Indie,genre1_Massively Multiplayer,...,genre4_Simulation,genre4_Sports,genre4_Strategy,genre4_nan,genre5_Massively Multiplayer,genre5_RPG,genre5_Racing,genre5_Simulation,genre5_Strategy,genre5_nan
0,4.49,4.99,False,761140,2018,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.83,0.99,False,767400,2017,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,8.79,10.99,False,770380,2018,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.59,1.99,False,773690,2018,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.59,0.99,False,768060,2018,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


Entrenamos el modelo usando NearestNeighbors con la métrica de distancia coseno

In [10]:
from sklearn.neighbors import NearestNeighbors

n_neighbors=5

nneighbors = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(games_encoded)

Procedemos a predecir

In [11]:
new_data_point = games_encoded.iloc[0].values.reshape(1, -1)
distances, indices = nneighbors.kneighbors(new_data_point)
neighbor_data = games['app_name'].iloc[indices[0]]
neighbor_data



0                     Lost Summoner Kitty
6                                 Souland
117    Snail Trek - Chapter 3: Lettuce Be
114                Ender Story: Chapter 1
92                               Daydream
Name: app_name, dtype: object

In [19]:
registro = games_encoded.loc[games['id'] == 768060].values.reshape(1, -1)
distances, indices = nneighbors.kneighbors(registro)
neighbor_data = games['app_name'].iloc[indices[0]]
neighbor_data



4            The Warrior Of Treasures
1                             弹炸人2222
83    Dolphins-cyborgs and open space
97                  Once in Yaissor 2
5                 Vaporwave Simulator
Name: app_name, dtype: object

In [20]:
def Recomendaciones(id):
    
    import pandas as pd
    games = pd.read_csv('datasets/games_cleaned.csv')
    games_ml = pd.read_csv('datasets/dataset_modelo_ml.csv')
    games_ml.drop(columns=['publisher', 'app_name', 'developer', 'metascore'], inplace=True)
    
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(sparse=False, drop='first')
    categorical_columns = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5']
    encoded_categories = encoder.fit_transform(games_ml[categorical_columns])
    encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_columns))
    games_encoded = pd.concat([games_ml, encoded_df], axis=1)
    games_encoded.drop(categorical_columns, axis=1, inplace=True)

    from sklearn.neighbors import NearestNeighbors
    n_neighbors=5
    nneighbors = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(games_encoded)

    registro = games_encoded.loc[games['id'] == id].values.reshape(1, -1)
    distances, indices = nneighbors.kneighbors(registro)
    neighbor_data = games['app_name'].iloc[indices[0]]
    return neighbor_data