# ETL

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import ast
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Steam Games

### Cargamos el dataframe a usar

In [40]:
# Leer el archivo línea por línea y almacenar los resultados en una lista
filas = []

with open("../data/output_steam_games.json", "r", encoding="Latin-1") as archivo:
    for linea in archivo:
        try:
            objeto_json = json.loads(linea)
            filas.append(objeto_json)
        except json.JSONDecodeError:
            print(f"Error de formato JSON en la línea: {linea}")

#Convertir la lista de objetos JSON en un DataFrame
steam_games = pd.DataFrame(filas)

In [41]:
steam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


In [42]:
steam_games.head(5)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [43]:
steam_games.shape

(120445, 13)

In [44]:
# Vemos que tenemos filas completamente nulas, las cuales procederemos a eliminar

steam_games.dropna(how='all', inplace=True)

In [45]:
steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32135 entries, 88310 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 3.4+ MB


In [46]:
# Pasaremos a buscar mas nulos
steam_games[steam_games['id'].isnull()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,False,,
119271,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,False,,"Rocksteady Studios,Feral Interactive (Mac)"


In [109]:
# Vems si la filla con 'id' nulo pero que tiene valores se duplica en nuestro dataframe
steam_games[steam_games['app_name'] == 'Batman: Arkham City - Game of the Year Edition']

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
89378,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure, Open World, Batman, Stealt...",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260/Batma...,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",http://steamcommunity.com/app/200260/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",19.99,False,200260,"Rocksteady Studios,Feral Interactive (Mac)"


Vemos que la fila en cuestion es un duplicado de otra fila, la cual si tiene el id correspondiente por lo cual pasamos a eliminar las filas con un id nulo.

In [111]:
steam_games.dropna(subset=['id'], inplace=True)

Vamos a seguir trabajando con la columna id, esta vez viendo si encontramos algun valor duplicado.

In [112]:
dup = steam_games['id'].value_counts() > 1
steam_games[steam_games['id'].isin(dup[dup].index)]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer


In [113]:
# Encontramos 2 filas duplicadas por lo cual pasamos a eliminar una
steam_games.drop_duplicates(subset='id', inplace=True, keep='first')
steam_games.shape

(32131, 13)

Pasamos a verificar si aún seguimos con nulos en nuestro DataFrame

In [114]:
steam_games.isnull().sum()

publisher       8050
genres           138
app_name           0
title           2048
url                0
release_date    2066
tags             162
reviews_url        0
specs            669
price           1376
early_access       0
id                 0
developer       3297
dtype: int64

In [117]:
steam_games[steam_games['app_name'].isnull()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer


In [118]:
# Al ver que esta fila posee nulos para developer, app_name, title ni publisher la eliminaremos
steam_games.dropna(subset='app_name', inplace=True)

Verificamos la columna genres si tiene valores faltantes


In [119]:
steam_games['genres'].isnull().sum()

138

In [120]:
steam_games[steam_games['developer'].isna()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88314,,"[Action, Indie, Casual, Sports]",Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,
88321,,[Casual],Icarus Six Sixty Six,,http://store.steampowered.com/app/724910/Icaru...,,[Casual],http://steamcommunity.com/app/724910/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",Free,False,724910,
88329,,"[Early Access, Indie, VR]",After Life VR,,http://store.steampowered.com/app/772590/After...,,"[Early Access, Indie, VR]",http://steamcommunity.com/app/772590/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",4.99,True,772590,
88330,,"[Early Access, Action, Adventure, Indie, Casual]",Kitty Hawk,,http://store.steampowered.com/app/640250/Kitty...,,"[Early Access, Action, Adventure, Indie, Casual]",http://steamcommunity.com/app/640250/reviews/?...,"[Single-player, Steam Leaderboards, HTC Vive, ...",2.99,True,640250,
88332,,"[Early Access, Strategy, Action, Indie, Casual...",Mortars VR,,http://store.steampowered.com/app/711440/Morta...,,"[Early Access, Strategy, Action, Indie, Casual...",http://steamcommunity.com/app/711440/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.99,True,711440,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120428,,"[Design & Illustration, Tutorial]",Robotpencil Presents: Exercise: Brushwork,Robotpencil Presents: Exercise: Brushwork,http://store.steampowered.com/app/775640/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/775640/reviews/?...,,3.99,False,775640,
120429,,"[Design & Illustration, Tutorial]",Robotpencil Presents: Creative Composition,Robotpencil Presents: Creative Composition,http://store.steampowered.com/app/777930/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777930/reviews/?...,,3.99,False,777930,
120430,,[Movie],The Gamble House,The Gamble House,http://store.steampowered.com/app/775370/The_G...,2016-11-19,[Movie],http://steamcommunity.com/app/775370/reviews/?...,[Captions available],4.99,False,775370,
120431,,"[Design & Illustration, Tutorial]",Kalen Chock Presents: 2017 Free Tutorial,Kalen Chock Presents: 2017 Free Tutorial,http://store.steampowered.com/app/777950/Kalen...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777950/reviews/?...,,Free,False,777950,


In [121]:
# al ver la columna tags, vemos que tiene valosres similares a la de la columna genres, primero verificamos cuantos nulos posee y si es redituable
# rellenar la columna genres con datos de la columna tags.
steam_games['tags'].isnull().sum()

162

Para rellenar datos en la columna genres, lo que haremos es hacer una lista con las palabras que aparecen en la columna genres, para luego verificar cuales estan en tags y hacer el reemplazo.

In [122]:
# Creando un conjunto de géneros únicos a partir de los datos de géneros no nulos en 'steam_games'
genres = set(item for val in steam_games['genres'].dropna() for item in val)

# Función para actualizar la columna 'genres' utilizando 'tags'
def actualizar_genres(row):
    genres_row = row['genres']
    tags_row = row['tags']
    if isinstance(tags_row, list):
        if isinstance(genres_row, list):
            for tag in tags_row:
                if tag not in genres_row:
                    genres_row.append(tag)
        else:
            genres_row = tags_row  # Si 'genres' es NaN, se reemplaza con 'tags'
    return genres_row

# Actualizar la columna 'genres' con información de 'tags'
steam_games['genres'] = steam_games.apply(actualizar_genres, axis=1)

In [123]:
steam_games['genres'].isnull().sum()

138

Ahora pasaremos a Eliminar filas Las cuales no son redituables para la consultas de las API que vamos a crear luego.


Estas son: publisher, title, url, discount_price, tags, reviews_url, early_access, specs.

In [124]:
steam = steam_games.drop(columns=['publisher', 'title', 'url', 'tags', 'reviews_url', 'early_access'])

In [125]:
steam.head(10)

Unnamed: 0,genres,app_name,release_date,specs,price,id,developer
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,[Single-player],4.99,761140,Kotoshiro
88311,"[Free to Play, Indie, RPG, Strategy, Card Game...",Ironbound,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,643980,Secret Level SRL
88312,"[Casual, Free to Play, Indie, Simulation, Spor...",Real Pool 3D - Poolians,2017-07-24,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,670290,Poolians.com
88313,"[Action, Adventure, Casual]",弹炸人2222,2017-12-07,[Single-player],0.99,767400,彼岸领域
88314,"[Action, Indie, Casual, Sports]",Log Challenge,,"[Single-player, Full controller support, HTC V...",2.99,773570,
88315,"[Action, Adventure, Simulation, FPS, Shooter, ...",Battle Royale Trainer,2018-01-04,"[Single-player, Steam Achievements]",3.99,772540,Trickjump Games Ltd
88316,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Basic Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",9.99,774276,Poppermost Productions
88317,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Pro Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",18.99,774277,Poppermost Productions
88318,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Legend Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",29.99,774278,Poppermost Productions
88319,"[Casual, Indie, Racing, Simulation]",Race,2018-01-04,"[Single-player, Multi-player, Partial Controll...",,768800,RewindApp


La columna Price tiene algunos valores como Free To Play, vamos a cambiarlo por valores de 0.00 

In [126]:
steam.loc[steam['price'].str.contains("Free", na=False), 'price'] = 0.0

In [127]:
steam.head(10)

Unnamed: 0,genres,app_name,release_date,specs,price,id,developer
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,[Single-player],4.99,761140,Kotoshiro
88311,"[Free to Play, Indie, RPG, Strategy, Card Game...",Ironbound,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",0.0,643980,Secret Level SRL
88312,"[Casual, Free to Play, Indie, Simulation, Spor...",Real Pool 3D - Poolians,2017-07-24,"[Single-player, Multi-player, Online Multi-Pla...",0.0,670290,Poolians.com
88313,"[Action, Adventure, Casual]",弹炸人2222,2017-12-07,[Single-player],0.99,767400,彼岸领域
88314,"[Action, Indie, Casual, Sports]",Log Challenge,,"[Single-player, Full controller support, HTC V...",2.99,773570,
88315,"[Action, Adventure, Simulation, FPS, Shooter, ...",Battle Royale Trainer,2018-01-04,"[Single-player, Steam Achievements]",3.99,772540,Trickjump Games Ltd
88316,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Basic Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",9.99,774276,Poppermost Productions
88317,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Pro Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",18.99,774277,Poppermost Productions
88318,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Legend Pass,2018-01-04,"[Single-player, Multi-player, Online Multi-Pla...",29.99,774278,Poppermost Productions
88319,"[Casual, Indie, Racing, Simulation]",Race,2018-01-04,"[Single-player, Multi-player, Partial Controll...",,768800,RewindApp


Vemos los Datos de tipo string que tenemos en la columna price

In [128]:
# Crear un filtro para seleccionar las filas donde el valor en la columna 'price' es una cadena de texto
filtro_str_price = steam['price'].apply(lambda x: isinstance(x, str))

# Obtener los valores de la columna 'price' que son cadenas de texto
valores_str_price = steam.loc[filtro_str_price, 'price']

# Convertir los valores en una cadena separada por comas para imprimir
valores_str_price_str = ', '.join(valores_str_price)

print(valores_str_price_str)

Install Now, Play WARMACHINE: Tactics Demo, Install Theme, Third-party, Play Now, Play the Demo, Starting at $499.00, Starting at $449.00, Play Now, Third-party


Tenemos valores con precios, como starting at $499.00, los cuales transformamos a solo 449.00 y otros que vamos a ctranformar a 0.00 que son: Install Now, Play WARMACHINE: Tactics Demo, Play Now, Play Now, Install Theme, Third-party.

In [129]:
# Reemplazar los valores que indican un precio inicial con el precio real
steam['price'].replace('Starting at $499.00', 499.0, inplace=True)
steam['price'].replace('Starting at $449.00', 449.0, inplace=True)

# Reemplazar los valores que no indican un precio con 0.00
valores_a_cero = ['Install Now', 'Play WARMACHINE: Tactics Demo', 'Play Now', 'Install Theme', 'Third-party']
steam['price'].replace(valores_a_cero, 0.0, inplace=True)

Se eliminan los valores nulos que restan en price

In [130]:
# Eliminar las filas con valores nulos en la columna 'price'
steam.dropna(subset=['price'], inplace=True)


In [131]:
steam.isnull().sum()

genres           117
app_name           0
release_date    1936
specs            655
price              0
id                 0
developer       3156
dtype: int64

Ahora trabajaremos con la columna realease_date

In [132]:
# Filtrar las filas con fechas que no tienen el formato 'YYYY-MM-DD'
invalid_dates = steam[~steam['release_date'].astype(str).str.match(r'^\d{4}-\d{2}-\d{2}$')]

# Mostrar las fechas que no tienen el formato 'YYYY-MM-DD'
print(invalid_dates['release_date'])

88314     NaN
88321     NaN
88329     NaN
88330     NaN
88332     NaN
         ... 
120381    NaN
120383    NaN
120386    NaN
120387    NaN
120444    NaN
Name: release_date, Length: 1975, dtype: object


In [133]:
# Filtrar las filas con fechas que tienen el formato incorrecto de 'YYYY-MM-DD'
valid_dates = steam[steam['release_date'].astype(str).str.match(r'^\d{4}-\d{2}-\d{2}$')]

# Imprimir las filas con fechas que tienen el formato correcto
print(valid_dates['release_date'])

# Eliminar las filas con fechas que tienen el formato incorrecto
steam = steam[steam['release_date'].astype(str).str.match(r'^\d{4}-\d{2}-\d{2}$')]

88310     2018-01-04
88311     2018-01-04
88312     2017-07-24
88313     2017-12-07
88315     2018-01-04
             ...    
120439    2018-01-04
120440    2018-01-04
120441    2018-01-04
120442    2018-01-04
120443    2017-09-02
Name: release_date, Length: 28780, dtype: object


Ahora nos vamos a eliminar los nulos que nos quedan en el dataFrame

In [134]:
# Eliminar todas las filas que contienen valores nulos en cualquier columna
steam.dropna(inplace=True)


In [135]:
steam.shape

(27181, 7)

En cuanto a la columna release_date, La vamos a reemplazar por una columna llamada año, en la cual solo dejaremos los años de lanzamiento, ya que para las consultas solo nos piden el año.

In [136]:
# Extraer los años de la columna 'release_date' y crear una nueva columna 'año'
steam['año'] = steam['release_date'].str.extract(r'(\d{4})')

# Mostrar las primeras filas del DataFrame con la nueva columna 'año'
steam[['release_date', 'año']].head()


Unnamed: 0,release_date,año
88310,2018-01-04,2018
88311,2018-01-04,2018
88312,2017-07-24,2017
88313,2017-12-07,2017
88315,2018-01-04,2018


In [137]:
# Eliminar la columna 'release_date'
steam.drop(columns=['release_date'], inplace=True)


In [138]:
steam.sample(10)

Unnamed: 0,genres,app_name,specs,price,id,developer,año
90670,"[Action, Indie, Platformer, Anime, Female Prot...",Momodora III,"[Single-player, Steam Achievements, Steam Trad...",1.99,302790,rdein,2014
107595,"[Action, Casual, Indie, Massively Multiplayer,...",Evolvation,"[Multi-player, Online Multi-Player, Steam Achi...",9.99,510840,HyperReuts,2017
114861,[Indie],Ongaku Guy Fawkes Pack,"[Single-player, Downloadable Content, Full con...",0.99,369910,SmashMouth Games Ltd,2015
91681,"[Action, Adventure, Free to Play, FPS, Sci-fi,...",Half-Life 2: Update,"[Single-player, Steam Achievements, Captions a...",0.0,290930,"Filip Victor,Valve",2015
102044,"[Indie, RPG, Strategy, Turn-Based, Fantasy, On...",Fantasy Grounds - The Last Parsec: Leviathan (...,"[Multi-player, Co-op, Cross-Platform Multiplay...",9.99,757760,"SmiteWorks USA, LLC",2017
110226,"[Simulation, Strategy]","Command LIVE - You Brexit, You Fix it!","[Single-player, Downloadable Content, Steam Wo...",2.99,497611,WarfareSims,2016
119079,[Action],Gotham City Impostors Free to Play: Gadget Pac...,"[Multi-player, Downloadable Content, Steam Ach...",3.99,216438,"Monolith Productions, Inc.",2012
119286,"[RPG, Indie, Isometric, Turn-Based, Fantasy]",Avernum 5,[Single-player],4.99,206040,Spiderweb Software,2012
98262,"[Casual, Indie]",Tricky Towers - Holographic Bricks,"[Single-player, Multi-player, Online Multi-Pla...",0.99,570362,WeirdBeard,2017
91907,"[Casual, Simulation]",Rocksmith® 2014 – Dethklok Song Pack,"[Single-player, Shared/Split Screen, Downloada...",7.99,342825,Ubisoft - San Francisco,2015


In [139]:
# Obtener el tamaño en bytes del DataFrame
tamaño_bytes = steam.memory_usage().sum()

# Convertir el tamaño a kilobytes dividiendo por 1024
tamaño_kb = tamaño_bytes / 1024

print("Tamaño del DataFrame:", tamaño_kb, "KB")

Tamaño del DataFrame: 1698.8125 KB


In [140]:
steam.rename(columns={'id': 'item_id'}, inplace=True)

Ahora Exportaremos el CSV del ETL

In [141]:
steam.to_csv('../datasets/steam_games.csv',index=False)

## User Items

Cargamos el DataFrame a usar


In [77]:
# Leer el archivo línea por línea y almacenar los resultados en una lista
filas = list()
with open("../data/australian_users_items.json", "r", encoding="Latin-1") as archivo:
    for linea in archivo.readlines():
        filas.append(ast.literal_eval(linea))

steam_items = pd.DataFrame(filas)
steam_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [78]:
# Exploramos la columna items y restablecemos el indice
steam_items = steam_items.explode('items').reset_index()
steam_items = steam_items.drop(columns='index')
steam_items.head(5)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '10', 'item_name': 'Counter-Strike..."
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '20', 'item_name': 'Team Fortress ..."
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '30', 'item_name': 'Day of Defeat'..."
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '40', 'item_name': 'Deathmatch Cla..."
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'item_id': '50', 'item_name': 'Half-Life: Opp..."


In [79]:
# Normalizar la columna 'items' y agregar las columnas resultantes al DataFrame original
steam_items = pd.concat([steam_items.drop(columns=['items']), pd.json_normalize(steam_items['items'])], axis=1)
steam_items


Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...,...
5170010,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,373330,All Is Dust,0.0,0.0
5170011,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,388490,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,521570,You Have 10 Seconds 2,4.0,4.0
5170013,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,519140,Minds Eyes,3.0,3.0


Ahora ya con todo extraido en el dataframe, empezaremos a ver con detalle el dataframe, buscando nulos y duplicados

In [80]:
steam_items.shape

(5170015, 8)

In [81]:
steam_items.isnull().sum()

user_id                 0
items_count             0
steam_id                0
user_url                0
item_id             16806
item_name           16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

In [82]:
# Encuentra las filas duplicadas en el DataFrame
filas_duplicadas = steam_items[steam_items.duplicated()]

# Obtiene la cantidad de filas duplicadas
cantidad_duplicados = filas_duplicadas.shape[0]

# Muestra la cantidad de filas duplicadas
print("Cantidad de filas duplicadas:", cantidad_duplicados)


Cantidad de filas duplicadas: 59196


Después de encontrar estas filas duplicadas, vamos a eliminarlas

In [83]:
steam_items_sd = steam_items.drop_duplicates()

In [84]:
steam_items_sd.shape

(5110819, 8)

In [85]:
# Encuentra las filas con valores nulos
filas_con_nulos = steam_items[steam_items.isnull().any(axis=1)]

# Muestra las primeras filas con valores nulos
filas_con_nulos.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
3733,Wackky,0,76561198039117046,http://steamcommunity.com/id/Wackky,,,,
3849,76561198079601835,0,76561198079601835,http://steamcommunity.com/profiles/76561198079...,,,,
6019,hellom8o,0,76561198117222320,http://steamcommunity.com/id/hellom8o,,,,
6523,starkillershadow553,0,76561198059648579,http://steamcommunity.com/id/starkillershadow553,,,,
7237,darkenkane,0,76561198058876001,http://steamcommunity.com/id/darkenkane,,,,


In [86]:
steam_items_final = steam_items_sd.dropna()

In [87]:
steam_items_final.shape

(5094105, 8)

Eliminaremos las columnas que no son relevantes

In [88]:
# Eliminar las columnas especificadas
steam_items_final = steam_items_final.drop(columns=['items_count', 'playtime_2weeks', 'user_url', 'item_name'])
steam_items_final.head(5)

Unnamed: 0,user_id,steam_id,item_id,playtime_forever
0,76561197970982479,76561197970982479,10,6.0
1,76561197970982479,76561197970982479,20,0.0
2,76561197970982479,76561197970982479,30,7.0
3,76561197970982479,76561197970982479,40,0.0
4,76561197970982479,76561197970982479,50,0.0


In [89]:
# Contar las filas con valores 0 en la columna playtime_forever
cantidad_filas_con_0 = len(steam_items_final[steam_items_final['playtime_forever'] == 0])

cantidad_filas_con_0

1847730

Para las consultas de las API, nos piden mayoria de tiempo para estos juegos, por lo que, como no son relevantes para el MVP los eliminaremos

In [90]:
# Eliminar las filas con valor 0 en la columna playtime_forever
steam_items_final = steam_items_final[steam_items_final['playtime_forever'] != 0]

In [91]:
steam_items_final['playtime_forever'].describe()

count    3.246375e+06
mean     1.554340e+03
std      6.717381e+03
min      1.000000e+00
25%      4.400000e+01
50%      2.050000e+02
75%      8.080000e+02
max      6.427730e+05
Name: playtime_forever, dtype: float64

Ahora lo que haremos Convertiremos los valores de playtime_forever a horas, guardandolo en una nuevo columna y eliminando la playtime_forever

In [92]:
# Convertir minutos a horas y guardar en una nueva columna
steam_items_final['horas'] = steam_items_final['playtime_forever'] / 60

# Mostrar las primeras filas del DataFrame con la nueva columna
print(steam_items_final.head())

              user_id           steam_id item_id  playtime_forever      horas
0   76561197970982479  76561197970982479      10               6.0   0.100000
2   76561197970982479  76561197970982479      30               7.0   0.116667
8   76561197970982479  76561197970982479     300            4733.0  78.883333
9   76561197970982479  76561197970982479     240            1853.0  30.883333
10  76561197970982479  76561197970982479    3830             333.0   5.550000


In [93]:
steam_items_final.drop(columns=['playtime_forever'], inplace=True)

In [94]:
steam_items_final.shape

(3246375, 4)

Ahora ya con todo listo, exportaremos el dataFrame a un archivo

In [95]:
steam_items_final.to_parquet('../datasets/users_items.parquet', index=False)

## User Reviews

Cargamos el dataFrame de users_reviews

In [96]:
filas = []
with open("../data/australian_user_reviews.json", "r", encoding="Latin-1") as archivo:
    for linea in archivo.readlines():
        filas.append(ast.literal_eval(linea))

steam_reviews = pd.DataFrame(filas)
steam_reviews.head(5)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


Vamos a hacer un tratamiento parecido al de user_items, con la columna items


In [97]:
# Exploramos la columna items y restablecemos el indice
steam_reviews = steam_reviews.explode('reviews').reset_index()
steam_reviews = steam_reviews.drop(columns='index')
steam_reviews.head(5)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
3,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
4,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."


In [98]:
# Normalizar la columna 'items' y agregar las columnas resultantes al DataFrame original
steam_reviews = pd.concat([steam_reviews.drop(columns=['reviews']), pd.json_normalize(steam_reviews['reviews'])], axis=1)
steam_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


Examinamos el Datafreame buscando las filas con valores nulos

In [99]:
steam_reviews.isnull().sum()

user_id         0
user_url        0
funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

In [100]:
# Eliminar filas con valores nulos
steam_reviews = steam_reviews.dropna()

# Verificar si se eliminaron correctamente los valores nulos
steam_reviews.isnull().sum()

user_id        0
user_url       0
funny          0
posted         0
last_edited    0
item_id        0
helpful        0
recommend      0
review         0
dtype: int64

Buscamos valores duplicados en el dataframe y los eliminamos

In [101]:
# Buscar registros duplicados
duplicados = steam_reviews[steam_reviews.duplicated()]

# Eliminar registros duplicados
df_UserReviews_sin_duplicados = steam_reviews.drop_duplicates()

# Verificar la forma del DataFrame después de eliminar los duplicados
df_UserReviews_sin_duplicados.shape


(58431, 9)

antes de hacer el analisis de sentimiento, vamos a eliminar las columnas que no son relevantes

In [102]:
steam_reviews_copy = steam_reviews.copy()
steam_reviews_copy.drop(columns=['user_url','funny','helpful','last_edited'], inplace=True)

In [103]:
steam_reviews_copy

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
2,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
4,js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
59328,76561198312638244,Posted July 10.,70,True,a must have classic from steam definitely wort...
59329,76561198312638244,Posted July 8.,362890,True,this game is a perfect remake of the original ...
59330,LydiaMorley,Posted July 3.,273110,True,had so much fun plaing this and collecting res...
59331,LydiaMorley,Posted July 20.,730,True,:D


Ahora empezaremos con la parte del analisis de sentimiento para las reviews de los usuarios

In [104]:
# Descargar el lexicon de VADER
nltk.download('vader_lexicon')

# Crear una instancia del analizador de sentimientos
model_sentimiento = SentimentIntensityAnalyzer()

def analizador(review):
    # Obtener el puntaje de sentimiento usando SentimentIntensityAnalyzer
    sentimiento_score = model_sentimiento.polarity_scores(review)
    
    # Clasificar el sentimiento
    if review and not pd.isnull(review):
        compound_score = sentimiento_score['compound']
        if compound_score >= 0.5:
            return 2  # Sentimiento positivo
        elif compound_score <= -0.5:
            return 0  # Sentimiento negativo
    return 1  # Sentimiento neutral si no se cumple ninguna de las condiciones anteriores

[nltk_data] Downloading package vader_lexicon to C:\Users\Mi
[nltk_data]     Equipo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [105]:
# Crear una copia del DataFrame para evitar la advertencia SettingWithCopyWarning
user_review_explode_copy = steam_reviews_copy.copy()

# Aplicar la función de análisis de sentimiento y asignar el resultado a una nueva columna
user_review_explode_copy['sentiment_analysis'] = user_review_explode_copy['review'].apply(analizador)

# Eliminar la columna 'review' del DataFrame copiado
user_review_explode_copy.drop(columns='review', inplace=True)

In [106]:
user_review_explode_copy

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,"Posted November 5, 2011.",1250,True,2
1,76561197970982479,"Posted July 15, 2011.",22200,True,1
2,76561197970982479,"Posted April 21, 2011.",43110,True,2
3,js41637,"Posted June 24, 2014.",251610,True,2
4,js41637,"Posted September 8, 2013.",227300,True,2
...,...,...,...,...,...
59328,76561198312638244,Posted July 10.,70,True,2
59329,76561198312638244,Posted July 8.,362890,True,2
59330,LydiaMorley,Posted July 3.,273110,True,2
59331,LydiaMorley,Posted July 20.,730,True,2


In [107]:
conteo_sentimientos = user_review_explode_copy['sentiment_analysis'].value_counts()
print(conteo_sentimientos)

sentiment_analysis
2    28418
1    25885
0     5002
Name: count, dtype: int64


Ahora ya que tenemos las columnas necesarias y el analisis de sentimiento procederemos a exportar el dataframe

In [108]:
user_review_explode_copy.to_parquet('../datasets/user_reviews.parquet',index=False)