# ETL y EDA archivo user_reviews

In [2]:
# importar módulos
from ast import literal_eval
import datetime as dt
import pandas as pd
import fastparquet
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_colwidth', None)

In [4]:
# load the dataset
# NOTE(review): the path contains a double slash ("//") — confirm it is intentional
df_users_reviews = pd.read_parquet("archivos_originales//df_user_reviews.parquet")

In [5]:
# visualizar contenido
df_users_reviews.head(2)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'helpful': 'No ratings yet', 'i..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'helpful': '15 of 20 people (75..."


In [None]:
# función para obtener datos varios de los dataframe(info, describe, null, duplicated)
def print_df_info(df):
    """Print a quick structural summary of a DataFrame.

    Prints, in order: the ``DataFrame.info()`` report, the number of rows
    and columns, and the per-column count of missing values.

    describe() and duplicated() summaries are intentionally omitted: they
    raised errors on the all-object user_reviews frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize. It is not modified.

    Returns
    -------
    None
    """
    separator = "======================================================="

    print("** Info - Información general **")
    # info() writes its own report and returns None; wrapping it in print()
    # appended a stray "None" line to the output.
    df.info()

    print(separator)

    print("** Shape **")
    print(f"Filas: {df.shape[0]} - Columnas: {df.shape[1]}")

    print(separator)

    print("** NaN - Nulos **")
    print(df.isna().sum())

In [None]:
# describe() and duplicated() were disabled in print_df_info because they raised errors: all columns are of object dtype
print_df_info(df_users_reviews)

** Info - Información general **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB
None
** Shape **
Filas: 25799 - Columnas: 3
** NaN - Nulos **
user_id     0
user_url    0
reviews     0
dtype: int64


In [None]:
# visualizar los tipos de datos de las columnas
df_users_reviews.dtypes

user_id     object
user_url    object
reviews     object
dtype: object

In [None]:
# visualizar las columnas
df_users_reviews.columns

Index(['user_id', 'user_url', 'reviews'], dtype='object')

In [6]:
# función para desanidar y separar en columnas los datos de la columna reviews
def extract_reviews_data(df):
    """Flatten the nested ``reviews`` column into one row per review.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``reviews`` column. Each cell is assumed to hold a
        list-like of dict records — TODO confirm against the source file.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` joined (on the index) with one column
        per key found in the review records. The index labels of ``df`` are
        repeated once per review, so callers may want to reset the index.
    """
    # One row per review entry; exploded rows keep their parent's index label.
    per_review = df['reviews'].explode()
    # Expand each entry into columns by passing it through pd.Series.
    review_columns = per_review.apply(pd.Series)
    # The index join repeats the remaining columns for every review of a row.
    return df.drop('reviews', axis=1).join(review_columns)

In [7]:
# apply the function to a copy of the loaded frame and rebind the name to the flattened result
df_users_reviews = extract_reviews_data(df_users_reviews.copy())

In [8]:
# visualizar el dataframe creado
df_users_reviews.head(3)

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...,


In [None]:
df_users_reviews[df_users_reviews['posted'] == "Posted November 5, 2011."]

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,
10385,MonsOlympus,http://steamcommunity.com/id/MonsOlympus,,No ratings yet,620,,"Posted November 5, 2011.",True,"Awesomely well done game, solid fun and a fant...",
14682,xychome,http://steamcommunity.com/id/xychome,,No ratings yet,550,,"Posted November 5, 2011.",True,IT'S TIME TO LEFT 4 ALIVE 2,


In [9]:
# reset the index and discard the old labels (after flattening, each user's label repeats once per review)
df_users_reviews = df_users_reviews.reset_index(drop=True)

In [10]:
# Clean the raw "posted" text (e.g. "Posted November 5, 2011."): trim surrounding
# whitespace, drop the leading "Posted " prefix, remove commas and periods, and
# join the remaining parts with hyphens -> "November-5-2011".
# regex=False makes every replacement a literal match; with the default, "." is
# treated as a regex wildcard on some pandas versions and raises a FutureWarning
# on others.
df_users_reviews['posted'] = (
    df_users_reviews['posted']
    .str.strip()
    .str[7:]  # len("Posted ") == 7
    .str.replace(",", "", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace(" ", "-", regex=False)
)

  df_users_reviews['posted'] = df_users_reviews['posted'].str.replace(".","")


In [11]:
df_users_reviews["posted"]

0         November-5-2011
1            July-15-2011
2           April-21-2011
3            June-24-2014
4        September-8-2013
               ...       
59328             July-10
59329              July-8
59330              July-3
59331             July-20
59332              July-2
Name: posted, Length: 59333, dtype: object

In [12]:
# visualizar datos
df_users_reviews[df_users_reviews['posted'] == "November-5-2011"]

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,November-5-2011,True,Simple yet with great replayability. In my opi...,
26273,MonsOlympus,http://steamcommunity.com/id/MonsOlympus,,No ratings yet,620,,November-5-2011,True,"Awesomely well done game, solid fun and a fant...",
36166,xychome,http://steamcommunity.com/id/xychome,,No ratings yet,550,,November-5-2011,True,IT'S TIME TO LEFT 4 ALIVE 2,


In [13]:
# split "posted" on "-" into month, day and year columns (at most 3 parts);
# values without a year part (e.g. "July-10") leave year missing
df_users_reviews[["month", "day", "year"]] = df_users_reviews["posted"].str.split('-', n=2, expand=True)

In [15]:
# Diccionario de correspondencia entre meses y números
# English month name -> zero-padded, two-digit month number as a string.
month_to_number = {
    month_name: f"{position:02d}"
    for position, month_name in enumerate(
        (
            'January', 'February', 'March', 'April',
            'May', 'June', 'July', 'August',
            'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

In [16]:
# use .map() to translate month names into two-digit month-number strings
df_users_reviews['number_month'] = df_users_reviews['month'].map(month_to_number)

In [17]:
# verificar el resultado
df_users_reviews.head()

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0,month,day,year,number_month
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,November-5-2011,True,Simple yet with great replayability. In my opi...,,November,5,2011,11
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,22200,,July-15-2011,True,It's unique and worth a playthrough.,,July,15,2011,7
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,43110,,April-21-2011,True,Great atmosphere. The gunplay can be a bit chu...,,April,21,2011,4
3,js41637,http://steamcommunity.com/id/js41637,,15 of 20 people (75%) found this review helpful,251610,,June-24-2014,True,I know what you think when you see this title ...,,June,24,2014,6
4,js41637,http://steamcommunity.com/id/js41637,,0 of 1 people (0%) found this review helpful,227300,,September-8-2013,True,For a simple (it's actually not all that simpl...,,September,8,2013,9


In [18]:
# verificar nulos en la columna year
print(df_users_reviews["year"].isnull().sum())
df_users_reviews.shape

10147


(59333, 14)

In [19]:
# check which year is the most frequent
df_users_reviews["year"].mode()

0    2014
Name: year, dtype: object

In [20]:
# Given the size of the dataset, missing years are imputed with the most frequent year.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that in-place form is chained assignment, which is deprecated in
# pandas 2.x and does not update the frame under Copy-on-Write.
df_users_reviews["year"] = df_users_reviews["year"].fillna(df_users_reviews["year"].mode()[0])

In [21]:
# verifico
df_users_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user_id       59333 non-null  object 
 1   user_url      59333 non-null  object 
 2   funny         59305 non-null  object 
 3   helpful       59305 non-null  object 
 4   item_id       59305 non-null  object 
 5   last_edited   59305 non-null  object 
 6   posted        59305 non-null  object 
 7   recommend     59305 non-null  object 
 8   review        59305 non-null  object 
 9   0             0 non-null      float64
 10  month         59305 non-null  object 
 11  day           59305 non-null  object 
 12  year          59333 non-null  object 
 13  number_month  59305 non-null  object 
dtypes: float64(1), object(13)
memory usage: 6.3+ MB


In [22]:
# recommend is going to be converted to another type, so check its current dtype first
print(df_users_reviews["recommend"].dtype)

# sanity check: number of values, number of unique values and frequency of the top value
df_users_reviews["recommend"].describe()

object


count     59305
unique        2
top        True
freq      52473
Name: recommend, dtype: object

In [23]:
# conteo de nulos
df_users_reviews.isnull().sum()

user_id             0
user_url            0
funny              28
helpful            28
item_id            28
last_edited        28
posted             28
recommend          28
review             28
0               59333
month              28
day                28
year                0
number_month       28
dtype: int64

In [24]:
# drop, in place, the rows whose recommend value is null
df_users_reviews.dropna(subset=["recommend"], inplace = True)

In [None]:
# verificar eliminación de nulos
df_users_reviews.isnull().sum()

user_id             0
user_url            0
funny               0
helpful             0
item_id             0
last_edited         0
posted              0
recommend           0
review              0
0               59305
month               0
day                 0
year                0
number_month        0
dtype: int64

In [25]:
# Recode the recommend flag as text digits: "True" -> "1" and "False" -> "0"
df_users_reviews["recommend"] = (
    df_users_reviews["recommend"]
    .astype(str)
    .str.replace("True", "1")
    .str.replace("False", "0")
)

In [26]:
# cast recommend from str to int
df_users_reviews["recommend"] = df_users_reviews["recommend"].astype(int)

In [27]:
# verifico el cambio
df_users_reviews["recommend"].dtype

dtype('int64')

In [30]:
# Build a "year-number_month-day" string for every row from the three date-part columns
df_users_reviews['date_posted'] = [
    f"{year}-{month}-{day}"
    for year, month, day in zip(
        df_users_reviews['year'],
        df_users_reviews['number_month'],
        df_users_reviews['day'],
    )
]

In [31]:
# verificar la combinación y creación de la columna
df_users_reviews.head(3)

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0,month,day,year,number_month,date_posted
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,November-5-2011,1,Simple yet with great replayability. In my opi...,,November,5,2011,11,2011-11-5
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,22200,,July-15-2011,1,It's unique and worth a playthrough.,,July,15,2011,7,2011-07-15
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,43110,,April-21-2011,1,Great atmosphere. The gunplay can be a bit chu...,,April,21,2011,4,2011-04-21


Al convertir la columna a fecha se producen errores en las filas cuyos valores no son fechas válidas.  
Por ejemplo, la fecha 29 de febrero de 2014 se detecta como dato inválido porque, según el mensaje de error,  
ese año no fue bisiesto.  

Se decide eliminar las filas con fechas no válidas: en los datos malformados como `2014-nan-None` las demás  
columnas son mayormente NaN, y los registros con fecha 29 de febrero son pocos y no se tiene certeza de si  
fueron un error de ingreso de datos.

In [32]:
# Drop every row whose date_posted is not a real calendar date, instead of matching
# only the two literals seen so far ('2014-nan-None' and '2014-02-29').
# errors="coerce" turns unparseable values into NaT so they can be masked out.
is_valid_date = pd.to_datetime(df_users_reviews["date_posted"], errors="coerce").notna()
# .copy() yields an independent frame, so later column assignments do not act on a
# slice of the previous one.
df_users_reviews = df_users_reviews.loc[is_valid_date].copy()

In [33]:
# convert the date_posted column to datetime
df_users_reviews['date_posted'] = pd.to_datetime(df_users_reviews['date_posted'])

In [34]:
# verificar los datos
df_users_reviews.head(2)

Unnamed: 0,user_id,user_url,funny,helpful,item_id,last_edited,posted,recommend,review,0,month,day,year,number_month,date_posted
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,1250,,November-5-2011,1,Simple yet with great replayability. In my opi...,,November,5,2011,11,2011-11-05
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,No ratings yet,22200,,July-15-2011,1,It's unique and worth a playthrough.,,July,15,2011,7,2011-07-15


In [37]:
df_users_reviews.dtypes

user_id                 object
user_url                object
funny                   object
helpful                 object
item_id                 object
last_edited             object
posted                  object
recommend                int64
review                  object
0                      float64
month                   object
day                     object
year                    object
number_month            object
date_posted     datetime64[ns]
dtype: object

In [64]:
# visualizo una fecha aleatoria
df_users_reviews[df_users_reviews['date_posted'] == "2011-11-05"]

Unnamed: 0,user_id,user_url,item_id,recommend,review,date_posted,sentiment
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,1,Simple yet with great replayability. In my opi...,2011-11-05,1
26273,MonsOlympus,http://steamcommunity.com/id/MonsOlympus,620,1,"Awesomely well done game, solid fun and a fant...",2011-11-05,1
36166,xychome,http://steamcommunity.com/id/xychome,550,1,IT'S TIME TO LEFT 4 ALIVE 2,2011-11-05,1


In [57]:
df_users_reviews[df_users_reviews['user_id'] == "76561197970982479"]

Unnamed: 0,user_id,user_url,item_id,recommend,review,date_posted,sentiment
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,1,Simple yet with great replayability. In my opi...,2011-11-05,1
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,1,It's unique and worth a playthrough.,2011-07-15,1
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,1,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,1


In [62]:
df_users_reviews.shape

(58406, 7)

In [60]:
# remove fully duplicated rows in place
df_users_reviews.drop_duplicates(inplace=True)

In [61]:
df_users_reviews.duplicated().sum()

0

In [63]:
df_users_reviews.isnull().sum()

user_id        0
user_url       0
item_id        0
recommend      0
review         0
date_posted    0
sentiment      0
dtype: int64

In [None]:
# visualizar columnas
df_users_reviews.columns

Index([     'user_id',     'user_url',        'funny',      'helpful',
            'item_id',  'last_edited',       'posted',    'recommend',
             'review',              0,        'month',          'day',
               'year', 'number_month',  'date_posted'],
      dtype='object')

In [40]:
# drop the columns that are no longer needed (0 is the integer-named column)
df_users_reviews = df_users_reviews.drop(
    columns=[
        'funny',
        'helpful',
        'last_edited',
        'posted',
        0,
        'month',
        'day',
        'year',
        'number_month',
    ]
)

In [41]:
# verifico los datos
df_users_reviews.head(2)

Unnamed: 0,user_id,user_url,item_id,recommend,review,date_posted
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,1,Simple yet with great replayability. In my opi...,2011-11-05
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,1,It's unique and worth a playthrough.,2011-07-15


In [None]:
# resúmen del dataframe
df_users_reviews.info()
df_users_reviews.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59280 entries, 0 to 59332
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      59280 non-null  object        
 1   user_url     59280 non-null  object        
 2   item_id      59280 non-null  object        
 3   recommend    59280 non-null  int64         
 4   review       59280 non-null  object        
 5   date_posted  59280 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 3.2+ MB


user_id        0
user_url       0
item_id        0
recommend      0
review         0
date_posted    0
dtype: int64

In [None]:
# to_parquet writes the file and returns None when given a path, so the result is
# not assigned (the old df_users_reviews_2 name only ever held None).
df_users_reviews.to_parquet("archivos_parquet/df_users_reviews_2.parquet")

**Comienzo creación columnas análisis de sentimientos**

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
#nltk.download('stopwords')

In [None]:
# show the distribution of the values in recommend
fig = px.histogram(df_users_reviews, x = "recommend")
fig.update_layout(title_text = "Recomendación del Juego")
fig.show()

In [43]:
# import nltk para análisis de sentimientos
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

**Descargar los archivos de datos necesarios para el análisis de sentimientos con VADER.**    
Estos archivos de datos contienen puntuaciones de sentimiento pre-entrenadas para palabras y frases,  
que son utilizadas por el algoritmo VADER para calcular la intensidad del sentimiento de un texto dado.

In [44]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [45]:
# create the SentimentIntensityAnalyzer instance used by get_sentiment
sia = SentimentIntensityAnalyzer()

In [46]:
# función para realizar el análisis de sentimientos
def get_sentiment(text):
    """Classify a text by the sign of its VADER compound score.

    Relies on the notebook-level ``sia`` analyzer instance.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    int
        1 if the compound score is positive, -1 if it is negative,
        0 otherwise (neutral).
    """
    compound = sia.polarity_scores(text)['compound']
    if compound > 0:
        return 1  # positive
    return -1 if compound < 0 else 0  # negative / neutral

In [47]:
# apply get_sentiment to every value of the review column
df_users_reviews['sentiment'] = df_users_reviews['review'].apply(get_sentiment)

In [48]:
df_users_reviews.head()

Unnamed: 0,user_id,user_url,item_id,recommend,review,date_posted,sentiment
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,1,Simple yet with great replayability. In my opi...,2011-11-05,1
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,1,It's unique and worth a playthrough.,2011-07-15,1
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,1,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,1
3,js41637,http://steamcommunity.com/id/js41637,251610,1,I know what you think when you see this title ...,2014-06-24,1
4,js41637,http://steamcommunity.com/id/js41637,227300,1,For a simple (it's actually not all that simpl...,2013-09-08,1


In [None]:
# revisar nulos
df_users_reviews["review"].isnull().sum()

0

In [None]:
# visualizar un dato
print(df_users_reviews.iloc[29, 4:6].to_string())

review         killed the emperor  nobody cared and got away with   accidentally killed  chicken and everybody decided  gang      
date_posted                                                                                                    2015-02-01 00:00:00


**Visualizar datos positivos, negativos y neutros**  
Se visualizaron los datos de cada categoría; con esto comprobamos que hay más reseñas positivas que negativas  
y neutras. Estos datos ya se habían obtenido mediante el gráfico.

In [51]:
df_users_reviews[df_users_reviews['sentiment'] == 1]

Unnamed: 0,user_id,user_url,item_id,recommend,review,date_posted,sentiment
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,1,Simple yet with great replayability. In my opi...,2011-11-05,1
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,1,It's unique and worth a playthrough.,2011-07-15,1
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,1,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,1
3,js41637,http://steamcommunity.com/id/js41637,251610,1,I know what you think when you see this title ...,2014-06-24,1
4,js41637,http://steamcommunity.com/id/js41637,227300,1,For a simple (it's actually not all that simpl...,2013-09-08,1
...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,70,1,a must have classic from steam definitely wort...,2014-07-10,1
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,362890,1,this game is a perfect remake of the original ...,2014-07-08,1
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,273110,1,had so much fun plaing this and collecting res...,2014-07-03,1
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,730,1,:D,2014-07-20,1


In [54]:
df_users_reviews.dtypes

user_id                object
user_url               object
item_id                 int64
recommend               int64
review                 object
date_posted    datetime64[ns]
sentiment               int64
dtype: object

In [53]:
# cast item_id to int
df_users_reviews['item_id'] = df_users_reviews['item_id'].astype(int)

In [65]:
# export the final frame; to_parquet writes the file and returns None when given a
# path, so the result is not assigned (df_users_reviews_2 only ever held None).
df_users_reviews.to_parquet("datasets/df_users_reviews_2.parquet")