Importaciones y configuración inicial

In [28]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd

# General configuration
SEED = 5200  # random seed passed to the subsampling step
N = 5200     # number of rows in the subset (original note: the last digit of the ID number, "cédula", is 2)
RAW_PATH = "../data/googleplaystore.csv"
SUBSET_PATH = f"../data/subset/subset_googleplaystore_rows{N}_seed{SEED}.csv"
CLEANED_SUBSET_PATH = f"../data/subset/cleaned_subset_googleplaystore_rows{N}_seed{SEED}.csv"
META_PATH = "../data/subset/metadata.json"

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

Cargar el dataset original y visualizarlo

In [29]:
# Read the raw CSV from RAW_PATH, report its dimensions and preview the first rows.
df = pd.read_csv(filepath_or_buffer=RAW_PATH)
print("Shape original:", df.shape)
df.head(n=5)

Shape original: (10841, 13)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


Limpieza básica de la base de datos original

In [30]:
# Drop rows containing any missing value, then drop exact duplicate rows.
df = df.dropna().drop_duplicates()

# Keep only rows whose 'Size' is not the placeholder 'Varies with device'
# (the comparison is case-insensitive).
has_concrete_size = df['Size'].str.lower() != 'varies with device'
df = df[has_concrete_size]

# NOTE: the shape printed here also reflects the 'Size' filter above,
# not only the null/duplicate removal mentioned in the message.
print("Shape después de limpiar nulos y duplicados:", df.shape)

Shape después de limpiar nulos y duplicados: (7418, 13)


Generación del subset reproducible

In [31]:
# Reproducible subsample: draw N rows from the cleaned frame using SEED,
# then renumber the index from 0.
subset = df.sample(n=N, random_state=SEED).reset_index(drop=True)

# Make sure the output directories exist before writing; otherwise to_csv /
# open fail when the subset folder has not been created yet (e.g. on a
# fresh checkout).
for output_path in (SUBSET_PATH, META_PATH):
    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs('') would raise for bare file names
        os.makedirs(output_dir, exist_ok=True)

# Save the subset to CSV (without the index column).
subset.to_csv(SUBSET_PATH, index=False, encoding='utf-8')

# Save metadata describing how the subset was generated.
metadata = {
    "rows": N,
    "seed": SEED,
    "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "source_file": RAW_PATH.split("/")[-1],
}
# Explicit encoding so the file is written the same way on every platform,
# matching the UTF-8 encoding used for the CSV above.
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print("Subset guardado en:", SUBSET_PATH)
subset.head()

Subset guardado en: ../data/subset/subset_googleplaystore_rows5200_seed5200.csv


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Battery Doctor-Battery Life Saver & Battery Co...,TOOLS,4.5,8190074,17M,"100,000,000+",Free,0,Everyone,Tools,"June 1, 2018",6.24,4.0 and up
1,Keypad Lock Screen,TOOLS,4.2,428581,4.7M,"10,000,000+",Free,0,Everyone,Tools,"July 17, 2018",1.33,4.0 and up
2,[Substratum] K-Manager for K-Klock,PERSONALIZATION,4.5,14,11M,"1,000+",Free,0,Everyone,Personalization,"July 31, 2018",31,7.0 and up
3,Asana: organize team projects,BUSINESS,4.3,20815,10M,"1,000,000+",Free,0,Everyone,Business,"July 26, 2018",6.4.4,5.0 and up
4,Superheroes Wallpapers | 4K Backgrounds,ART_AND_DESIGN,4.7,7699,4.2M,"500,000+",Free,0,Everyone 10+,Art & Design,"July 12, 2018",2.2.6.2,4.0.3 and up


Limpieza en profundidad del subconjunto de datos

In [32]:
# --- 'Last Updated' -> age in days ---
# Parse the column as datetimes; values that cannot be parsed become NaT.
subset['Last Updated'] = pd.to_datetime(subset['Last Updated'], errors='coerce')

# Whole days elapsed between the last update and the moment this cell runs.
# NOTE(review): because the reference point is the current time, the values
# of 'Days_Since_Update' change depending on when the notebook is executed.
current_time = pd.Timestamp.now()
time_since_update = current_time - subset['Last Updated']
subset['Days_Since_Update'] = time_since_update.dt.days

# --- Remove columns that are not needed downstream ---
# errors='ignore' makes the drop tolerate columns that are already absent.
unneeded_columns = ['App', 'Last Updated', 'Current Ver', 'Android Ver']
subset = subset.drop(columns=unneeded_columns, errors='ignore')

# --- Limpieza y conversión de la columna 'Size' ---
def parse_size(value):
    """Convert a size string such as '19M' or '512k' to megabytes.

    Parameters
    ----------
    value : object
        Raw cell value. Only strings ending in 'M' (taken as megabytes) or
        'k' (taken as kilobytes) are converted. Surrounding whitespace and
        thousands separators (',') are ignored.

    Returns
    -------
    float
        The size in MB (kilobyte values are divided by 1024), or ``np.nan``
        when ``value`` is not a string, has no recognised unit suffix, or
        its numeric part cannot be parsed.
    """
    if not isinstance(value, str):
        return np.nan

    text = value.strip()
    if text.endswith('M'):
        divisor = 1
    elif text.endswith('k'):
        divisor = 1024  # KB -> MB
    else:
        return np.nan

    # Strip only the trailing unit character, then parse the number. A
    # malformed number yields NaN (like every other unparseable input)
    # instead of raising and aborting the whole .apply().
    try:
        number = float(text[:-1].replace(',', ''))
    except ValueError:
        return np.nan
    return number / divisor

# Derive the numeric 'Size_MB' column from the raw 'Size' strings, then
# discard the raw column.
size_in_mb = subset['Size'].apply(parse_size)
subset['Size_MB'] = size_in_mb
subset = subset.drop(columns=['Size'])

# --- Limpieza y conversión de la columna 'Installs' ---
def parse_installs(value):
    """Convert an installs string such as '10,000+' to an integer.

    Parameters
    ----------
    value : object
        Raw cell value. For strings, the '+' sign, thousands separators
        (',') and surrounding whitespace are removed before parsing.

    Returns
    -------
    int or float
        The install count as ``int``, or ``np.nan`` when ``value`` is not a
        string or does not contain a valid integer.
    """
    if not isinstance(value, str):
        return np.nan

    digits = value.replace('+', '').replace(',', '').strip()
    # A non-numeric string yields NaN (like non-string input) instead of
    # raising and aborting the whole .apply().
    try:
        return int(digits)
    except ValueError:
        return np.nan

# Replace the raw 'Installs' strings with their parsed numeric values.
raw_installs = subset['Installs']
subset['Installs'] = raw_installs.apply(parse_installs)

# Show which columns remain after the cleaning steps.
print("Columnas actuales del DataFrame:")
print(list(subset.columns))

Columnas actuales del DataFrame:
['Category', 'Rating', 'Reviews', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Days_Since_Update', 'Size_MB']


Guardado de subconjunto de datos completamente limpio y listo para usar

In [33]:
# Save the cleaned subset to CSV (without the index column).
subset.to_csv(CLEANED_SUBSET_PATH, index=False, encoding='utf-8')

# Print the shape first and leave head() as the last expression: a notebook
# only renders the value of a cell's final expression, so a head() call in
# the middle of the cell is silently discarded.
print(subset.shape)
subset.head()

(5200, 10)
