# Procesamiento de los datos

Limpieza y transformación de los datos; la salida queda lista para el modelado.

In [1]:
# settings
from itertools import chain
from pathlib import Path

import pandas as pd

In [2]:
# Input: remote CSV with the raw data.
path_input = "https://raw.githubusercontent.com/yoselalberto/ia_proyecto_final/main/data/celulares.csv"

# Output: processed data.
path_salida = 'work/data/processed/celulares_procesados.csv'

# Output: the same data formatted for on-screen display.
path_salida_formato = 'work/data/processed/celulares_formato.csv'

In [3]:
# more dependencies
import janitor

In [4]:
# fix a formatting error in the values of every row (stray trailing commas)
def replace_string(dataframe, string = ','):
  """Return a copy of ``dataframe`` with ``string`` removed from every value.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame whose columns all support the ``.str`` accessor.
  string : str, default ','
      Literal text to remove from every value.

  Returns
  -------
  pandas.DataFrame
      New frame; the input frame is not modified.
  """
  df = dataframe.copy()
  # column by column
  for columna in df.columns.values:
    # regex=False: `string` is always literal text. The default for `regex`
    # differs between pandas versions, so it is pinned here.
    df[columna] = df[columna].str.replace(string, '', regex = False)
  return df
# lowercase all dataframe
def df_lowercase(dataframe):
  """Return a copy of ``dataframe`` with every column lowercased.

  Each column is processed through the ``.str`` accessor, so every column
  must hold text. The input frame is not modified.
  """
  lowered = dataframe.copy()
  for name in lowered.columns.values:
    column = lowered[name]
    lowered[name] = column.str.lower()
  return lowered
# coerce columns to numeric
def df_numeric(dataframe, columns):
  """Return a copy of ``dataframe`` with ``columns`` converted to numbers.

  Values that cannot be parsed become NaN (``errors='coerce'``). The input
  frame is not modified.
  """
  converted = dataframe[columns].apply(pd.to_numeric, errors='coerce')
  result = dataframe.copy()
  result[columns] = converted
  return result
# agrupo las funciones anteriores
def df_clean(dataframe, string, columns_to_numeric):
  """Run the full cleaning sequence and return the resulting frame.

  Steps, in order: ``replace_string`` (remove ``string`` from every value),
  ``df_lowercase`` (lowercase every column), ``df_numeric`` (coerce
  ``columns_to_numeric`` to numbers).

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame to clean.
  string : str
      Text passed to ``replace_string`` for removal.
  columns_to_numeric : list
      Columns passed to ``df_numeric``.
  """
  # No defensive copy here: every helper already works on its own copy.
  without_string = replace_string(dataframe, string)
  lowered = df_lowercase(without_string)
  return df_numeric(lowered, columns = columns_to_numeric)
# limpieza parcial
def df_clean_parcial(dataframe, string, columns_to_numeric):
  """Run the cleaning sequence without the lowercasing step.

  Steps, in order: ``replace_string`` (remove ``string`` from every value),
  then ``df_numeric`` (coerce ``columns_to_numeric`` to numbers). Text keeps
  its original capitalisation.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame to clean.
  string : str
      Text passed to ``replace_string`` for removal.
  columns_to_numeric : list
      Columns passed to ``df_numeric``.
  """
  # No defensive copy here: every helper already works on its own copy.
  without_string = replace_string(dataframe, string)
  return df_numeric(without_string, columns = columns_to_numeric)

In [5]:
# los pasos los meto en funciones
def clean_tecnologia(dataframe):
    """Replace each ``tecnologia`` value with the single label paired to it.

    The replacement is a left merge against a fixed lookup table, so values
    missing from the table end up as NaN. The original ``tecnologia`` column
    is dropped and the looked-up one takes its name, which places it last in
    the returned frame. The input frame is not modified.
    """
    # support table: raw value -> label that replaces it
    mejor_por_tecnologia = {
        '2g/3g/4g/4glte/5g': '5g',
        '4glte': '4glte',
        '4g/gsm': '4g',
        '2g/3g/4g/4glte/gsm': '4glte',
        '4g': '4g',
        '5g': '5g',
        '3g/4g/gsm': '4g',
        '4g/4glte/gsm/lte': '4glte',
        '2g/3g/lte': '4glte',
        '3g/lte': '4glte',
    }
    lookup = pd.DataFrame({
        'tecnologia': list(mejor_por_tecnologia.keys()),
        'tecnologia_mejor': list(mejor_por_tecnologia.values()),
    })
    # substitution: merge, drop the raw column, give the new one its name
    merged = dataframe.copy().merge(lookup, how = "left")
    without_raw = merged.drop(columns = ['tecnologia'])
    return without_raw.rename(columns = {'tecnologia_mejor': 'tecnologia'})
    # procesador
def clean_procesador(dataframe):
    """Reduce ``procesador`` to its first word with all digits removed.

    For example ``'Qualcomm1'`` becomes ``'Qualcomm'``. The input frame is
    not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a text column named ``procesador``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the ``procesador`` column rewritten.
    """
    df = dataframe.copy()
    first_word = df['procesador'].str.split().str.get(0)
    # Raw string avoids the invalid "\d" escape. regex=True is required
    # because pandas >= 2.0 treats the pattern as literal text by default,
    # which would leave the digits in place.
    df['procesador'] = first_word.str.replace(r'\d+', '', regex = True)
    return df
    # clean operative systems
def clean_os(dataframe):
    """Reduce ``sistema_operativo`` to ``'android'`` or ``'ios'``.

    The first match of the pattern ``(android|ios)`` is kept; values without
    a match become NaN. Matching is case-sensitive. The input frame is not
    modified.
    """
    familia = dataframe['sistema_operativo'].str.extract(r'(android|ios)', expand = False)
    return dataframe.assign(sistema_operativo = familia)
    # chain steps
def df_procesamiento(dataframe):
    """Apply the three column transformations in sequence.

    Order: ``clean_tecnologia``, then ``clean_procesador``, then
    ``clean_os``. Returns the frame produced by the last step.
    """
    return (
        dataframe.copy()
        .pipe(clean_tecnologia)
        .pipe(clean_procesador)
        .pipe(clean_os)
    )

In [6]:
# Load the CSV with every column read as text, then normalise the headers.
# clean_names() is presumably registered on DataFrame by the `janitor` import.
df_raw = (
    pd.read_csv(path_input, dtype = 'str')
    .clean_names()
)

In [7]:
# Display the frame as loaded (every column was read as str).
df_raw

Unnamed: 0,marca,nombre_del_producto,color,peso,pantalla,camara_trasera,camara_frontal,procesador,ram,memoria_interna,sistema_operativo,tecnologia,precio
0,Samsung,Galaxy Z Fold2,Negro,"0.282,","AMOLED,",12,10,"Qualcomm1,",12,256,"Android 10,","2G/3G/4G/4GLTE/5G,",46799
1,Samsung,Galaxy Z Fold2,Bronce,"0.282,","AMOLED,",12,10,"Qualcomm1,",12,256,"Android 10,","2G/3G/4G/4GLTE/5G,",46799
2,Apple,iPhone 11 Pro Max,Verde Medianoche,"0.272,","OLED,",12,12,"Apple1,",4,512,"Apple iOS 13,","4GLTE,",33999
3,Apple,iPhone 11 Pro Max,Gris Espacial,"0.272,","OLED,",12,12,"Apple1,",4,512,"Apple iOS 13,","4GLTE,",33999
4,Apple,"iPhone 11 Pro Max,",Plata,"0.272,","OLED,",12,12,"Apple1,",4,512,"Apple iOS 13,","4GLTE,",33999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,"Motorola,","Moto G8 Play,",Gris,"0.188,","TFT-LCD,",8,13,"MediaTek2,",2,32,"Android 9,","4GLTE,",3490
131,"Motorola,","Moto E6 Plus,",Azul,"0.149,","LCD,",13,8,"MediaTek2,",2,32,"Android 8,","4GLTE,",2990
132,"Huawei,","Honor 8A,",Verde,"0.176,","LCD,",13,8,"MediaTek2,",2,32,"Android 9,","4GLTE,",3299
133,"Motorola,","Moto E6 Play,",Azul,"0.190,","IPS-LCD,",13,5,"Qualcomm3,",1,16,"Android 8,","4GLTE,",2499


In [8]:
# Rename two columns: old name -> new name.
nombres = {
    "nombre_del_producto": 'producto_nombre',
    'memoria_interna': 'memoria',
}
df_inicio = df_raw.rename(columns = nombres)

In [9]:
# Initial cleaning: these columns are coerced to numbers by df_clean.
columns_numeric = ['peso', 'camara_trasera', 'camara_frontal', 'ram', 'memoria', 'precio']
# Remove the stray commas, clean, then drop repeated rows and renumber the index.
df_limpio = (
    df_clean(df_inicio, ',', columns_numeric)
    .drop_duplicates()
    .reset_index(drop = True)
)

In [10]:
# Display the cleaned frame (commas removed, text lowercased, numeric columns coerced, duplicates dropped).
df_limpio

Unnamed: 0,marca,producto_nombre,color,peso,pantalla,camara_trasera,camara_frontal,procesador,ram,memoria,sistema_operativo,tecnologia,precio
0,samsung,galaxy z fold2,negro,0.282,amoled,12,10,qualcomm1,12,256,android 10,2g/3g/4g/4glte/5g,46799
1,samsung,galaxy z fold2,bronce,0.282,amoled,12,10,qualcomm1,12,256,android 10,2g/3g/4g/4glte/5g,46799
2,apple,iphone 11 pro max,verde medianoche,0.272,oled,12,12,apple1,4,512,apple ios 13,4glte,33999
3,apple,iphone 11 pro max,gris espacial,0.272,oled,12,12,apple1,4,512,apple ios 13,4glte,33999
4,apple,iphone 11 pro max,plata,0.272,oled,12,12,apple1,4,512,apple ios 13,4glte,33999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,motorola,moto e6 play,negro,0.190,ips-lcd,13,8,qualcomm3,1,16,android 8,4glte,2499
123,motorola,moto e6 plus,azul,0.149,lcd,13,8,mediatek2,2,32,android 8,4glte,2990
124,huawei,honor 8a,verde,0.176,lcd,13,8,mediatek2,2,32,android 9,4glte,3299
125,motorola,moto e6 play,azul,0.190,ips-lcd,13,5,qualcomm3,1,16,android 8,4glte,2499


In [11]:
# Column transformations (tecnologia, procesador, sistema_operativo).
df_procesado = df_limpio.pipe(df_procesamiento)

In [12]:
# Save the processed data. The output folder is created first so that a run
# on a fresh checkout does not fail because the directory is missing.
Path(path_salida).parent.mkdir(parents = True, exist_ok = True)
df_procesado.to_csv(path_salida, index = False)

## Recomendación a mostrar

El siguiente procesamiento da formato al dataframe que se mostrará como recomendación.

In [13]:
# Partial cleaning for display: remove the stray commas and coerce the numeric
# columns, but keep the original capitalisation (no lowercasing step).
df_limpio_parcial_inicio = df_clean_parcial(df_inicio, ',', columns_numeric).drop_duplicates().reset_index(drop = True)
df_limpio_parcial = clean_procesador(df_limpio_parcial_inicio)
# Display name for each column; the dict order is the output column order.
nombres_formato = {
    'producto_nombre': 'Nombre',
    'marca': 'Marca',
    'color': 'Color',
    'sistema_operativo': 'Sistema operativo',
    'memoria': 'Memoria',
    'ram': 'Ram',
    'precio': 'Precio',
    'camara_trasera': 'Camara Trasera',
    'camara_frontal': 'Camara Frontal',
    'pantalla': 'Pantalla',
    'tecnologia': 'Tecnologia',
    'procesador': 'Procesador',
    'peso': 'Peso',
}
# Select, rename by name (not by position) and scale Peso by 1000
# (presumably kg -> g; confirm). rename/assign build a new frame, so nothing
# is written into a slice of df_limpio_parcial (no SettingWithCopyWarning).
df_limpio_parcial_orden = (
    df_limpio_parcial[list(nombres_formato)]
    .rename(columns = nombres_formato)
    .assign(Peso = lambda df: df['Peso'] * 1000)
)

In [14]:
# Add `producto_nombre`: a lowercase copy of the product name.
# .assign() rebinds the name to a new frame instead of writing a column into
# a frame that may be a slice of df_limpio_parcial (avoids
# SettingWithCopyWarning) and gives the same result when the cell is re-run.
df_limpio_parcial_orden = df_limpio_parcial_orden.assign(
    producto_nombre = df_limpio_parcial_orden['Nombre'].str.lower()
)

In [15]:
# Display the formatted frame that is written to path_salida_formato below.
df_limpio_parcial_orden

Unnamed: 0,Nombre,Marca,Color,Sistema operativo,Memoria,Ram,Precio,Camara Trasera,Camara Frontal,Pantalla,Tecnologia,Procesador,Peso,producto_nombre
0,Galaxy Z Fold2,Samsung,Negro,Android 10,256,12,46799,12,10,AMOLED,2G/3G/4G/4GLTE/5G,Qualcomm,282.0,galaxy z fold2
1,Galaxy Z Fold2,Samsung,Bronce,Android 10,256,12,46799,12,10,AMOLED,2G/3G/4G/4GLTE/5G,Qualcomm,282.0,galaxy z fold2
2,iPhone 11 Pro Max,Apple,Verde Medianoche,Apple iOS 13,512,4,33999,12,12,OLED,4GLTE,Apple,272.0,iphone 11 pro max
3,iPhone 11 Pro Max,Apple,Gris Espacial,Apple iOS 13,512,4,33999,12,12,OLED,4GLTE,Apple,272.0,iphone 11 pro max
4,iPhone 11 Pro Max,Apple,Plata,Apple iOS 13,512,4,33999,12,12,OLED,4GLTE,Apple,272.0,iphone 11 pro max
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,Moto E6 Play,Motorola,Negro,Android 8,16,1,2499,13,8,IPS-LCD,4GLTE,Qualcomm,190.0,moto e6 play
123,Moto E6 Plus,Motorola,Azul,Android 8,32,2,2990,13,8,LCD,4GLTE,MediaTek,149.0,moto e6 plus
124,Honor 8A,Huawei,Verde,Android 9,32,2,3299,13,8,LCD,4GLTE,MediaTek,176.0,honor 8a
125,Moto E6 Play,Motorola,Azul,Android 8,16,1,2499,13,5,IPS-LCD,4GLTE,Qualcomm,190.0,moto e6 play


In [16]:
# Save the display-formatted data. The output folder is created first so that
# a run on a fresh checkout does not fail because the directory is missing.
Path(path_salida_formato).parent.mkdir(parents = True, exist_ok = True)
df_limpio_parcial_orden.to_csv(path_salida_formato, index = False)