# Modelado
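En este notebook se prepara el conjunto de datos de celulares para el modelado: se cargan y limpian los datos, se separan en entrenamiento y prueba, se definen la imputación, la estandarización y la codificación de variables, y finalmente se arma un pipeline.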

In [1]:
# dependencies
import numpy  as np
import pandas as pd
from   sklearn.compose         import ColumnTransformer
from   sklearn.impute          import SimpleImputer
from   sklearn.model_selection import train_test_split
from   sklearn.pipeline        import Pipeline
from   sklearn.preprocessing   import OneHotEncoder, StandardScaler

In [2]:
# Relative path of the processed phones CSV that the next cell loads.
# NOTE(review): "procesasdos" looks like a misspelling of "procesados"; confirm the
# actual file name on disk before renaming anything.
path = 'work/data/processed/celulares_procesasdos.csv'

In [3]:
# Read the processed CSV into the raw starting DataFrame.
df_inicio = pd.read_csv(filepath_or_buffer = path)

In [11]:
# Global configuration shared by the modelling cells.

# Columns that are dropped before modelling.
columnas_ignorar = {'color', 'pantalla'}

# Name of the target column.
columna_objetivo = 'producto_nombre'

# Categorical feature columns.
columnas_categoricas = [
    'marca',
    'procesador',
    'sistema_operativo',
    'tecnologia',
]

# Numeric feature columns.
columnas_numericas = [
    'peso',
    'camara_trasera',
    'camara_frontal',
    'ram',
    'memoria',
    'precio',
]

In [16]:
# Build the modelling frame from the raw data:
#   1. drop the ignored columns,
#   2. remove duplicated rows,
#   3. rebuild a fresh positional index,
#   4. order the columns as numeric features, categorical features, target.
df = (
    df_inicio
    .drop(columns = columnas_ignorar)
    .drop_duplicates()
    .reset_index(drop = True)
    [columnas_numericas + columnas_categoricas + [columna_objetivo]]
)

In [17]:
# Preview the cleaned frame built in the previous cell.
df

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g,galaxy z fold2
1,0.272,12,12,4,512,33999,apple,apple,ios,4glte,iphone 11 pro max
2,0.252,12,12,4,512,31279,apple,apple,ios,4glte,iphone 11 pro
3,0.302,12,10,8,256,30599,samsung,qualcomm,android,4g,galaxy note 20 ultra
4,0.183,12,10,8,256,29699,samsung,qualcomm,android,4glte,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
77,0.190,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
79,0.176,13,8,2,32,3299,huawei,mediatek,android,4glte,honor 8a


## Train and test split


In [19]:
# Predictors: numeric features followed by categorical features.
df_x = df[columnas_numericas + columnas_categoricas]
# Target, kept as a one-column DataFrame.
df_y = df[[columna_objetivo]]

# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size    = 0.25,
    random_state = 0,
)

In [25]:
# Inspect the held-out predictors returned by train_test_split.
x_test

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia
22,0.186,10,10,8,128,19990,samsung,qualcomm,android,4g
27,0.312,16,10,12,256,16999,samsung,samsung,android,4g
61,0.192,48,8,4,128,4999,huawei,arm,android,4glte
13,0.312,12,10,8,256,22990,samsung,qualcomm,android,4g
71,0.179,16,16,3,32,3999,samsung,qualcomm,android,4glte
74,0.188,8,13,2,32,3490,motorola,mediatek,android,4glte
30,0.186,16,10,8,128,15990,samsung,qualcomm,android,5g
55,0.188,5,16,4,32,5999,samsung,qualcomm,android,4glte
53,0.198,32,25,4,128,6490,motorola,qualcomm,android,4g
26,0.308,12,7,4,256,17599,apple,apple,ios,4glte


### Llenado de valores faltantes
En ambos casos se usará un valor representativo de cada columna: el promedio para las variables numéricas y la moda (el valor más frecuente) para las variables categóricas.

In [42]:
# Standardization of the numeric features (StandardScaler: zero mean, unit variance).
# The scaler is fitted on the training split only, so that statistics of the
# held-out rows in x_test do not leak into the preprocessing.
estandarizador = StandardScaler()
estandarizador.fit(x_train[columnas_numericas])

# Imputer for the numeric features: NaN entries are replaced by the column mean.
imputador_numeric = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [46]:
# Imputer for the categorical features: gaps are filled with the most frequent
# value (the mode) of each column.
# pandas.read_csv represents empty cells as NaN, not as None, so NaN is the marker
# that has to be matched here; with missing_values = None those cells would not be
# treated as missing.
imputador_categoric = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

In [28]:
# One-hot encoding of the categorical features.
# handle_unknown = 'ignore' makes categories unseen during fit encode as all zeros
# instead of raising; sparse = False returns a dense array.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed; adjust the argument when upgrading scikit-learn.
encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
# Fit on the categorical columns and show the encoded matrix as the cell output.
encoder.fit_transform(df[columnas_categoricas])

array([[0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

## Creación de un pipeline  

Para facilitarnos el trabajo, crearé un pipeline

In [54]:
# Every column of df except the target, as a NumPy array of column names.
columnas_predictoras = df.columns.drop(columna_objetivo).to_numpy()

In [None]:
# Build the full modelling pipeline.
#
# Numeric and categorical columns need different preprocessing (mean imputation and
# scaling cannot be applied to string columns), so each group gets its own
# sub-pipeline and a ColumnTransformer routes the columns to the right one.

# Numeric branch: fill gaps with the mean, then standardize.
tuberia_numericas = Pipeline(steps = [('imputacion_numericas', imputador_numeric),
                                      ('standarizacion', estandarizador)])

# Categorical branch: fill gaps with the mode, then one-hot encode.
tuberia_categoricas = Pipeline(steps = [('imputacion_categoricas', imputador_categoric),
                                        ('codificacion_categoricas', encoder)])

# Route each group of columns to its branch; columns not listed are dropped
# (ColumnTransformer's default remainder behaviour).
preprocesamiento = ColumnTransformer(transformers = [('numericas', tuberia_numericas, columnas_numericas),
                                                     ('categoricas', tuberia_categoricas, columnas_categoricas)])

# NOTE(review): `modelo` is not defined in any visible cell; assign the final
# estimator to it before running this cell.
tuberia = Pipeline(steps = [('preprocesamiento', preprocesamiento),
                            ('modelado', modelo)])

# Fit on the training split only, so x_test / y_test stay untouched for evaluation.
# y_train is a one-column DataFrame; select the column to pass a 1-D target.
tuberia.fit(x_train, y_train[columna_objetivo])

# TODO: transform / predict on x_test and evaluate against y_test.

In [None]:
# Scratch inspection of the categorical and numeric feature subsets.
# Only the last expression of a cell is rendered, so the result of the first
# line is evaluated and discarded.
df[columnas_categoricas]
df[columnas_numericas]

array(['marca', 'peso', 'camara_trasera', 'camara_frontal', 'procesador',
       'ram', 'memoria', 'sistema_operativo', 'precio', 'tecnologia'],
      dtype=object)

In [51]:
# Display the modelling frame.
# NOTE(review): the saved output of this cell shows a column order different from
# the one produced where df is built; it looks stale, so re-run to refresh it.
df

Unnamed: 0,marca,producto_nombre,peso,camara_trasera,camara_frontal,procesador,ram,memoria,sistema_operativo,precio,tecnologia
0,samsung,galaxy z fold2,0.282,12,10,qualcomm,12,256,android,46799,5g
1,apple,iphone 11 pro max,0.272,12,12,apple,4,512,ios,33999,4glte
2,apple,iphone 11 pro,0.252,12,12,apple,4,512,ios,31279,4glte
3,samsung,galaxy note 20 ultra,0.302,12,10,qualcomm,8,256,android,30599,4g
4,samsung,galaxy z flip,0.183,12,10,qualcomm,8,256,android,29699,4glte
...,...,...,...,...,...,...,...,...,...,...,...
76,motorola,moto e6 plus,0.149,13,8,qualcomm,2,32,android,2990,4glte
77,motorola,moto e6 play,0.190,13,8,qualcomm,1,16,android,2499,4glte
78,motorola,moto e6 plus,0.149,13,8,mediatek,2,32,android,2990,4glte
79,huawei,honor 8a,0.176,13,8,mediatek,2,32,android,3299,4glte
