# Modelado
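Aquí prepararemos los datos de celulares y construiremos un modelo para predecir `producto_nombre` a partir de las características de cada equipo.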

In [30]:
# dependencies
import numpy  as np
import pandas as pd
from   sklearn.impute          import SimpleImputer
from   sklearn.preprocessing   import OneHotEncoder, StandardScaler
# from   sklearn.model_selection import train_test_split
from   sklearn.pipeline        import Pipeline
from   sklearn.tree            import DecisionTreeClassifier

In [2]:
# Location of the processed phones dataset, relative to the working directory.
# NOTE(review): the file name reads "procesasdos" (possibly a typo of "procesados") —
# confirm it matches the file on disk before renaming anything.
path = 'work/data/processed/celulares_procesasdos.csv'

In [3]:
# Load the CSV found at `path` into the raw starting frame.
df_inicio = pd.read_csv(path)

In [38]:
# Global modelling configuration.

# Columns that are left out of the modelling frame.
columnas_ignorar = {'color', 'pantalla'}

# Target variable.
columna_objetivo = 'producto_nombre'

# Categorical predictors.
columnas_categoricas = [
    'marca',
    'procesador',
    'sistema_operativo',
    'tecnologia',
]

# Numeric predictors.
columnas_numericas = [
    'peso',
    'camara_trasera',
    'camara_frontal',
    'ram',
    'memoria',
    'precio',
]

# Every predictor: numeric columns first, categorical columns after.
columnas_predictoras = [*columnas_numericas, *columnas_categoricas]

In [39]:
# Build the modelling frame: drop the ignored columns, remove duplicate rows,
# renumber the index from zero, then keep the predictors followed by the target.
df = (
    df_inicio
    .drop(columns = columnas_ignorar)
    .drop_duplicates()
    .reset_index(drop = True)
)[columnas_predictoras + [columna_objetivo]]

In [40]:
# Show the modelling frame (predictor columns followed by the target column).
df

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g,galaxy z fold2
1,0.272,12,12,4,512,33999,apple,apple,ios,4glte,iphone 11 pro max
2,0.252,12,12,4,512,31279,apple,apple,ios,4glte,iphone 11 pro
3,0.302,12,10,8,256,30599,samsung,qualcomm,android,4g,galaxy note 20 ultra
4,0.183,12,10,8,256,29699,samsung,qualcomm,android,4glte,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
77,0.190,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
79,0.176,13,8,2,32,3299,huawei,mediatek,android,4glte,honor 8a


## Train and test split
Se ocuparán todos los datos para el modelado y se reportará el error de validación cruzada (cross-validation).

In [29]:
# Separate the target from the predictors.
# The double brackets keep the target as a one-column DataFrame rather than a Series.
objetivo    = df[[columna_objetivo]]
predictores = df.drop(columns = [columna_objetivo])

Unnamed: 0,producto_nombre
0,galaxy z fold2
1,iphone 11 pro max
2,iphone 11 pro
3,galaxy note 20 ultra
4,galaxy z flip
...,...
76,moto e6 plus
77,moto e6 play
78,moto e6 plus
79,honor 8a


### Llenado de valores faltantes
En ambos casos se usará un valor representativo de la columna: el promedio para las variables numéricas y la moda (el valor más frecuente) para las variables categóricas.

In [36]:
# Mean imputer for the numeric columns, fitted on the modelling frame.
imputador_numeric = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputador_numeric.fit(df[columnas_numericas])
# imputador_numeric.transform(df[columnas_numericas])

# Standardisation of the continuous variables, fitted on the predictors.
estandarizador = StandardScaler()
estandarizador = estandarizador.fit(predictores[columnas_numericas])
# estandarizador.transform(df[columnas_numericas])

In [103]:
def feature_engineer_numeric(dataframe, columnas_numericas):
    """Impute and standardise the numeric columns of ``dataframe``.

    Missing values (``np.nan``) in ``columnas_numericas`` are replaced by the
    column mean and the imputed columns are then standardised with
    ``StandardScaler``. Both transformers are fitted on ``dataframe`` itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is not modified.
    columnas_numericas : list of str
        Names of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        The transformed numeric columns first, followed by the remaining
        columns of ``dataframe`` untouched, on the same row index as the input.
    """
    numericas = dataframe[columnas_numericas]
    # imputation
    imputador_numeric = SimpleImputer(missing_values = np.nan, strategy = 'mean').fit(numericas)
    df_imputed = imputador_numeric.transform(numericas)
    # standardisation
    estandarizador = StandardScaler().fit(df_imputed)
    df_standarizado = estandarizador.transform(df_imputed)
    # The transformers return a bare array. Rebuild the frame on the caller's
    # index so the axis=1 concat below lines rows up one-to-one; a fresh
    # 0..n-1 index would misalign (and NaN-pad) any input whose index is not
    # the default range, e.g. a filtered frame.
    df_nombres = pd.DataFrame(df_standarizado, columns = columnas_numericas, index = dataframe.index)
    df_salida  = pd.concat([df_nombres, dataframe.drop(columns = columnas_numericas)], axis = 1)
    #
    return df_salida


In [104]:
# feature_engineer_numeric(df, columnas_numericas)
# df.drop(columns = columnas_numericas)

In [79]:
#

In [89]:
# Categorical variables.
# Imputation is left disabled in this cell:
# imputador_categoric = SimpleImputer(missing_values = None, strategy = 'most_frequent')
# imputador_categoric.fit(df[columnas_categoricas]).transform(df[columnas_categoricas])

# One-hot encoder fitted on the categorical columns; dense output, and
# categories unseen during fit are ignored at transform time.
encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)
encoder = encoder.fit(df[columnas_categoricas])
# encoder.transform(df[columnas_categoricas])

# Names of the generated indicator columns.
encoder.get_feature_names()

array(['x0_apple', 'x0_huawei', 'x0_motorola', 'x0_nokia', 'x0_samsung',
       'x0_tcl', 'x0_xiaomi', 'x1_apple', 'x1_arm', 'x1_mediatek',
       'x1_qualcomm', 'x1_samsung', 'x2_android', 'x2_ios', 'x3_4g',
       'x3_4glte', 'x3_5g'], dtype=object)

In [105]:
def feature_engineer_categoric(dataframe, columnas_categoricas):
    """Impute and one-hot encode the categorical columns of ``dataframe``.

    Missing values in ``columnas_categoricas`` are replaced by the most
    frequent value of each column, and the imputed columns are expanded into
    one indicator column per category. Both transformers are fitted on
    ``dataframe`` itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is not modified.
    columnas_categoricas : list of str
        Names of the categorical columns to transform.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``dataframe`` untouched, followed by the
        indicator columns (named by ``encoder.get_feature_names()``), on the
        same row index as the input.
    """
    categoricas = dataframe[columnas_categoricas]
    # pandas marks missing entries with NaN (a literal None may also appear in
    # object columns). Normalise both to NaN and impute on NaN, so real gaps
    # are filled; matching on None alone never matches the NaN that pandas
    # produces, which also keeps this consistent with the numeric imputer.
    categoricas = categoricas.where(categoricas.notna(), np.nan)
    # imputation
    imputador_categoric = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent').fit(categoricas)
    df_imputed = imputador_categoric.transform(categoricas)
    # one-hot encoding
    encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(df_imputed)
    df_ohe = encoder.transform(df_imputed)
    # The encoder returns a bare array. Rebuild the frame on the caller's index
    # so the axis=1 concat below lines rows up one-to-one; a fresh 0..n-1 index
    # would misalign (and NaN-pad) any input whose index is not the default range.
    columnas_nombres = encoder.get_feature_names()
    df_nombres = pd.DataFrame(df_ohe, columns = columnas_nombres, index = dataframe.index)
    # gather results
    df_salida = pd.concat([dataframe.drop(columns = columnas_categoricas), df_nombres], axis = 1)
    #
    return df_salida


In [100]:
# feature_engineer_categoric(df, columnas_categoricas)

In [108]:
def feature_engineer(dataframe, columnas_numericas, columnas_categoricas):
    """Apply the numeric and then the categorical feature engineering.

    ``dataframe`` is first passed through ``feature_engineer_numeric`` with
    ``columnas_numericas``; that result is then passed through
    ``feature_engineer_categoric`` with ``columnas_categoricas``, and the
    outcome of the second step is returned.
    """
    return feature_engineer_categoric(
        feature_engineer_numeric(dataframe, columnas_numericas),
        columnas_categoricas,
    )

In [109]:
# Run the full feature engineering on the modelling frame and display the result.
feature_engineer(df, columnas_numericas, columnas_categoricas)

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,producto_nombre,x0_apple,x0_huawei,x0_motorola,...,x1_apple,x1_arm,x1_mediatek,x1_qualcomm,x1_samsung,x2_android,x2_ios,x3_4g,x3_4glte,x3_5g
0,0.983220,-0.581741,-0.542740,3.385583,0.933778,3.588932,galaxy z fold2,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.807257,-0.581741,-0.332396,-0.390420,3.106447,2.221551,iphone 11 pro max,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.455331,-0.581741,-0.332396,-0.390420,3.106447,1.930983,iphone 11 pro,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.335146,-0.581741,-0.542740,1.497582,0.933778,1.858341,galaxy note 20 ultra,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.758813,-0.581741,-0.542740,1.497582,0.933778,1.762197,galaxy z flip,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,-1.357087,-0.514038,-0.753084,-1.334421,-0.967307,-1.091034,moto e6 plus,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
77,-0.635639,-0.514038,-0.753084,-1.806421,-1.103099,-1.143486,moto e6 play,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
78,-1.357087,-0.514038,-0.753084,-1.334421,-0.967307,-1.091034,moto e6 plus,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
79,-0.881987,-0.514038,-0.753084,-1.334421,-0.967307,-1.058025,honor 8a,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Creación de un pipeline  

Para facilitarnos el trabajo, crearé un pipeline

In [None]:
# Inspect the categorical and the numeric column subsets.
# NOTE(review): a cell renders only its last expression, so the categorical
# subset on the first line is evaluated but not displayed.
df[columnas_categoricas]
df[columnas_numericas]

array(['marca', 'peso', 'camara_trasera', 'camara_frontal', 'procesador',
       'ram', 'memoria', 'sistema_operativo', 'precio', 'tecnologia'],
      dtype=object)

In [37]:
# Display the modelling frame again.
# NOTE(review): the saved output lists the categorical columns first, unlike the
# column order built earlier in the notebook — looks like stale output; re-run to confirm.
df

Unnamed: 0,marca,procesador,sistema_operativo,tecnologia,peso,camara_trasera,camara_frontal,ram,memoria,precio,producto_nombre
0,samsung,qualcomm,android,5g,0.282,12,10,12,256,46799,galaxy z fold2
1,apple,apple,ios,4glte,0.272,12,12,4,512,33999,iphone 11 pro max
2,apple,apple,ios,4glte,0.252,12,12,4,512,31279,iphone 11 pro
3,samsung,qualcomm,android,4g,0.302,12,10,8,256,30599,galaxy note 20 ultra
4,samsung,qualcomm,android,4glte,0.183,12,10,8,256,29699,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,motorola,qualcomm,android,4glte,0.149,13,8,2,32,2990,moto e6 plus
77,motorola,qualcomm,android,4glte,0.190,13,8,1,16,2499,moto e6 play
78,motorola,mediatek,android,4glte,0.149,13,8,2,32,2990,moto e6 plus
79,huawei,mediatek,android,4glte,0.176,13,8,2,32,3299,honor 8a
