# Modelado
Aquí modelaremos el nombre del producto (`producto_nombre`) a partir de las características de cada celular.

In [1]:
# dependencies
import numpy  as np
import pandas as pd
from   sklearn.impute          import SimpleImputer
from   sklearn.preprocessing   import OneHotEncoder, StandardScaler
# from   sklearn.model_selection import train_test_split
from   sklearn.pipeline        import Pipeline
from   sklearn.tree            import DecisionTreeClassifier

In [2]:
# Location of the processed phones CSV, relative to the working directory.
# NOTE(review): the file name reads 'procesasdos' — possibly a typo for
# 'procesados'; confirm against the file that actually exists on disk.
path = 'work/data/processed/celulares_procesasdos.csv'

In [3]:
# Load the processed dataset from `path` into a DataFrame.
df_inicio = pd.read_csv(path)

In [4]:
# Global modelling configuration.

# Target variable.
columna_objetivo = 'producto_nombre'

# Columns that are dropped before modelling.
columnas_ignorar = set(['color', 'pantalla'])

# Numeric predictors.
columnas_numericas = [
    'peso',
    'camara_trasera',
    'camara_frontal',
    'ram',
    'memoria',
    'precio',
]

# Categorical predictors.
columnas_categoricas = [
    'marca',
    'procesador',
    'sistema_operativo',
    'tecnologia',
]

# All predictors: numeric columns first, then categorical ones.
columnas_predictoras = [*columnas_numericas, *columnas_categoricas]

In [5]:
# Build the modelling table in one chain: drop the ignored columns, remove
# duplicated rows, renumber the index, and put predictors before the target.
df = (
    df_inicio
    .drop(columns = columnas_ignorar)
    .drop_duplicates()
    .reset_index(drop = True)
    [columnas_predictoras + [columna_objetivo]]
)

In [6]:
# Display the cleaned modelling table.
df

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g,galaxy z fold2
1,0.272,12,12,4,512,33999,apple,apple,ios,4glte,iphone 11 pro max
2,0.252,12,12,4,512,31279,apple,apple,ios,4glte,iphone 11 pro
3,0.302,12,10,8,256,30599,samsung,qualcomm,android,4g,galaxy note 20 ultra
4,0.183,12,10,8,256,29699,samsung,qualcomm,android,4glte,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
77,0.190,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
79,0.176,13,8,2,32,3299,huawei,mediatek,android,4glte,honor 8a


## Train and test split
Se usarán todos los datos para el modelado y se reportará el error de validación cruzada (cross-validation).

In [7]:
# Target kept as a one-column DataFrame (double brackets), predictors are
# every other column of the modelling table.
objetivo    = df[[columna_objetivo]]
predictores = df.drop(columns = [columna_objetivo])

### Llenado de valores faltantes
En ambos casos se usará un valor representativo de la columna: el promedio para las variables numéricas y la moda (el valor más frecuente) para las variables categóricas.

In [8]:
def feature_engineer_numeric(dataframe, columnas_numericas):
    """Impute and scale the numeric columns of ``dataframe``.

    ``np.nan`` values in ``columnas_numericas`` are replaced with the column
    mean (``SimpleImputer(strategy='mean')``) and the imputed columns are then
    passed through ``StandardScaler``. Both transformers are fitted on the
    same ``dataframe`` they transform.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table. It is not modified.
    columnas_numericas : list of str
        Names of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        The transformed numeric columns first, followed by the remaining
        columns of ``dataframe`` unchanged. The result keeps the index of
        ``dataframe``.
    """
    numericas = dataframe[columnas_numericas]
    # imputation: NaN -> column mean
    imputador_numeric = SimpleImputer(missing_values = np.nan, strategy = 'mean').fit(numericas)
    df_imputed = imputador_numeric.transform(numericas)
    # scaling
    estandarizador = StandardScaler().fit(df_imputed)
    df_standarizado = estandarizador.transform(df_imputed)
    # Re-attach column names. The caller's index is reused on purpose:
    # pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would
    # misalign rows (and inject NaN) whenever `dataframe` is not 0..n-1 indexed.
    df_nombres = pd.DataFrame(df_standarizado, columns = columnas_numericas, index = dataframe.index)
    df_salida  = pd.concat([df_nombres, dataframe.drop(columns = columnas_numericas)], axis = 1)
    return df_salida


In [9]:
# feature_engineer_numeric(df, columnas_numericas)
# df.drop(columns = columnas_numericas)

In [10]:
def feature_engineer_categoric(dataframe, columnas_categoricas):
    """Impute and one-hot encode the categorical columns of ``dataframe``.

    Missing entries in ``columnas_categoricas`` are replaced with the most
    frequent value of their column, and the imputed columns are then one-hot
    encoded. Both transformers are fitted on the same ``dataframe`` they
    transform.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table. It is not modified.
    columnas_categoricas : list of str
        Names of the categorical columns to transform.

    Returns
    -------
    pandas.DataFrame
        The columns of ``dataframe`` other than ``columnas_categoricas``,
        followed by the dense one-hot columns. The encoder is fitted on a
        plain array, so the new columns carry the encoder's positional names
        (``x0_<category>``, ``x1_<category>``, ...). The result keeps the
        index of ``dataframe``.
    """
    # Normalise every missing marker (None or NaN) to np.nan. Missing cells
    # read by pandas arrive as NaN, which `missing_values=None` would not
    # treat as missing.
    categoricas = dataframe[columnas_categoricas].astype(object)
    categoricas = categoricas.where(categoricas.notna(), np.nan)
    # imputation: missing -> most frequent value of the column
    imputador_categoric = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent').fit(categoricas)
    df_imputed = imputador_categoric.transform(categoricas)
    # One-hot encoding. The encoder's default output is a sparse matrix in
    # every scikit-learn release; densifying it here avoids the `sparse=` /
    # `sparse_output=` keyword, whose name depends on the installed version.
    encoder = OneHotEncoder(handle_unknown = 'ignore').fit(df_imputed)
    df_ohe = encoder.transform(df_imputed).toarray()
    # Column names: `get_feature_names` was replaced by `get_feature_names_out`
    # in newer scikit-learn; use whichever the installed version offers.
    if hasattr(encoder, 'get_feature_names_out'):
        columnas_nombres = encoder.get_feature_names_out()
    else:
        columnas_nombres = encoder.get_feature_names()
    # Reuse the caller's index so the axis=1 concat below pairs rows correctly
    # even when `dataframe` is not 0..n-1 indexed.
    df_nombres = pd.DataFrame(df_ohe, columns = columnas_nombres, index = dataframe.index)
    # gather results
    df_salida = pd.concat([dataframe.drop(columns = columnas_categoricas), df_nombres], axis = 1)
    return df_salida

In [11]:
# feature_engineer_categoric(df, columnas_categoricas)

In [12]:
def feature_engineer(dataframe, columnas_numericas, columnas_categoricas):
    """Apply the numeric step and then the categorical step to ``dataframe``.

    The output of ``feature_engineer_numeric`` is fed straight into
    ``feature_engineer_categoric``; the result of the latter is returned.
    """
    return feature_engineer_categoric(
        feature_engineer_numeric(dataframe, columnas_numericas),
        columnas_categoricas,
    )

In [15]:
# Run the numeric + categorical feature engineering over the predictors.
predictores_transformed = feature_engineer(predictores, columnas_numericas, columnas_categoricas)

In [44]:
def ohe_objetivo(dataframe):
    """One-hot encode every column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose columns are all encoded (in this notebook, the
        one-column target frame). It is not modified.

    Returns
    -------
    pandas.DataFrame
        Dense one-hot matrix with a fresh 0..n-1 index and the encoder's
        positional column names (``x0_<category>``, ...).
    """
    # Fit on a plain object array rather than on the DataFrame: newer
    # scikit-learn derives output names from the DataFrame's column labels,
    # which would replace the positional `x0_` prefix used so far.
    valores = np.asarray(dataframe, dtype = object)
    # The encoder's default output is sparse in every scikit-learn release;
    # densifying it here avoids the version-dependent `sparse=` /
    # `sparse_output=` keyword.
    encoder = OneHotEncoder(handle_unknown = 'ignore').fit(valores)
    df_ohe  = encoder.transform(valores).toarray()
    # `get_feature_names` was replaced by `get_feature_names_out` in newer
    # scikit-learn; use whichever the installed version offers.
    if hasattr(encoder, 'get_feature_names_out'):
        columnas_nombres = encoder.get_feature_names_out()
    else:
        columnas_nombres = encoder.get_feature_names()
    # format
    df_nombres = pd.DataFrame(df_ohe, columns = columnas_nombres)
    return df_nombres

In [45]:
# ohe_objetivo(objetivo)
# OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(objetivo).transform(objetivo)

Unnamed: 0,x0_10l t770b,x0_galaxy a11,x0_galaxy a20s,x0_galaxy a21s,x0_galaxy a30s,x0_galaxy a31,x0_galaxy a51,x0_galaxy a71,x0_galaxy j8,x0_galaxy note 10,...,x0_moto g8 plus,x0_moto g8 power,x0_moto g9 play,x0_moto one action,x0_moto one hyper,x0_moto one vision,x0_moto one zoom,x0_motorola edge,x0_nokia 5.1 plus,x0_redmi note 9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creación de un pipeline  

Para facilitarnos el trabajo, crearé un pipeline

In [None]:
# Select the categorical and the numeric predictor columns.
# NOTE(review): a cell only renders its last expression, so the categorical
# selection is evaluated but never shown. The saved output under this cell is
# an array of column names, which neither expression would produce —
# presumably stale; re-run to confirm.
df[columnas_categoricas]
df[columnas_numericas]

array(['marca', 'peso', 'camara_trasera', 'camara_frontal', 'procesador',
       'ram', 'memoria', 'sistema_operativo', 'precio', 'tecnologia'],
      dtype=object)

In [37]:
# Display the modelling table again.
# NOTE(review): the saved output lists the categorical columns first, unlike
# the earlier preview of `df` — looks like it comes from a run with a
# different column order; re-run from a fresh kernel to confirm.
df

Unnamed: 0,marca,procesador,sistema_operativo,tecnologia,peso,camara_trasera,camara_frontal,ram,memoria,precio,producto_nombre
0,samsung,qualcomm,android,5g,0.282,12,10,12,256,46799,galaxy z fold2
1,apple,apple,ios,4glte,0.272,12,12,4,512,33999,iphone 11 pro max
2,apple,apple,ios,4glte,0.252,12,12,4,512,31279,iphone 11 pro
3,samsung,qualcomm,android,4g,0.302,12,10,8,256,30599,galaxy note 20 ultra
4,samsung,qualcomm,android,4glte,0.183,12,10,8,256,29699,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,motorola,qualcomm,android,4glte,0.149,13,8,2,32,2990,moto e6 plus
77,motorola,qualcomm,android,4glte,0.190,13,8,1,16,2499,moto e6 play
78,motorola,mediatek,android,4glte,0.149,13,8,2,32,2990,moto e6 plus
79,huawei,mediatek,android,4glte,0.176,13,8,2,32,3299,honor 8a
