# Modelado
Aquí modelaremos

In [1]:
# dependencies
import numpy  as np
import pandas as pd
from   sklearn.impute          import SimpleImputer
from   sklearn.preprocessing   import OneHotEncoder, StandardScaler
from   sklearn.pipeline        import Pipeline
from   sklearn.tree            import DecisionTreeClassifier
from   sklearn.model_selection import cross_val_score

In [2]:
# data
path = 'work/data/processed/celulares_procesasdos.csv'

In [3]:
# loading
df_inicio = pd.read_csv(path)

In [4]:
# variables globales
# estas columnas serán ignoradas durante el modelado
columnas_ignorar     = {'color', 'pantalla'}
# variable objetivo
columna_objetivo     = 'producto_nombre'
# columnas categoricas
columnas_categoricas = ['marca', 'procesador', 'sistema_operativo', 'tecnologia']
# columnas numericas
columnas_numericas   = ['peso', 'camara_trasera', 'camara_frontal', 'ram', 'memoria', 'precio']
# variables predictoras
columnas_predictoras = columnas_numericas + columnas_categoricas

In [5]:
# elimino columnas, duplicados, y reordeno las columnas
df = df_inicio.drop(columns = columnas_ignorar).drop_duplicates().reset_index(drop = True)[columnas_predictoras + [columna_objetivo]]

In [7]:
df

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g,galaxy z fold2
1,0.272,12,12,4,512,33999,apple,apple,ios,4glte,iphone 11 pro max
2,0.252,12,12,4,512,31279,apple,apple,ios,4glte,iphone 11 pro
3,0.302,12,10,8,256,30599,samsung,qualcomm,android,4g,galaxy note 20 ultra
4,0.183,12,10,8,256,29699,samsung,qualcomm,android,4glte,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
77,0.190,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
79,0.176,13,8,2,32,3299,huawei,mediatek,android,4glte,honor 8a


## Train and test split
Se ocuparán todos los datos para el modelado, se reportará el cross validation error.

In [8]:
predictores = df.drop(columns = columna_objetivo)
objetivo    = df[[columna_objetivo]]

### Llenado de valores faltantes
En ambos casos de usará el valor más frecuente: el promedio para las variables númericas, para las variables categoricas la moda.

In [9]:
def feature_engineer_numeric(dataframe, columnas_numericas):
    df = dataframe.copy()
    # imputación
    imputador_numeric   = SimpleImputer(missing_values = np.nan, strategy = 'mean').fit(df[columnas_numericas])
    df_imputed = imputador_numeric.transform(df[columnas_numericas])    
    # estandarización
    estandarizador = StandardScaler().fit(df_imputed)
    df_standarizado = estandarizador.transform(df_imputed)
    # add names
    df_nombres = pd.DataFrame(df_standarizado, columns = columnas_numericas).reset_index(drop = True)
    df_salida  = pd.concat([df_nombres, dataframe.drop(columns = columnas_numericas)], axis = 1)
    #
    return df_salida

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,tecnologia,sistema_operativo
0,0.0,0.0,-1.384117,0.0,0.0,-0.98314,,,,android


In [10]:
# feature_engineer_numeric(df, columnas_numericas)
# df.drop(columns = columnas_numericas)

In [13]:
def feature_engineer_categoric(dataframe, columnas_categoricas):
    df = dataframe.copy()
    # imputacion
    imputador_categoric = SimpleImputer(missing_values = None, strategy = 'most_frequent').fit(df[columnas_categoricas])
    df_imputed = imputador_categoric.transform(df[columnas_categoricas])
    # ohe
    encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(df_imputed)
    df_ohe = encoder.transform(df_imputed)
    # add names
    columnas_nombres = encoder.get_feature_names()
    df_nombres = pd.DataFrame(df_ohe, columns = columnas_nombres).reset_index(drop = True)
    # gather results
    df_salida = pd.concat([dataframe.drop(columns = columnas_categoricas), df_nombres], axis = 1)
    #
    return df_salida

Unnamed: 0,peso,camara_trasera,ram,memoria,precio,camara_frontal,x0_apple,x0_huawei,x0_motorola,x0_nokia,...,x1_apple,x1_arm,x1_mediatek,x1_qualcomm,x1_samsung,x2_android,x2_ios,x3_4g,x3_4glte,x3_5g
0,,,,,4000,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# feature_engineeric_categoric(df, columnas_categoricas)

In [15]:
def feature_engineer(dataframe, columnas_numericas, columnas_categoricas):
    df_numeric   = feature_engineer_numeric(dataframe,    columnas_numericas)
    df_categoric = feature_engineer_categoric(df_numeric, columnas_categoricas)
    # salida
    return df_categoric

In [16]:
predictores_transformed = feature_engineer(predictores, columnas_numericas, columnas_categoricas)

In [31]:
def ohe_objetivo(dataframe):
    df = dataframe.copy()
    # 
    encoder = OneHotEncoder(handle_unknown = 'error', sparse = False).fit(df)
    df_ohe  = encoder.transform(df)
    # format
    columnas_nombres = encoder.get_feature_names()
    df_nombres       = pd.DataFrame(df_ohe, columns = columnas_nombres).reset_index(drop = True)
    # salida
    return df_nombres, encoder

In [32]:
(objetivo_transformed, encoder_objetivo) = ohe_objetivo(objetivo)
# OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(objetivo).transform(objetivo)

## Modelado

Para facilitarnos el trabajo, crearé un pipeline

In [58]:
# puercamente un DecisionTreeClasifier
from sklearn.metrics import roc_auc_score
# predictores_transformed
modelo = DecisionTreeClassifier(random_state = 0)
#
cross_val_score(modelo, predictores_transformed, objetivo_transformed, cv = 5)

array([0.58823529, 0.6875    , 0.1875    , 0.25      , 0.1875    ])

In [121]:
# grid search 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, hamming_loss, log_loss
#
scorer_accuracy = make_scorer(accuracy_score)
scorer_f1       = make_scorer(f1_score, average = 'micro')
#
parameters = {'max_depth': [2, 4, 6], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3, 4]}
#  {'kernel':('linear', 'rbf'), 'C':[1, 10]}
modelo = GridSearchCV(DecisionTreeClassifier(random_state = 59), parameters, n_jobs = 6, scoring = scorer_f1, cv = 3)
modelo.fit(X = predictores_transformed, y = objetivo_transformed)
modelo_mejor = modelo.best_estimator_
print (modelo.best_score_, modelo.best_params_) 

0.12132352941176472 {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 3}


In [122]:
# modelo_mejor.predict(predictores_transformed)

In [22]:
# grid search 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble        import RandomForestClassifier
from sklearn.metrics         import make_scorer, accuracy_score, f1_score
#
scorer_accuracy = make_scorer(accuracy_score)
scorer_f1       = make_scorer(f1_score, average = 'micro')
#
parameters = {'max_depth': [2, 4, 6], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3, 4], 'n_estimators': [5, 10, 20]}
# 
modelo = GridSearchCV(RandomForestClassifier(max_samples = 0.75, random_state = 59, oob_score = True, max_features = 4), parameters, n_jobs = 6, scoring = scorer_f1, cv = 4)
modelo.fit(X = predictores_transformed, y = objetivo_transformed)
modelo_mejor = modelo.best_estimator_
print (modelo.best_score_, modelo.best_params_) 

0.16666666666666669 {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 1, 'n_estimators': 20}


In [54]:
# prediction for all
prediction_array = modelo_mejor.predict(predictores_transformed)
prediction = encoder_objetivo.inverse_transform(prediction_array)

In [73]:
# prediction for individual example
ejemplo_predictores = predictores_transformed.head(1)
ejemplo_prediccion  = modelo_mejor.predict(ejemplo_predictores)
ejemplo_prediccion  = encoder_objetivo.inverse_transform(ejemplo_prediccion)
pd.DataFrame({'producto_nombre': ejemplo_prediccion[0]}).merge(df, on = 'producto_nombre', how = 'inner')

Unnamed: 0,producto_nombre,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia
0,10l t770b,0.26,16,48,6,32,6590,tcl,qualcomm,android,4g


In [87]:
# prediction using incomplete information
example_partial = pd.DataFrame({'precio': 4000, 'sistema_operativo': 'android', 'camara_frontal': 2}, index = [0])
# add missing columns
# empty dataframe
df_empty = pd.DataFrame([[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, '', '', '', '']], columns = columnas_predictoras)
# substituyo valores
print(df_empty) 
df_example_partial = pd.concat([df_empty.drop(columns = example_partial.columns), example_partial], axis = 1)
df_example_partial

   peso  camara_trasera  camara_frontal  ram  memoria  precio marca  \
0   NaN             NaN             NaN  NaN      NaN     NaN         

  procesador sistema_operativo tecnologia  
0                                          


Unnamed: 0,peso,camara_trasera,ram,memoria,marca,procesador,tecnologia,precio,sistema_operativo,camara_frontal
0,,,,,,,,4000,android,2


## Preparación ejemplos nuevos

Aquí transformamos la entrada del usuario.

In [94]:
# feature engineer for incomplete data
def feature_engineer_numeric_new_data(dataframe_parcial, dataframe_completo, columnas_numericas):
    df_completo = dataframe_completo.copy()
    df_parcial = dataframe_parcial.copy()
    ##  transformations
    # imputacion
    imputador_numeric = SimpleImputer(missing_values = np.nan, strategy = 'mean').fit(df_completo[columnas_numericas])
    df_imputed        = imputador_numeric.transform(df_parcial[columnas_numericas])
    # estandarizado
    estandarizador  = StandardScaler().fit(df_completo[columnas_numericas])
    df_standarizado = estandarizador.transform(df_imputed)
    # add names
    df_nombres = pd.DataFrame(df_standarizado, columns = columnas_numericas).reset_index(drop = True)
    df_salida  = pd.concat([df_nombres, df_parcial.drop(columns = columnas_numericas)], axis = 1)
    # 
    return df_salida
# feature_engineer_numeric_new_data(df_example_partial, predictores, columnas_numericas)

In [95]:
# ingenieria de variables para las variables categoricas de los ejemplos nuevos
def feature_engineer_categoric_new_data(dataframe_parcial, dataframe_completo, columnas_categoricas):
    df_completo = dataframe_completo.copy()
    df_parcial  = dataframe_parcial.copy()
    # imputacion
    imputador_categoric = SimpleImputer(missing_values = None, strategy = 'most_frequent').fit(df_completo[columnas_categoricas])
    df_imputed = imputador_categoric.transform(df_parcial[columnas_categoricas])
    # ohe
    encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False).fit(df_completo[columnas_categoricas])
    df_ohe = encoder.transform(df_imputed)
    # add names
    columnas_nombres = encoder.get_feature_names()
    df_nombres = pd.DataFrame(df_ohe, columns = columnas_nombres).reset_index(drop = True)
    # gather results
    df_salida = pd.concat([dataframe_parcial.drop(columns = columnas_categoricas), df_nombres], axis = 1)
    #
    return df_salida
# test
# feature_engineer_categoric_new_data(df_example_partial, predictores, columnas_categoricas)

In [99]:
def feature_engineer_new_data(dataframe_parcial, dataframe_completo, columnas_numericas, columnas_categoricas):
    df_completo = dataframe_completo.copy()
    df_parcial = dataframe_parcial.copy()
    # magic
    df_numeric   = feature_engineer_numeric_new_data(df_parcial,   df_completo, columnas_numericas)
    df_categoric = feature_engineer_categoric_new_data(df_numeric, df_completo, columnas_categoricas)
    # 
    df_salida = df_categoric
    return df_salida

In [100]:
feature_engineer_new_data(df_example_partial, predictores, columnas_numericas, columnas_categoricas)

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,x0_apple,x0_huawei,x0_motorola,x0_nokia,...,x1_apple,x1_arm,x1_mediatek,x1_qualcomm,x1_samsung,x2_android,x2_ios,x3_4g,x3_4glte,x3_5g
0,0.0,0.0,-1.384117,0.0,0.0,-0.98314,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [160]:
df.sort_values(by = 'producto_nombre').head(20)

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
48,0.26,16,48,6,32,6590,tcl,qualcomm,android,4g,10l t770b
68,0.11,13,8,3,32,4490,samsung,qualcomm,android,4glte,galaxy a11
73,0.11,13,8,3,32,3990,samsung,qualcomm,android,4glte,galaxy a11
65,0.11,13,8,3,48,4949,samsung,qualcomm,android,4glte,galaxy a11
66,0.185,13,8,4,32,4769,samsung,qualcomm,android,4glte,galaxy a20s
69,0.185,13,8,4,32,4190,samsung,qualcomm,android,4glte,galaxy a20s
63,0.185,13,8,4,32,4999,samsung,qualcomm,android,4glte,galaxy a20s
54,0.192,48,13,4,64,5999,samsung,qualcomm,android,4glte,galaxy a21s
55,0.188,5,16,4,32,5999,samsung,qualcomm,android,4glte,galaxy a30s
58,0.188,5,16,4,32,5490,samsung,qualcomm,android,4glte,galaxy a30s


In [161]:
df.sort_values(by = 'producto_nombre').tail(20)

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
10,0.308,12,7,4,512,24303,apple,apple,ios,4glte,iphone xs max
40,0.195,48,20,6,128,8999,xiaomi,qualcomm,android,4glte,m9 se
80,0.19,13,5,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
77,0.19,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
74,0.188,8,13,2,32,3490,motorola,mediatek,android,4glte,moto g8 play
64,0.188,48,25,4,64,4990,motorola,qualcomm,android,4glte,moto g8 plus
67,0.188,48,25,4,64,4590,motorola,qualcomm,android,4glte,moto g8 plus
62,0.188,16,48,4,32,4999,motorola,qualcomm,android,4glte,moto g8 power


In [133]:
modelo_mejor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': 0.75,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': True,
 'random_state': 59,
 'verbose': 0,
 'warm_start': False}

In [69]:
df

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia,producto_nombre
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g,galaxy z fold2
1,0.272,12,12,4,512,33999,apple,apple,ios,4glte,iphone 11 pro max
2,0.252,12,12,4,512,31279,apple,apple,ios,4glte,iphone 11 pro
3,0.302,12,10,8,256,30599,samsung,qualcomm,android,4g,galaxy note 20 ultra
4,0.183,12,10,8,256,29699,samsung,qualcomm,android,4glte,galaxy z flip
...,...,...,...,...,...,...,...,...,...,...,...
76,0.149,13,8,2,32,2990,motorola,qualcomm,android,4glte,moto e6 plus
77,0.190,13,8,1,16,2499,motorola,qualcomm,android,4glte,moto e6 play
78,0.149,13,8,2,32,2990,motorola,mediatek,android,4glte,moto e6 plus
79,0.176,13,8,2,32,3299,huawei,mediatek,android,4glte,honor 8a


In [71]:
predictores.head(1)

Unnamed: 0,peso,camara_trasera,camara_frontal,ram,memoria,precio,marca,procesador,sistema_operativo,tecnologia
0,0.282,12,10,12,256,46799,samsung,qualcomm,android,5g
