# Template 

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
# Run this cell before close.
%watermark
%watermark --iversion
%watermark -b -r -g -p surprise

2020-06-18T10:54:36+00:00

CPython 3.7.7
IPython 7.15.0

compiler   : GCC 8.3.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : 
CPU cores  : 16
interpreter: 64bit
pandas 1.0.4
numpy  1.18.5

surprise 0.1
Git hash: b4348e2f24cd733e3f1939d40228356aa358edf2
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

In [5]:
!ls ../data/
path_data = '../data/'

__MACOSX		  estaticos_portfolio1.csv  features_dictionary.pdf
estaticos_market.csv	  estaticos_portfolio2.csv
estaticos_market.csv.zip  estaticos_portfolio3.csv


#### Carrega dataset só com as top colunas escolhidas na priveira análise exploratória.

In [8]:
top_cols = pd.read_csv('top_cols.csv')['cols'].to_list()

In [9]:
df_marked = pd.read_csv(path_data+'estaticos_market.csv', usecols=top_cols)

In [10]:
col_user = 'id'
top_cols.remove(col_user)

In [11]:
df_marked[top_cols].head()

Unnamed: 0,fl_simples_irregular,idade_empresa_anos,fl_antt,idade_maxima_socios,idade_media_socios,media_meses_servicos_all,vl_faturamento_estimado_aux,fl_ltda,max_meses_servicos_all,vl_idade_maxima_socios_pj,fl_me,fl_spa,qt_coligados_serviço,fl_matriz,media_meses_servicos,fl_sa,fl_st_especial,vl_total_veiculos_pesados_grupo,fl_mei,fl_epp
0,False,14.457534,False,47.0,44.0,13.309195,3132172.8,False,93.266667,,False,False,5.0,True,43.738462,False,False,0.0,False,False
1,False,1.463014,False,27.0,27.0,,210000.0,False,,,False,False,,True,,False,False,0.0,True,False
2,False,7.093151,False,32.0,32.0,,50000.0,False,,,False,False,,True,,False,False,0.0,True,False
3,False,6.512329,False,36.0,36.0,,210000.0,False,,,False,False,,True,,False,False,0.0,False,False
4,False,3.2,False,,,,210000.0,False,,,False,False,,True,,False,False,0.0,False,False


No caso de uso da biblioteca Surprise, é necessário normalizar os valores numéricos.
Quanto aos valores booleanos, teremos rating binário min/max.

In [12]:
df_marked.dtypes[13] == float

True

In [13]:
df_marked.dtypes.unique()

array([dtype('O'), dtype('bool'), dtype('float64')], dtype=object)

In [14]:
rest_cols = []
for col in top_cols:
    if df_marked[col].dtype in [float, int, bool]:
        df_marked[col] = df_marked[col].fillna(0)*1
    else:
        rest_cols.append(col)
        print("{}: {}".format(col,df_marked[col].unique()))

fl_simples_irregular: [False nan True]
fl_antt: [False True nan]
fl_spa: [False nan True]


In [15]:
for col in rest_cols:
    df_marked[col] = df_marked[col].fillna(0)*1

In [16]:
df_marked[top_cols].head()

Unnamed: 0,fl_simples_irregular,idade_empresa_anos,fl_antt,idade_maxima_socios,idade_media_socios,media_meses_servicos_all,vl_faturamento_estimado_aux,fl_ltda,max_meses_servicos_all,vl_idade_maxima_socios_pj,fl_me,fl_spa,qt_coligados_serviço,fl_matriz,media_meses_servicos,fl_sa,fl_st_especial,vl_total_veiculos_pesados_grupo,fl_mei,fl_epp
0,0,14.457534,0,47.0,44.0,13.309195,3132172.8,0,93.266667,0.0,0,0,5.0,1,43.738462,0,0,0.0,0,0
1,0,1.463014,0,27.0,27.0,0.0,210000.0,0,0.0,0.0,0,0,0.0,1,0.0,0,0,0.0,1,0
2,0,7.093151,0,32.0,32.0,0.0,50000.0,0,0.0,0.0,0,0,0.0,1,0.0,0,0,0.0,1,0
3,0,6.512329,0,36.0,36.0,0.0,210000.0,0,0.0,0.0,0,0,0.0,1,0.0,0,0,0.0,0,0
4,0,3.2,0,0.0,0.0,0.0,210000.0,0,0.0,0.0,0,0,0.0,1,0.0,0,0,0.0,0,0


É necessário normalizar (deixar entre $[0, 1]$) e escalar para um valor que matenha algum nível de detalhe. Usarei uma escala de inteiros entre $[0, 100]$, deve ser o suficiente.

In [17]:
def normalize(x):
    return (x-np.min(x))/(np.max(x) - np.min(x)) if (np.max(x) - np.min(x)) > 0 else (x-np.min(x))

escala = 100
for col in top_cols:
    df_marked[col] = (escala*normalize(df_marked[col].tolist())).astype(np.uint8)

In [18]:
df_marked[top_cols].head()

Unnamed: 0,fl_simples_irregular,idade_empresa_anos,fl_antt,idade_maxima_socios,idade_media_socios,media_meses_servicos_all,vl_faturamento_estimado_aux,fl_ltda,max_meses_servicos_all,vl_idade_maxima_socios_pj,fl_me,fl_spa,qt_coligados_serviço,fl_matriz,media_meses_servicos,fl_sa,fl_st_especial,vl_total_veiculos_pesados_grupo,fl_mei,fl_epp
0,0,13,0,37,35,2,0,0,3,0,0,0,2,100,0,0,0,0,0,0
1,0,1,0,22,22,1,0,0,1,0,0,0,0,100,0,0,0,0,100,0
2,0,6,0,26,26,1,0,0,1,0,0,0,0,100,0,0,0,0,100,0
3,0,6,0,29,29,1,0,0,1,0,0,0,0,100,0,0,0,0,0,0
4,0,2,0,1,1,1,0,0,1,0,0,0,0,100,0,0,0,0,0,0


In [19]:
remove_cols = []
for col in top_cols:
    if df_marked[col].nunique() == 1:
        remove_cols.append(col)

In [20]:
df_marked = df_marked.drop(columns=remove_cols)

In [21]:
for col in remove_cols:
    top_cols.remove(col)

Agora temos todos os valores normalizados entre $[0, 100]$, podendo gerar o datset para o surprase.

- `'id'`: será o id de usuário (`user_id`), cada valor da coluna será considerado um usuário.

- `top_cols`: serão o ids de itens (`user_id`), cada coluna será considerado um item.

- valores nas `top_cols`: serão os ratings (`rating`).-

In [22]:
df_marked = pd.melt(df_marked, id_vars=["id"], var_name="itemID", value_name="rating").rename(columns={"id": "userID"})

In [23]:
df_marked.head()

Unnamed: 0,userID,itemID,rating
0,a6984c3ae395090e3bee8ad63c3758b110de096d5d8195...,fl_matriz,100
1,6178f41ade1365e44bc2c46654c2c8c0eaae27dcb476c4...,fl_matriz,100
2,4a7e5069a397f12fdd7fd57111d6dc5d3ba558958efc02...,fl_matriz,100
3,3348900fe63216a439d2e5238c79ddd46ede454df7b9d8...,fl_matriz,100
4,1f9bcabc9d3173c1fe769899e4fac14b053037b953a1e4...,fl_matriz,100


In [24]:
sum(df_marked.userID == df_marked.userID.loc[0])

19

Gerando dataset no formato para o Surprise.

In [25]:
reader = Reader(rating_scale=(0, escala))
data = Dataset.load_from_df(df_marked[['userID', 'itemID', 'rating']], reader)

Primeira tentativa com algoritmo.

PC:

In [143]:
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    18.6628 18.6729 18.7003 18.5618 18.7140 18.6624 0.0535  
MAE (testset)     9.7577  9.7562  9.5689  9.4380  9.4236  9.5889  0.1463  
Fit time          404.74  404.86  405.42  404.60  405.26  404.98  0.31    
Test time         23.36   22.18   23.18   21.84   22.53   22.62   0.58    


{'test_rmse': array([18.66279959, 18.67290274, 18.70028829, 18.56184596, 18.71402719]),
 'test_mae': array([9.7576914 , 9.7562032 , 9.56890128, 9.43800675, 9.42356088]),
 'fit_time': (404.742112159729,
  404.86360454559326,
  405.41846466064453,
  404.5967993736267,
  405.263263463974),
 'test_time': (23.35508918762207,
  22.177977561950684,
  23.18179440498352,
  21.835482120513916,
  22.5341579914093)}

In [144]:
len(top_cols)

19

In [145]:
top_cols

['fl_spa',
 'vl_faturamento_estimado_aux',
 'fl_me',
 'fl_matriz',
 'fl_sa',
 'vl_total_veiculos_pesados_grupo',
 'idade_media_socios',
 'media_meses_servicos_all',
 'vl_idade_maxima_socios_pj',
 'fl_mei',
 'fl_simples_irregular',
 'idade_empresa_anos',
 'media_meses_servicos',
 'fl_ltda',
 'fl_st_especial',
 'idade_maxima_socios',
 'max_meses_servicos_all',
 'qt_coligados_serviço',
 'fl_antt']

Para o Experimento B: variar a quantidade de colunas.

MAC:

In [26]:
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    18.6404 18.4858 18.6044 18.9520 18.6014 18.6568 0.1565  
MAE (testset)     9.5272  9.3915  9.3744  9.7034  9.5025  9.4998  0.1180  
Fit time          328.01  348.64  336.58  333.07  331.98  335.66  7.05    
Test time         29.56   36.61   27.69   27.29   27.55   29.74   3.53    


{'test_rmse': array([18.64040729, 18.48575272, 18.60441227, 18.95203643, 18.60140808]),
 'test_mae': array([9.52722349, 9.39153275, 9.37439127, 9.70341362, 9.50246426]),
 'fit_time': (328.00676560401917,
  348.641179561615,
  336.57781314849854,
  333.067321062088,
  331.9837055206299),
 'test_time': (29.5554678440094,
  36.61160111427307,
  27.694409132003784,
  27.290346145629883,
  27.550904273986816)}