# Experiment D

Ideas:

~~1) Implementar na classe do modelo o mapeamento entre empresas e `uid`.~~ Feito no C.

2) Implementar o passo de validação com os porfólios.

2.1) Definir métricas e implementar o gerador métricas.

3) Implementar um framework de busca de hiperparâmetros.

3.1) N fatores (`n_factors`) da decomposição FM.

3.2) N top colunas (`top_cols`) do dataset.

3.3) Parâmetro $L$ (`recomender(...,L,...)`).


- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
from typing import NewType, List
import functools
import operator
from collections import Counter, defaultdict
from copy import deepcopy
from time import time

import numpy as np
import pandas as pd
from surprise import SVD, accuracy, Dataset, Reader
from scipy.spatial.distance import cosine, cdist


from loguru import logger
from tqdm import tqdm

import pythran
import sys
eps = sys.float_info.epsilon*10

In [3]:
# Run this cell before close.
%watermark
%watermark -p loguru
%watermark -p scipy
%watermark -p surprise
%watermark -p pythran
%watermark --iversion
%watermark -b -r -g

2020-06-25T23:57:19-03:00

CPython 3.7.7
IPython 7.15.0

compiler   : GCC 9.3.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : x86_64
CPU cores  : 16
interpreter: 64bit
loguru 0.5.1
scipy 1.5.0
surprise 0.1
pythran 0.9.5
numpy   1.19.0
pandas  1.0.5
pythran 0.9.5

Git hash: eb4681cb4dbb1cd049b2defa19764186240e3526
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


From exp. C:

In [44]:


logger.info("Carregando e processando o dataset...")

path_data = '../data/'
top_cols = pd.read_csv('top_cols.csv')['cols'].to_list()
df_marked = pd.read_csv(path_data+'estaticos_market.csv', usecols=top_cols)
col_user = 'id'
top_cols.remove(col_user)

rest_cols = []
for col in top_cols:
    df_marked[col] = df_marked[col].fillna(0)*1

def normalize(x):
    return (x-np.min(x))/(np.max(x) - np.min(x)) if (np.max(x) - np.min(x)) > 0 else (x-np.min(x))

escala = 100
for col in top_cols:
    df_marked[col] = (escala*normalize(df_marked[col].tolist())).astype(np.uint8)
    
remove_cols = []
for col in top_cols:
    if df_marked[col].nunique() == 1:
        remove_cols.append(col)

df_marked = df_marked.drop(columns=remove_cols)
for col in remove_cols:
    top_cols.remove(col)

df_marked = pd.melt(df_marked, id_vars=["id"], var_name="itemID", value_name="rating").rename(columns={"id": "userID"})

reader = Reader(rating_scale=(0, escala))
#data = Dataset.load_from_df(df_marked[['userID', 'itemID', 'rating']].sample(frac=0.2), reader)
data = Dataset.load_from_df(df_marked[['userID', 'itemID', 'rating']], reader)
del df_marked

df_ep_list = [pd.read_csv(path_data+'estaticos_portfolio{}.csv'.format(i+1)) for i in range(3)]
tmp = []
for i in range(3):
    df_ep_list[i]['P'] = i+1 
    tmp.append(df_ep_list[i][['id','P']])
df_ep = pd.concat(tmp)
del df_ep_list
del tmp

logger.info("...pronto!")

Uid = NewType('uid', int)
Raw = NewType('raw', str)

def flat(a):
    return functools.reduce(operator.iconcat, a, [])

class ExSVD(SVD):
    """
        Classe extendida da surprise.SVD.
        
    
    """
    
    def __init__(self,stateless: bool = False, **args):
        self.matrix_dict = {}
        self.matrix_dict_2 = {}
        self.stateless = stateless
        super().__init__(**args)

    def fit(self,trainset: Dataset):
        """
            Reimplementei a SVD.fit para colocar um logger nível INFO.
        """
        #logger.info("Treinando modelo SVD...")
        super().fit(trainset)
        #logger.info("Pronto!")
    
    def _get_neighbors(self,uid: Uid, k: int = 1, black_list: List[Uid] = []) -> List[Uid]:
        """
            Calcula todas as distâncias entre 'uid' de entrada e todos os outros 'uid'.
            A distância calciulada é armazenda e não calculada novamente. 
        """
        black_list.append(uid)
        k = k if k >= 0 else 0
        #logger.info("Calculando todos os vizinhos...")
        #for uid2 in tqdm(self.trainset.all_users()):
        for uid2 in self.trainset.all_users():
            ordered = tuple(sorted((uid,uid2)))
            if (uid2 not in black_list) and (ordered not in self.matrix_dict.keys()):
                self.matrix_dict[ordered] = cosine(self.pu[uid],self.pu[uid2])
        out = [x[0] for x in sorted(
            [
                (uid2, self.matrix_dict[tuple(sorted((uid,uid2)))]) 
                for uid2 in self.trainset.all_users()
                if (uid2 not in black_list)
            ], key=lambda x: x[1])][:k-1]
        if self.stateless:
            del self.matrix_dict
            self.matrix_dict = {}
        return out

    def _get_neighbors_2(self,uid: Uid, k: int = 1, black_list: List[Uid] = []) -> List[Uid]:
        """
            Calcula todas as distâncias entre 'uid' de entrada e todos os outros 'uid'.
            A distância calciulada é armazenda e não calculada novamente. 
        """
        black_list.append(uid)
        k = k if k >= 0 else 0
        #logger.info("Calculando todos os vizinhos...")
        #for uid2 in tqdm(self.trainset.all_users()):
        if uid not in self.matrix_dict_2.keys():
            Un = transformer(self.pu).astype(np.int8)
            self.matrix_dict_2[uid] = vector_distance_pythran(Un,Un[uid])
        out = [x[0] for x in sorted(
            [
                (uid2, self.matrix_dict_2[uid][uid2])
                for uid2 in self.trainset.all_users()
                if (uid2 not in black_list)
            ], key=lambda x: x[1])][:k-1]
        if self.stateless:
            del self.matrix_dict
            self.matrix_dict = {}
        return out
    
    def _uid2raw(self, uid: Uid)-> str:
        '''
            uid -> raw.
            Valor interno para externo, o nome original do usuário.
        '''
        return self.trainset.to_raw_uid(uid)
    
    def _raw2uid(self, raw: Raw)-> int:
        '''
            raw -> uid.
            Valor externo para interno, o id interno do usuários..
        '''
        return self.trainset.to_inner_uid(raw)
    
    def recomender(self, in_list: List[Raw], k: int = 1, L: int = 3, Fk: int = 1, limit: int = 100)-> List[Raw]:
        '''
            Faz as recomendacoes.
            ##### Função incompleta #####
        '''
        # Pega quantas recomendações por usuário em `in_list`,
        # mas sem deixar faltar
        N_in = len(in_list)
        k = k if k > 0 else 1
        R_per_in = L*(k//N_in + min(k%N_in,1))

        # Pega os `uid`
        uid_in_list = []
        for raw in in_list:
            uid_in_list.append(self._raw2uid(raw))

        # Pega os vizinhos mais próximos de cada uid de entrada.
        done = False
        flag = True
        Rounds = 0
        while limit and (not done):
            Rounds += 1
            # Ele sempre pega todos novamente.
            recomendations_list = []
            for i,uid in enumerate(uid_in_list):
                #logger.info("Calculando todos os vizinhos...{:,}/{:,} (Round: {:,}).".format(i+1,N_in,Rounds))
                #recomendations_list.append(self._get_neighbors(uid,R_per_in,in_list))
                recomendations_list.append(self._get_neighbors_2(uid,R_per_in,in_list))
            # Quando limit = 0, encerra.
            limit -= 1
            # Quando tem gente o suficiente, encerra.
            if len(set(flat(recomendations_list))) >= Fk*k:
                done = True
            # Depois do primeiro loop, pega um a mais.
            R_per_in += 1

        # Aqui gera um dicionário ordenando por votacao.
        count_rec = Counter(flat(recomendations_list)) # A votação!!
        count_rec = list(count_rec.items())
        ct_pos = defaultdict(list)
        #ct_pos_inv = defaultdict(list)
        while count_rec:
            tmp = count_rec.pop(0)
            ct_pos[tmp[1]].append(tmp[0])
            #ct_pos_inv[tmp[0]].append(tmp[1])

        # Aqui considera a posiçao de vizinhos mais proximos.
        #nn_pos = defaultdict(list)
        nn_pos_inv = defaultdict(list)
        tmp = deepcopy(recomendations_list)
        while tmp:
            tmp2 = tmp.pop(0)
            n = 0
            while tmp2:
                n += 1
                tmp3 = tmp2.pop(0)
                #nn_pos[n].append(tmp3)
                nn_pos_inv[tmp3].append(n)

        # Vai separando por votação e ordem de proximidade como desempate.      
        votos_list = list(ct_pos.keys())
        out_uid = []
        while votos_list and k:
            votos = max(votos_list)
            votos_list.remove(votos)
            tmp = sorted([(tmp, min(nn_pos_inv[tmp])) for tmp in ct_pos[votos]], key=lambda x: x[1])
            while tmp and k:
                out_uid.append(tmp.pop(0)[0])
                k -= 1

        # converte para Raw e "joga fora".
        return [self._uid2raw(uid) for uid in out_uid]

ex_algo = ExSVD(stateless=True, n_factors=5, n_epochs=5, verbose=True)
ex_algo.fit(data.build_full_trainset())

2020-06-26 00:18:13.350 | INFO     | __main__:<module>:1 - Carregando e processando o dataset...
2020-06-26 00:18:32.446 | INFO     | __main__:<module>:45 - ...pronto!


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


Passo de validação simples, para cada uma empresa no portfólio pegar N recomendações e ver se uma delas está no portfólio. Se está, soma 1, se não, soma 0.

In [None]:
N = 10
tmp = []
n = 0
Nu = 10
times = []
for row in df_ep.sample(n=Nu).iterrows():
    t = time()
    n += 1
    print("Empresa {:,}/{:,}.".format(n,Nu), end='\r')
    recs = ex_algo.recomender([row[1].id],k=N)
    tmp.append(any([x in df_ep.loc[df_ep.P == row[1].P].id.to_list() for x in recs])*1)
    times.append(time()-t)

In [17]:
sum(tmp)

0

In [22]:
pd.DataFrame(times, columns=['time']).describe()

Unnamed: 0,time
count,10.0
mean,19.590036
std,3.427845
min,16.35698
25%,17.279349
50%,17.821092
75%,21.948267
max,26.384208


Item 2 pronto, mas com um alto tempo para calcular as distâncias.
Conferir: [scipy.spatial.distance.cdist](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html)

In [6]:
%load_ext pythran.magic

Primeiro, normalizar os dados, ficando ente $[0, 1]$.
Assim, posso escalar para 255 para poder usar o formato unsigned int 8 bits

In [37]:
%%pythran -fopenmp
#pythran export normalizeitor(float64[][])
#pythran export transformer(float64[][])
def normalizeitor(x):
    return (x -x.min())/(x.max() - x.min() +1e-10)
def transformer(U):
    for i in range(len(U)):
        U[i] = 127*normalizeitor(U[i])
    return U

In [8]:
%%pythran -fopenmp
#pythran export pairwise_distance_pythran(int8[][])
def pairwise_distance_pythran(X):
    return abs(X[:, None, :] - X).sum(-1)

In [27]:
%%pythran -fopenmp
#pythran export vector_distance_pythran(int8[][],int8[])
def vector_distance_pythran(X,vec):
    return abs(X - vec).sum(-1)

In [10]:
U = ex_algo.pu.astype(np.float32)
Un = transformer(U).astype(np.int8)

In [None]:
M = pairwise_distance_pythran(Un[:10])

In [28]:
D = vector_distance_pythran(Un,Un[0])

In [32]:
D.shape

(462298,)

In [18]:
M = Un[:10]
vec = Un[0]

In [22]:
abs(M -vec).sum(-1)

array([  0, 459, 464,  30, 195, 217, 407, 472,  20, 299])

In [24]:
vec

array([  3,  30,  86, 127,   0], dtype=int8)

In [25]:
M-vec

array([[   0,    0,    0,    0,    0],
       [ 115,   20,  -86, -111,  127],
       [ 124,   21,  -86, -108,  125],
       [  -3,    9,    7,    0,   11],
       [  -3,   72,   41,   -9,   70],
       [  -3,   81,   41,   -9,   83],
       [ 124,    5,  -86,  -94,   98],
       [ 124,   23,  -86, -118,  121],
       [  -3,    3,    6,    0,    8],
       [  96,   97,   27,  -79,    0]], dtype=int8)

In [41]:
def _get_neighbors_2(self,uid: Uid, k: int = 1, black_list: List[Uid] = []) -> List[Uid]:
    """
        Calcula todas as distâncias entre 'uid' de entrada e todos os outros 'uid'.
        A distância calciulada é armazenda e não calculada novamente. 
    """
    black_list.append(uid)
    k = k if k >= 0 else 0
    #logger.info("Calculando todos os vizinhos...")
    #for uid2 in tqdm(self.trainset.all_users()):
    if uid not in self.matrix_dict_2.keys():
        Un = transformer(self.pu).astype(np.int8)
        self.matrix_dict_2[uid] = vector_distance_pythran(Un,Un[uid])
    out = [x[0] for x in sorted(
        [
            (uid2, self.matrix_dict_2[uid][uid2])
            for uid2 in self.trainset.all_users()
            if (uid2 not in black_list)
        ], key=lambda x: x[1])][:k-1]
    if self.stateless:
        del self.matrix_dict
        self.matrix_dict = {}
    return out

In [46]:
ex_algo.matrix_dict_2 = {}

In [43]:
_get_neighbors_2(ex_algo, 1, 10)

[458313, 461717, 457608, 460853, 458691, 461713, 461716, 39740, 311305]

In [54]:
N = 10
tmp = []
n = 0
Nu = 10
times = []
for row in df_ep.sample(n=Nu).iterrows():
    t = time()
    n += 1
    print("Empresa {:,}/{:,}.".format(n,Nu), end='\r')
    recs = ex_algo.recomender([row[1].id],k=N)
    tmp.append(any([x in df_ep.loc[df_ep.P == row[1].P].id.to_list() for x in recs])*1)
    times.append(time()-t)

Empresa 10/10.

In [55]:
pd.DataFrame(times, columns=['time']).describe()

Unnamed: 0,time
count,10.0
mean,0.592178
std,0.008578
min,0.57853
25%,0.585788
50%,0.594603
75%,0.597464
max,0.605622


```
Como estava:
		time
count	  10.000000
mean	   19.590036
std		3.427845
min		16.356980
25%		17.279349
50%		17.821092
75%		21.948267
max		26.384208


Com Pythran:
		time
count	  10.000000
mean	   0.576958
std		0.006733
min		0.569525
25%		0.573426
50%		0.575135
75%		0.578402
max		0.593960
````


In [51]:
0.57*len(ex_algo.pu)/3600

73.19718333333333

In [60]:
N = 10
tmp = []
n = 0
Nu = 10
times = []
for row in df_ep.head(Nu).iterrows():
    t = time()
    n += 1
    print("Empresa {:,}/{:,}.".format(n,Nu), end='\r')
    recs = ex_algo.recomender([row[1].id],k=N)
    tmp.append(any([x in df_ep.loc[df_ep.P == row[1].P].id.to_list() for x in recs])*1)
    times.append(time()-t)

Empresa 10/10.

In [61]:
pd.DataFrame(times, columns=['time']).describe()

Unnamed: 0,time
count,10.0
mean,0.558123
std,0.018059
min,0.542547
25%,0.545548
50%,0.552976
75%,0.562632
max,0.602976


In [62]:
ex_algo.matrix_dict_2.keys()

dict_keys([70457, 224870, 398206, 431841, 310159, 88748, 316063, 396914, 409708, 94043, 430209, 188514, 19818, 285936, 132650, 363685, 418930, 193640, 40183, 421297, 328534, 73092, 121589, 424547, 74114, 453793, 54930, 405059, 160572, 432261, 369, 604, 1703, 2521, 5788, 6369, 7822, 8949, 9272, 9946])

**Agora que o tempo médio de fazer as recomendações reduziu de $20s$ para $0.6s$, posso dier que o item 2 está pronto**.

In [63]:
20/0.6

33.333333333333336