# ExMatrix
### Como usar o sistema de recomendação.

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(1, 'src/')
from train import *
from recommender import Recommender 

from tqdm import tqdm
import pandas as pd
from time import time

In [3]:
%load_ext watermark

In [4]:
# Run this cell before close.
%watermark -d --iversion -b -r -g -m -v

pandas 1.0.5
numpy  1.19.0
2020-07-26 

CPython 3.7.8
IPython 7.16.1

compiler   : GCC 8.3.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : 
CPU cores  : 16
interpreter: 64bit
Git hash   : a9d329fb3e728b631c4fe40ff17a31867230a633
Git repo   : https://github.com/ysraell/aceleradev_private.git
Git branch : master


# Treinando o modelo.
Dataset em `./data/`, salvando o modelo em `./model/`.

In [5]:
# Time the full training run and report it in minutes.
t = time()
# `train` is not defined in this notebook; presumably it comes from
# `from train import *`. It receives the dataset directory and the model
# output directory.
train('data/','model/')
dt = time() - t
# `dt` is in seconds, hence the division by 60.
print("Tempo total de treino: {:.2f} min.".format(dt/60))

2020-07-26 22:13:12.235 | INFO     | utils:load_dataset:82 - Carregando dataset de treino...
2020-07-26 22:13:24.471 | INFO     | utils:load_dataset:91 - ...pronto!
2020-07-26 22:13:24.472 | INFO     | utils:feat_proc:128 - Processando as features...
2020-07-26 22:13:55.702 | INFO     | utils:feat_proc:163 - ...pronto!
2020-07-26 22:13:56.244 | INFO     | train:train:35 - Treinando e modelo...
2020-07-26 22:13:56.404 | INFO     | model:fit:69 - Processando valores.
2020-07-26 22:14:02.899 | INFO     | model:fit:72 - Fatorizando.
2020-07-26 22:24:32.391 | INFO     | model:fit:77 - Matriz pronta.
2020-07-26 22:24:32.392 | INFO     | train:train:37 - ...pronto.
2020-07-26 22:24:35.685 | INFO     | train:train:42 - Model salvo: "model/model_default.pkl"..


Tempo total de treino: 11.39 min.


# Carrega o dataset para demonstração.

In [6]:
# Load the demo data from 'data/' with train and test merged; the call returns
# two objects, bound here as `data` and `portfolios`. `load_dataset` is not
# defined in this notebook (presumably provided by `from train import *`).
data, portfolios = load_dataset(path_data = 'data/', train_test_merged = True)

2020-07-26 22:24:35.946 | INFO     | utils:load_dataset:82 - Carregando dataset de treino...
2020-07-26 22:24:45.903 | INFO     | utils:load_dataset:91 - ...pronto!
2020-07-26 22:24:45.905 | INFO     | utils:load_dataset:95 - Carregando dataset de teste...
2020-07-26 22:24:45.920 | INFO     | utils:load_dataset:104 - ...pronto!


# Carrega o modelo.

In [7]:
# Build the recommender from the 'model/' directory (the directory passed to
# `train` as its second argument in the training cell).
model = Recommender('model/')

2020-07-26 22:24:47.521 | INFO     | recommender:__init__:33 - Modelo carregado


#### Exemplo de um ID:

In [8]:
# Show the row labelled 0 of `portfolios` as an example entry (columns `id`, `P`).
portfolios.loc[0]

id    09e95c1a84049001d086470a8f320a19b076f955a89122...
P                                                     1
Name: 0, dtype: object

In [9]:
# Ask the model for recommendations for the ID in row 0, take the first one
# returned, and display the `portfolios` row(s) whose `id` equals it.
portfolios.loc[portfolios.id == model.For([portfolios.id.loc[0]])[0]]

Unnamed: 0,id,P
473,b9ecb6629e14d5e760685521d1394e6ed76a563f770e61...,1


## Recomendações 1-K
- Entra `1` ID e saem `K` recomendações.

In [10]:
def one_N(N,K):
    """Measure 1-to-K hit rates: one portfolio ID in, ``K`` recommendations out.

    For every evaluated row of the global ``portfolios`` frame, the global
    ``model`` is asked (``model.For([id], k=K)``) for recommendations for that
    row's ``id``. The row counts as a hit when at least one recommended ID
    belongs to the same portfolio (same ``P`` value) as the query row.

    Parameters
    ----------
    N : int
        Number of rows to evaluate. If ``N > 0``, ``N`` rows are drawn with
        ``portfolios.sample(N, random_state=2)``; otherwise every row is used.
    K : int
        Forwarded to ``model.For`` as ``k``.

    Returns
    -------
    dict
        ``{portfolio: (evaluated, hits, hit_percentage)}`` for portfolios 1
        and 2, with the percentage rounded to two decimals (0 when nothing
        was evaluated for that portfolio).

    Raises
    ------
    KeyError
        If an evaluated row has a ``P`` value other than 1 or 2.
    """
    hits = {1: [], 2: []}

    # Build each portfolio's ID set once. Rebuilding the filtered ID list for
    # every recommendation of every row is loop-invariant work, and list
    # membership is a linear scan.
    # Assumes IDs are hashable (they are displayed as strings) - TODO confirm.
    ids_by_portfolio = {
        label: set(portfolios.loc[portfolios.P == label, 'id'])
        for label in portfolios.P.unique()
    }

    # Same evaluation loop for both modes; only the set of rows differs.
    rows = portfolios.sample(N, random_state=2) if N > 0 else portfolios

    for _, row in tqdm(rows.iterrows(), total=len(rows)):
        recs = model.For([row['id']], k=K)
        same_portfolio = ids_by_portfolio[row['P']]
        # NOTE(review): if `model.For` can return the query ID itself, this
        # counts it as a hit - confirm against the recommender.
        hits[row['P']].append(int(any(rec in same_portfolio for rec in recs)))

    return {
        label: (len(flags), sum(flags), round(100 * sum(flags) / max(1, len(flags)), 2))
        for label, flags in hits.items()
    }

In [19]:
# 1-K evaluation for K = 1, 5 and 10, stored per K.
# N=0 makes `one_N` iterate over every row of `portfolios` instead of a sample.
results = {}
for K in [1, 5, 10]:
    results[K] = one_N(0,K)

100%|██████████| 831/831 [16:22<00:00,  1.18s/it]
100%|██████████| 831/831 [16:37<00:00,  1.20s/it]
100%|██████████| 831/831 [16:12<00:00,  1.17s/it]


 Considerando apenas os portfólios 2 e 3, numerados abaixo como 1 e 2 (resp.).

In [20]:
def show_results(results):
    """Tabulate evaluation results as one row per ``K``.

    Parameters
    ----------
    results : dict
        ``{K: {portfolio: (total, hits, percentage)}}``, as built from the
        return values of ``one_N`` / ``more_N``.

    Returns
    -------
    pandas.DataFrame
        Column ``K`` followed, for each portfolio key ``p``, by ``Total_p``,
        ``Acertos_p`` and ``Percentual_p (%)``. Portfolio keys are taken from
        the first entry of ``results``; an empty ``results`` yields an empty
        frame with the columns for portfolios 1 and 2.
    """
    # Derive the portfolio labels from the data instead of assuming exactly
    # two portfolios; fall back to (1, 2) so an empty input keeps the
    # historical column layout.
    first_entry = next(iter(results.values()), None)
    portfolio_keys = list(first_entry) if first_entry else [1, 2]

    columns = ['K']
    for key in portfolio_keys:
        columns += [
            'Total_{}'.format(key),
            'Acertos_{}'.format(key),
            'Percentual_{} (%)'.format(key),
        ]

    # Flatten each K's per-portfolio tuples inline. The previous version used
    # a `flat` helper that is not defined or explicitly imported in this
    # notebook.
    rows = [
        [K] + [value for stats in per_portfolio.values() for value in stats]
        for K, per_portfolio in results.items()
    ]
    return pd.DataFrame(rows, columns=columns)

In [21]:
# Display the 1-K results as a table (one row per K).
show_results(results)

Unnamed: 0,K,Total_1,Acertos_1,Percentual_1 (%),Total_2,Acertos_2,Percentual_2 (%)
0,1,566,136,24.03,265,65,24.53
1,5,566,369,65.19,265,168,63.4
2,10,566,458,80.92,265,208,78.49


## Recomendações S-K
- Entra `S` IDs e saem `K` recomendações.

In [22]:
from tqdm import tqdm

def more_N(N,S,K,random_state=2):
    """Measure S-to-K hit rates: ``S`` portfolio IDs in, ``K`` recommendations out.

    In each of ``N`` rounds, and for each distinct ``P`` value in the global
    ``portfolios`` frame, ``S`` rows of that portfolio are sampled and their
    IDs are passed together to ``model.For(ids, k=K)``. The round counts as a
    hit for that portfolio when at least one recommended ID belongs to it.

    Parameters
    ----------
    N : int
        Number of sampling rounds per portfolio.
    S : int
        Number of IDs sampled (without replacement) from a portfolio per round.
    K : int
        Forwarded to ``model.For`` as ``k``.
    random_state : int, numpy.random.RandomState or None, default 2
        Seed (or generator) for the sampling, so repeated calls give the same
        result. One generator is shared by all rounds, so each round still
        draws a different sample. Pass ``None`` for unseeded sampling.

    Returns
    -------
    dict
        ``{portfolio: (rounds, hits, hit_percentage)}`` for portfolios 1 and
        2, with the percentage rounded to two decimals.

    Raises
    ------
    KeyError
        If ``portfolios.P`` contains a value other than 1 or 2 (and ``N > 0``).
    ValueError
        Raised by ``DataFrame.sample`` when ``S`` exceeds a portfolio's size.
    """
    # Local import: numpy is not explicitly imported by the notebook.
    import numpy as np

    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    hits = {1: [], 2: []}

    # Each portfolio's rows and ID set are loop-invariant: compute them once
    # instead of once per round and once per recommendation.
    # Assumes IDs are hashable (they are displayed as strings) - TODO confirm.
    groups = []
    for label in portfolios.P.unique():
        members = portfolios.loc[portfolios.P == label]
        groups.append((label, members, set(members['id'])))

    for _ in tqdm(range(N)):
        for label, members, member_ids in groups:
            seeds = members.sample(n=S, random_state=rng)
            recs = model.For(seeds['id'].to_list(), k=K)
            # NOTE(review): if `model.For` can return the seed IDs themselves,
            # they count as hits here - confirm against the recommender.
            hits[label].append(int(any(rec in member_ids for rec in recs)))

    return {
        label: (len(flags), sum(flags), round(100 * sum(flags) / max(1, len(flags)), 2))
        for label, flags in hits.items()
    }

### 5-K

In [23]:
# 5-K evaluation: 100 rounds, 5 seed IDs per portfolio per round,
# for K = 1, 5 and 10, stored per K.
results2 = {}
for K in [1, 5, 10]:
    results2[K] = more_N(100,5,K)

100%|██████████| 100/100 [19:41<00:00, 11.81s/it]
100%|██████████| 100/100 [19:38<00:00, 11.79s/it]
100%|██████████| 100/100 [19:36<00:00, 11.76s/it]


In [24]:
# Display the 5-K results as a table (one row per K).
show_results(results2)

Unnamed: 0,K,Total_1,Acertos_1,Percentual_1 (%),Total_2,Acertos_2,Percentual_2 (%)
0,1,100,23,23.0,100,23,23.0
1,5,100,70,70.0,100,70,70.0
2,10,100,91,91.0,100,86,86.0


### 10-K

In [25]:
# 10-K evaluation: 100 rounds, 10 seed IDs per portfolio per round,
# for K = 1, 5 and 10, stored per K.
# NOTE: the recorded output of this cell ends in KeyboardInterrupt during the
# first progress bar, so the saved run never completed this loop.
results3 = {}
for K in [1, 5, 10]:
    results3[K] = more_N(100,10,K)

 97%|█████████▋| 97/100 [39:32<01:13, 24.46s/it]


KeyboardInterrupt: 

In [None]:
# Display the 10-K results as a table (one row per K).
# NOTE: this cell has no execution count or output in the saved notebook.
show_results(results3)

In [27]:
# Display the shape of the `M` attribute of the object wrapped by the
# recommender - presumably the matrix produced during training; TODO confirm.
model.model.M.shape

(462298, 62)