# Experiment F

2) Implementar um framework de busca de hiperparâmetros.

2.1) Parâmetros específicos para cada método de processamento do ds.

2.2) N top colunas (`top_cols`) do dataset.

2.3) Parâmetro $L$ (`recomender(...,L,...)`).

5) Implementar como entrada uma empresa nova, conter mapeamento de valores.

Talvez criar o notebook da Second_View, com:
1) Verificar `sklearn.inspection.permutation_importance`.

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [1]:
%load_ext watermark

In [2]:
from typing import NewType, List
from loguru import logger
import pandas as pd
import numpy as np
import jax.numpy as npj
from jax import jit
from time import time
from scipy.spatial.distance import cosine
from sklearn.decomposition import FactorAnalysis, FastICA, PCA, IncrementalPCA, NMF, TruncatedSVD
from collections import defaultdict, Counter
import functools
import operator
from copy import deepcopy
from tqdm import tqdm

In [3]:
# Run this cell before close.
%watermark
%watermark -p loguru
%watermark -p jax
%watermark -p sklearn
%watermark --iversion
%watermark -b -r -g

2020-07-05T03:18:34+00:00

CPython 3.7.7
IPython 7.16.1

compiler   : GCC 8.3.0
system     : Linux
release    : 5.4.0-7634-generic
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit
loguru 0.5.1
jax 0.1.72
sklearn 0.23.1
numpy  1.19.0
pandas 1.0.5

Git hash: 020136c9d00459366895fa09dee8bd7680b5d7a9
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


In [4]:
logger.info("Carregando dataset...")

path_data = '../data/'
df_marked = pd.read_csv(path_data+'estaticos_market.csv')
df_marked = df_marked.drop(columns=['Unnamed: 0'])
logger.info("...pronto!")

logger.info("Carregando dataset de validação...")

df_ep_list = [pd.read_csv(path_data+'estaticos_portfolio{}.csv'.format(i+1)) for i in range(3)]
tmp = []
for i in range(3):
    df_ep_list[i]['P'] = i+1 
    tmp.append(df_ep_list[i][['id','P']])
df_ep = pd.concat(tmp)
del df_ep_list
del tmp
logger.info("...pronto!")

# Para desenvolvimento do framework:
df_marked = df_marked.merge(df_ep, on='id')

2020-07-05 03:18:35.299 | INFO     | __main__:<module>:1 - Carregando dataset...
2020-07-05 03:18:41.766 | INFO     | __main__:<module>:6 - ...pronto!
2020-07-05 03:18:41.767 | INFO     | __main__:<module>:8 - Carregando dataset de validação...
2020-07-05 03:18:41.799 | INFO     | __main__:<module>:18 - ...pronto!


In [5]:
def flat(a):
    return functools.reduce(operator.iconcat, a, []) 

def feat_proc(dataset = df_marked, col_target = 'id', feat_cols = df_marked.columns[2:], N_topcols = -1):
    logger.info("Processando as features...")
    missing_count = {}
    remove_cols = []
    for col in feat_cols:
        try:
            missing_count[col] = sum(dataset[col].isna()) / dataset[col].nunique()
            dataset[col] = dataset[col].fillna(0)*1
        except ZeroDivisionError:
            remove_cols.append(col)

    feat_cols = [col for col in feat_cols if col not in remove_cols]

    def normalize(x):
        return (x-np.min(x))/(np.max(x) - np.min(x)) if (np.max(x) - np.min(x)) > 0 else (x-np.min(x))

    for col in feat_cols:
        try:
            dataset[col] = normalize(dataset[col].tolist())
        except:
            maping = {val:i+1 for i,val in enumerate(dataset[col].unique())}
            dataset[col] = dataset[col].apply(lambda x: maping[x])
            dataset[col] = normalize(dataset[col].tolist())

    remove_cols = []
    for col in feat_cols:
        if df_marked[col].nunique() == 1:
            remove_cols.append(col)
    feat_cols = [col for col in feat_cols if col not in remove_cols]
    N_topcols = N_topcols if (N_topcols > 0) and (N_topcols <= len(feat_cols)) else -1
    feat_cols_vals = [(col,val) for col,val in list(missing_count.items()) if col in feat_cols]
    if N_topcols == -1:
        top_cols = feat_cols
    else:
        top_cols = [col for col,_ in sorted(feat_cols_vals, key=lambda x: x[1])[:N_topcols]]
    
    missing_count = {key:val for key,val in missing_count.items() if col in feat_cols }
    logger.info("...pronto!")
    return dataset[[col_target]+top_cols], missing_count



def Manhattan(X,vec):
    return abs(X - vec).sum(-1)

def Camberra(X,vec):
    return abs((X - vec)/(X + vec)).sum(-1)

def BrayCurtis(X,vec):
    return abs((X - vec)).sum(-1) / abs((X - vec)).sum(-1).sum(-1)

def np_cossine(X,vec):
    return np.array([sum(X[i]*vec) / sum(X[i]**2)*sum(vec**2) for i in range(X.shape[0])])

def npj_cossine(X,vec):
    return npj.array([sum(X[i]*vec) / sum(X[i]**2)*sum(vec**2) for i in range(X.shape[0])])

def scy_cossine(X,vec):
    return np.array([cosine(X[i],vec) for i in range(X.shape[0])])

Manhattanj = jit(Manhattan)
Camberraj = jit(Camberra)
BrayCurtisj = jit(BrayCurtis)
np_cossinej = jit(npj_cossine)

dist_func = [Manhattan, Camberra, BrayCurtis, np_cossine, scy_cossine]
tmp = [Manhattanj, Camberraj, BrayCurtisj, np_cossinej]

for dist in tmp:
    dist.__name__ += 'j'

dist_func = dist_func+tmp
del tmp

def Nothing(arg):
    return arg

def npSVD(M):
    u, _, _ = np.linalg.svd(M, full_matrices=False)
    return u

def npSVDj(M):
    u, _, _ = npj.linalg.svd(M, full_matrices=False)
    return u

# Mais rápido!
_npSVDj = jit(npSVDj)


def _PCA(M,n_components=None):
    out = PCA(n_components=n_components)
    return out.fit_transform(M)

def _FastICA(M,n_components=None):
    out = FastICA(n_components=n_components)
    return out.fit_transform(M)

def _FactorAnalysis(M,n_components=None):
    out = FactorAnalysis(n_components=n_components)
    return out.fit_transform(M)

def _IncrementalPCA(M,n_components=None):
    out = IncrementalPCA(n_components=n_components)
    return out.fit_transform(M)

def _TruncatedSVD(M,n_components=None):
    out = TruncatedSVD(n_components=n_components)
    return out.fit_transform(M)

def _NMF(M,n_components=None):
    out = NMF(n_components=n_components)
    return out.fit_transform(M)

redux_func = [Nothing, npSVD, _npSVDj, _NMF, _TruncatedSVD, _IncrementalPCA, _FactorAnalysis, _FastICA, _PCA]

data, missing_count = feat_proc()

def escalaropt_missings(df: pd.DataFrame, score: dict):
    df_score = pd.DataFrame(score.items(), columns=['col','score'])
    df_score = pd.DataFrame(missing_count.items(), columns=['col','score'])
    df_score['escala_opt'] = 1-normalize((np.sqrt(df_score.score)))
    #df_score['escala_opt'].sort_values().reset_index(drop=True).plot()
    #df_score['escala_opt'].apply(lambda x: max(x,0.1)).sort_values().reset_index(drop=True).plot()
    df_score['escala_opt'] = df_score['escala_opt'].apply(lambda x: max(x,0.1))
    for row in df_score.iterrows():
        df[row.col] = row.escala_opt*df[row.col]
    return df

def escalaropt_std(df: pd.DataFrame, score: dict):
    df_score = pd.DataFrame(score.items(), columns=['col','score'])
    df_score = pd.DataFrame(missing_count.items(), columns=['col','score'])
    df_score['escala_opt'] = normalize([np.sqrt(np.sqrt(np.sqrt(df[col].std()))) for col in df_score['col']])
    #df_score['escala_opt'].sort_values().reset_index(drop=True).plot()
    #df_score['escala_opt'].apply(lambda x: max(x,0.1)).sort_values().reset_index(drop=True).plot()
    df_score['escala_opt'] = df_score['escala_opt'].apply(lambda x: max(x,0.1))
    for row in df_score.iterrows():
        df[row.col] = row.escala_opt*df[row.col]
    return df

def escalaropt_entropy(df: pd.DataFrame, score: dict):
    df_score = pd.DataFrame(score.items(), columns=['col','score'])
    df_score = pd.DataFrame(missing_count.items(), columns=['col','score'])
    df_score['escala_opt'] = normalize([(-sum((df[col]+1)*np.log(df[col]+1))) for col in df_score['col']])
    #df_score['escala_opt'].sort_values().reset_index(drop=True).plot()
    #df_score['escala_opt'].apply(lambda x: max(x,0.1)).sort_values().reset_index(drop=True).plot()
    df_score['escala_opt'] = df_score['escala_opt'].apply(lambda x: max(x,0.1))
    for row in df_score.iterrows():
        df[row.col] = row.escala_opt*df[row.col]
    return df

procDS_func = [Nothing, escalaropt_missings, escalaropt_std, escalaropt_entropy]

Uid = NewType('uid', int)
Raw = NewType('raw', str)

class ExMatrix():
    """
        ************
    """
    def __init__(self,process_values = Nothing, factorize = Nothing, vector_distance = Manhattan, stateless: bool = False):
        self.matrix_dict = {}
        self.stateless = stateless
        self.M = None
        self.pu = None
        self.raw = None
        self.uid = None
        self.vector_distance = vector_distance
        self.factorize = factorize
        self.process_values = Nothing

    def fit(self,dataset: pd.DataFrame):
        """
            ...
        """
        self.raw = dataset[dataset.columns[0]].to_dict()
        self.uid = {raw:uid for uid,raw in self.raw.items()}
        self.all_raw = dataset[dataset.columns[0]].tolist()
        self.all_uid = dataset.index
        dataset = self.process_values(dataset)
        ds_size = len(dataset[dataset.columns[1:]].values)
        self.M = self.factorize(dataset[dataset.columns[1:]].values)
        if ds_size != self.M.shape[0]:
            raise ValueError('A fatoração não está correta!')
        del dataset
        
    def _get_neighbors(self,uid: Uid, k: int = 1, black_list: List[Uid] = []) -> List[Uid]:
        """
            Calcula todas as distâncias entre 'uid' de entrada e todos os outros 'uid'.
            A distância calciulada é armazenda e não calculada novamente. 
        """
        k = k if k >= 0 else 0
        #logger.info("Calculando todos os vizinhos...")
        #for uid2 in tqdm(self.trainset.all_users()):
        if uid not in self.matrix_dict.keys():
            self.matrix_dict[uid] = self.vector_distance(self.M,self.M[uid])
        out = [x[0] for x in sorted(
            [
                (uid2, self.matrix_dict[uid][uid2])
                for uid2 in self.all_uid
                if (uid2 not in black_list)
            ], key=lambda x: x[1])][:k]
        if self.stateless:
            del self.matrix_dict
            self.matrix_dict = {}
        return out
    
    def _uid2raw(self, uid: Uid)-> str:
        '''
            uid -> raw.
            Valor interno para externo, o nome original do usuário.
        '''
        return self.raw[uid]
    
    def _raw2uid(self, raw: Raw)-> int:
        '''
            raw -> uid.
            Valor externo para interno, o id interno do usuários..
        '''
        return self.uid[raw]
    
    def recomender(self, in_list: List[Raw], k: int = 1, L: int = 3, Fk: int = 1, limit: int = 100)-> List[Raw]:
        '''
            Faz as recomendacoes.
            ##### Função incompleta #####
        '''
        # Pega quantas recomendações por usuário em `in_list`,
        # mas sem deixar faltar
        N_in = len(in_list)
        k = k if k > 0 else 1
        R_per_in = L*(k//N_in + min(k%N_in,1))

        # Pega os `uid`
        uid_in_list = [self._raw2uid(raw) for raw in in_list]

        # Pega os vizinhos mais próximos de cada uid de entrada.
        done = False
        flag = True
        Rounds = 0
        while limit and (not done):
            Rounds += 1
            # Ele sempre pega todos novamente.
            recomendations_list = [self._get_neighbors(uid,R_per_in,uid_in_list) for uid in uid_in_list]
            # Quando limit = 0, encerra.
            limit -= 1
            # Quando tem gente o suficiente, encerra.
            if len(set(flat(recomendations_list))) >= Fk*k:
                done = True
            # Depois do primeiro loop, pega um a mais.
            R_per_in += 1

        # Aqui gera um dicionário ordenando por votacao.
        count_rec = Counter(flat(recomendations_list)) # A votação!!
        count_rec = list(count_rec.items())
        ct_pos = defaultdict(list)
        #ct_pos_inv = defaultdict(list)
        while count_rec:
            tmp = count_rec.pop(0)
            ct_pos[tmp[1]].append(tmp[0])
            #ct_pos_inv[tmp[0]].append(tmp[1])

        # Aqui considera a posiçao de vizinhos mais proximos.
        #nn_pos = defaultdict(list)
        nn_pos_inv = defaultdict(list)
        tmp = deepcopy(recomendations_list)
        while tmp:
            tmp2 = tmp.pop(0)
            n = 0
            while tmp2:
                n += 1
                tmp3 = tmp2.pop(0)
                #nn_pos[n].append(tmp3)
                nn_pos_inv[tmp3].append(n)

        # Vai separando por votação e ordem de proximidade como desempate.      
        votos_list = list(ct_pos.keys())
        out_uid = []
        while votos_list and k:
            votos = max(votos_list)
            votos_list.remove(votos)
            tmp = sorted([(tmp, min(nn_pos_inv[tmp])) for tmp in ct_pos[votos]], key=lambda x: x[1])
            while tmp and k:
                out_uid.append(tmp.pop(0)[0])
                k -= 1

        # converte para Raw e "joga fora".
        return [self._uid2raw(uid) for uid in out_uid]


2020-07-05 03:18:58.236 | INFO     | __main__:feat_proc:5 - Processando as features...
2020-07-05 03:18:58.527 | INFO     | __main__:feat_proc:41 - ...pronto!


In [6]:
def Search(N=1, process_values = Nothing, factorize = Nothing, vector_distance_list = [Manhattan]):
    ex_algo = ExMatrix(process_values = process_values, factorize = factorize)
    ex_algo.fit(data)

    out = {}
    for dist in vector_distance_list:
        ex_algo.vector_distance = dist
        tmp ={1: [], 2: [], 3: []}
        for row in tqdm(df_ep.sample(frac=0.01).iterrows()):
            recs = ex_algo.recomender([row[1].id],k=N)
            tmp[row[1].P].append(any([x in df_ep.loc[df_ep.P == row[1].P].id.to_list() for x in recs])*1)
        out[dist.__name__] = {i: (sum(val)/len(val), sum(val), len(val)) for i,val in tmp.items()}
    return out

In [7]:
dist_list = [Manhattan, Camberraj, BrayCurtisj, cossine_spy]
proc_list = procDS_func
redux_list = redux_func
n_components_dict = {Nothing.__name__ : False,
                  _npSVDj.__name__: False,
                  _NMF.__name__ : True,
                  _TruncatedSVD.__name__ : True,
                  _IncrementalPCA.__name__ : True,
                  _FactorAnalysis.__name__ : True,
                  _FastICA.__name__ : True,
                  _PCA.__name__ : True}

In [None]:
n_components_list = [10, 20, 30]

results = Search(process_values = Nothing, factorize = Nothing, vector_distance_list= dist_list)

In [None]:
results


In [12]:
ex_algo = ExMatrix(vector_distance=cossine_spy)
ex_algo.fit(data)

In [None]:
for row in tqdm(df_ep.sample(frac=0.01).iterrows()):
    recs = ex_algo.recomender([row[1].id],k=1)

In [9]:
ex_algo._get_neighbors(1,2)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [13]:
self = ex_algo
uid = 1
k = 1
k = k if k >= 0 else 0
black_list = [4, 7]
#logger.info("Calculando todos os vizinhos...")
#for uid2 in tqdm(self.trainset.all_users()):
if uid not in self.matrix_dict.keys():
    self.matrix_dict[uid] = self.vector_distance(self.M,self.M[uid])
out = [x[0] for x in sorted(
    [
        (uid2, self.matrix_dict[uid][uid2])
        for uid2 in self.all_uid
        if (uid2 not in black_list)
    ], key=lambda x: x[1])][:k]
if self.stateless:
    del self.matrix_dict
    self.matrix_dict = {}


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
a=     [
        (uid2, self.matrix_dict[uid][uid2])
        for uid2 in self.all_uid
        if (uid2 not in black_list)
    ]

In [None]:
len(a)

In [None]:
self.all_uid

In [16]:
self.vector_distance(self.M,self.M[uid])

array([[0.99354571, 0.99354571, 0.99354571, ..., 0.99354571, 0.99354571,
        0.99354571],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.38022919, 0.38022919, 0.38022919, ..., 0.38022919, 0.38022919,
        0.38022919],
       ...,
       [0.84666271, 0.84666271, 0.84666271, ..., 0.84666271, 0.84666271,
        0.84666271],
       [0.39395908, 0.39395908, 0.39395908, ..., 0.39395908, 0.39395908,
        0.39395908],
       [0.6306781 , 0.6306781 , 0.6306781 , ..., 0.6306781 , 0.6306781 ,
        0.6306781 ]])

In [15]:
cosine(self.M[uid],self.M[uid])

0.0

In [21]:
X = self.M
vec = self.M[uid]
O = []
for i in range(X.shape[0]):
    O.append(cosine(X[i],vec))


In [22]:
O

[0.9935457082169379,
 0.0,
 0.3802291936279122,
 0.8471478546924851,
 0.7399987269249657,
 0.7971363956704623,
 0.5580985522438253,
 0.6410108479631822,
 0.6552427164976655,
 0.7219037191039912,
 0.43195685854081245,
 0.40244082324946195,
 0.8166498268978177,
 0.4204418769448268,
 0.751687480049672,
 0.6069482333574354,
 0.44557393212922236,
 0.7609261999862933,
 0.8304435941910852,
 0.8373242660842417,
 0.28592201646965854,
 0.7717743075937932,
 0.7830045502604215,
 0.5021061848084716,
 0.8031477663709231,
 0.791797487811366,
 0.3663488744533213,
 0.8082107634177241,
 0.5545562105693671,
 0.8576717530594801,
 0.5679536405464326,
 0.45359040498126213,
 0.5536357577024614,
 0.8196570422680466,
 0.5545508197101667,
 0.7992655230745678,
 0.5689660326923076,
 0.6151544097290818,
 0.43347829114307257,
 0.4517642974377467,
 0.41804600816454507,
 0.7580772165289953,
 0.5018617623812456,
 0.5216004226095639,
 0.8014198856080832,
 0.8317332548681919,
 0.7821390977773561,
 0.7918007133059727,
 0