# Experimento B:

Goals:

1) Extender a classe de algum dos algoritmos no formato necessário para recomendações *user-user*.

2) Implementar o GridsSearch e validar seu uso para o nosso caso.

3) Adaptar uma varredura para diferentes quantidades de colunas acima do método do item 2.

O modelo receberá como entrar apenas o `id` da empresa e retornar uma lista do `N` mais recomendados (vizinhos mais próximos).

Verifiar: o modela terá entrada de empresas novas? Acho que não.

- Author: Israel Oliveira [\[e-mail\]](mailto:'Israel%20Oliveira%20'<prof.israel@gmail.com>)

In [4]:
%load_ext watermark

In [5]:
import numpy as np
import pandas as pd
from surprise import SVD, accuracy, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from scipy.spatial.distance import cosine

from loguru import logger
from tqdm import tqdm

In [6]:
# Run this cell before close. 
%watermark
%watermark -p loguru
%watermark -p scipy
%watermark -p surprise
%watermark --iversion
%watermark -b -r -g

2020-06-20T03:10:52+00:00

CPython 3.7.7
IPython 7.15.0

compiler   : GCC 8.3.0
system     : Linux
release    : 5.4.0-7626-generic
machine    : x86_64
processor  : 
CPU cores  : 8
interpreter: 64bit
loguru 0.5.1
scipy 1.4.1
surprise 0.1
pandas 1.0.4
numpy  1.18.5

Git hash: 7b1e2fc1b78dd5c7a25b69c388ca74914c179850
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


In [7]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# Carrega e processa o dataset:

In [8]:
path_data = '../data/'
top_cols = pd.read_csv('top_cols.csv')['cols'].to_list()
df_marked = pd.read_csv(path_data+'estaticos_market.csv', usecols=top_cols)
col_user = 'id'
top_cols.remove(col_user)

rest_cols = []
for col in top_cols:
    df_marked[col] = df_marked[col].fillna(0)*1

def normalize(x):
    return (x-np.min(x))/(np.max(x) - np.min(x)) if (np.max(x) - np.min(x)) > 0 else (x-np.min(x))

escala = 100
for col in top_cols:
    df_marked[col] = (escala*normalize(df_marked[col].tolist())).astype(np.uint8)
    
remove_cols = []
for col in top_cols:
    if df_marked[col].nunique() == 1:
        remove_cols.append(col)

df_marked = df_marked.drop(columns=remove_cols)
for col in remove_cols:
    top_cols.remove(col)

df_marked = pd.melt(df_marked, id_vars=["id"], var_name="itemID", value_name="rating").rename(columns={"id": "userID"})

reader = Reader(rating_scale=(0, escala))
data = Dataset.load_from_df(df_marked[['userID', 'itemID', 'rating']], reader)
del df_marked

# Modelo base.

In [9]:
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

In [31]:
# We'll use the famous SVD algorithm.
algo = SVD(n_factors=10)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 21.2400


21.23996767620834

In [7]:
algo.trainset.all_users()

range(0, 462298)

In [35]:
algo.trainset.to_raw_uid(462298-1)

'5259af3be44eca994715ca12fb0c148f1744345b8997ea9c02255c69b86bfbeb'

In [32]:
algo.pu[4]

array([ 0.09813943, -0.61476839,  1.42834029,  1.29686844,  1.98346139,
        2.15739923,  2.05828586,  1.49075146,  0.2024511 ,  2.7566085 ])

In [69]:
class ExSVD(SVD):
    """Class docstrings go here."""
    
    def __init__(self,**args):
        self.matrix_dict = {}
        super().__init__(**args)

    def fit(self,trainset):
        """Class docstrings go here."""
        logger.info("Treinando modelo SVD...")
        super().fit(trainset)
        logger.info("Pronto!")
    
    def get_neighbors(self,uid,k=1):
        """Class docstrings go here."""
        
        for uid2 in tqdm(self.trainset.all_users()):
            ordered = tuple(sorted((uid,uid2)))
            if (uid != uid2) and (ordered not in self.matrix_dict.keys()):
                self.matrix_dict[ordered] = cosine(self.pu[uid],self.pu[uid2])
        return [x[0] for x in sorted(
            [
                (uid2, self.matrix_dict[tuple(sorted((uid,uid2)))]) 
                for uid2 in self.trainset.all_users()
                if uid != uid2
            ], key=lambda x: x[1])][:k-1]

        

In [70]:
# We'll use the famous SVD algorithm.
ex_algo = ExSVD(n_factors=10)

# Train the algorithm on the trainset, and predict ratings for the testset
ex_algo.fit(trainset)
predictions = ex_algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

2020-06-20 04:39:57.675 | INFO     | __main__:fit:10 - Treinando modelo SVD...
2020-06-20 04:42:25.725 | INFO     | __main__:fit:12 - Pronto!


RMSE: 21.1749


21.174898837427836

In [72]:
ex_algo.get_neighbors(4,10)

100%|██████████| 462298/462298 [00:00<00:00, 682471.30it/s]


[462271, 85, 455739, 462056, 455861, 455588, 456461, 450504, 455573]