# Experimento B:

Goals:

1) Estender a classe de algum dos algoritmos no formato necessário para recomendações *user-user*.

2) Implementar o GridSearch e validar seu uso para o nosso caso.

3) Adaptar uma varredura para diferentes quantidades de colunas acima do método do item 2.

O modelo receberá como entrada apenas o `id` da empresa e retornará uma lista dos `N` mais recomendados (vizinhos mais próximos).

Verificar: o modelo terá entrada de empresas novas? Acho que não.

- Author: Israel Oliveira [\[e-mail\]](mailto:prof.israel@gmail.com)

In [1]:
%load_ext watermark

In [2]:
import numpy as np
import pandas as pd
from surprise import SVD, accuracy, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from scipy.spatial.distance import cosine

from loguru import logger
from tqdm import tqdm

In [3]:
# Run this cell before close. 
%watermark
%watermark -p loguru
%watermark -p scipy
%watermark -p surprise
%watermark --iversion
%watermark -b -r -g

2020-06-20T15:43:20+00:00

CPython 3.7.7
IPython 7.15.0

compiler   : GCC 8.3.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : 
CPU cores  : 16
interpreter: 64bit
loguru 0.5.0
scipy 1.4.1
surprise 0.1
numpy  1.18.5
pandas 1.0.4

Git hash: 09918739ea7206ad789c587fbff1730e9bc26e62
Git repo: https://github.com/ysraell/aceleradev_private.git
Git branch: master


In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# Carrega e processa o dataset:

In [5]:
# Directory holding the raw CSV files.
path_data = '../data/'
# Names of the columns to load, taken from the 'cols' column of top_cols.csv.
top_cols = pd.read_csv('top_cols.csv')['cols'].to_list()
# Load only the selected columns of the market dataset.
df_marked = pd.read_csv(path_data+'estaticos_market.csv', usecols=top_cols)
# Column that identifies the "user"; it is dropped from top_cols so that the
# list holds only the feature ("item") columns from here on.
col_user = 'id'
top_cols.remove(col_user)

# NOTE(review): rest_cols is never used in the visible part of the notebook.
rest_cols = []
for col in top_cols:
    # Missing values become 0; the `*1` presumably casts boolean columns to
    # integers -- confirm against the column dtypes.
    df_marked[col] = df_marked[col].fillna(0)*1

def normalize(x):
    """Min-max scale ``x`` to the [0, 1] interval.

    Args:
        x: Non-empty sequence or array of numeric values.

    Returns:
        ``(x - min) / (max - min)`` element-wise. When every value is equal
        (zero range) the division is skipped and ``x - min`` (all zeros) is
        returned instead, which avoids dividing by zero.
    """
    # Compute the extremes once instead of re-scanning the data per use.
    lowest = np.min(x)
    span = np.max(x) - lowest
    shifted = x - lowest
    return shifted / span if span > 0 else shifted

# Upper bound of the rating scale: every feature is rescaled to an integer
# in [0, escala].
escala = 100
for col in top_cols:
    df_marked[col] = (escala*normalize(df_marked[col].tolist())).astype(np.uint8)

# Columns with a single distinct value are dropped from both the frame and
# the list of feature columns.
remove_cols = [col for col in top_cols if df_marked[col].nunique() == 1]

df_marked = df_marked.drop(columns=remove_cols)
for col in remove_cols:
    top_cols.remove(col)

# Reshape to long format (one row per user/feature pair). The identifier
# column comes from col_user instead of a second hard-coded "id".
df_marked = pd.melt(
    df_marked, id_vars=[col_user], var_name="itemID", value_name="rating"
).rename(columns={col_user: "userID"})

reader = Reader(rating_scale=(0, escala))
data = Dataset.load_from_df(df_marked[['userID', 'itemID', 'rating']], reader)
# The long-format frame is no longer needed once the Dataset is built.
del df_marked

# Modelo base.

In [None]:
# Hold out 25% of the ratings as the test set; the rest is the train set.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs -- confirm if reproducibility matters.
trainset, testset = train_test_split(data, test_size=.25)

In [None]:
# Baseline model: plain SVD with 10 latent factors.
algo = SVD(n_factors=10)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE of the predictions made on the held-out test set.
accuracy.rmse(predictions)

In [None]:
algo.trainset.all_users()

In [None]:
algo.trainset.to_raw_uid(462298-1)

In [None]:
algo.pu[4]

In [None]:
class ExSVD(SVD):
    """Extension of ``surprise.SVD`` that adds user-user neighbour lookup.

    Neighbours are ranked by the cosine distance between the user factor
    vectors (``self.pu``) of the fitted model. Computed distances are cached
    in ``self.matrix_dict`` so each pair is evaluated at most once per fit.
    """

    def __init__(self,**args):
        # Cache of pairwise distances, keyed by the sorted (uid, uid) tuple.
        self.matrix_dict = {}
        super().__init__(**args)

    def fit(self,trainset):
        """Fit the SVD model, logging the start and end at INFO level.

        Args:
            trainset: Training set forwarded unchanged to ``SVD.fit``.

        Returns:
            ``self``, like the base-class ``fit``, so calls can be chained.
        """
        logger.info("Treinando modelo SVD...")
        super().fit(trainset)
        # Distances cached from a previous fit refer to old factors.
        self.matrix_dict = {}
        logger.info("Pronto!")
        return self

    def get_neighbors(self,uid,k=1):
        """Return the ``k`` users closest to ``uid``.

        Computes the cosine distance between ``uid`` and every other user in
        the train set. Distances already present in the cache are reused
        instead of being computed again.

        Args:
            uid: Index of the reference user, as yielded by
                ``self.trainset.all_users()`` and used to index ``self.pu``.
            k: Number of neighbours to return. Values below 1 yield an
                empty list.

        Returns:
            List with at most ``k`` user indices, nearest first.
        """
        logger.info("Calculando todos os vizinhos...")
        distances = []
        for uid2 in tqdm(self.trainset.all_users()):
            if uid2 == uid:
                continue
            # Sorted key so that (a, b) and (b, a) share one cache entry.
            pair = tuple(sorted((uid, uid2)))
            if pair not in self.matrix_dict:
                self.matrix_dict[pair] = cosine(self.pu[uid], self.pu[uid2])
            distances.append((uid2, self.matrix_dict[pair]))
        distances.sort(key=lambda item: item[1])
        # Slice with k (not k-1) so exactly k neighbours are returned.
        return [neighbor for neighbor, _ in distances[:max(k, 0)]]

        

In [None]:
# Same experiment as the baseline, now with the extended class (ExSVD);
# the keyword arguments are forwarded to the SVD constructor.
ex_algo = ExSVD(n_factors=10, verbose=True)

# Train the algorithm on the trainset, and predict ratings for the testset
ex_algo.fit(trainset)
predictions = ex_algo.test(testset)

# Then compute RMSE of the predictions made on the held-out test set.
accuracy.rmse(predictions)

Dado o `uid` (*o id de usuário no modelo*), retorna uma lista dos `uid`s mais próximos (`List[uid]`).

In [None]:
ex_algo.get_neighbors(4,10)

**Item 1 completo.**

In [6]:
# Hyper-parameter values to combine; every combination is cross-validated.
param_grid = {
    'n_factors': [5, 10],
    'n_epochs': [2, 3],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2, n_jobs=4, joblib_verbose=1)
logger.info("Running GS...")
# GridSearchCV.fit builds its own cross-validation folds and therefore needs
# the Dataset object, not the list of rating tuples from train_test_split.
gs.fit(data)
logger.info("...pronto!")

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

2020-06-20 15:44:26.145 | INFO     | __main__:<module>:9 - Running GS...
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [7]:
# We can now use the algorithm that yields the best rmse.
# NOTE(review): this cell requires a completed gs.fit(); in the recorded run
# the search was interrupted and 'best_estimator' was missing on the object.
algo = gs.best_estimator['rmse']
# Retrain the selected configuration on every available rating.
algo.fit(data.build_full_trainset())
# Tabulate the cross-validation results of every parameter combination.
results_df = pd.DataFrame.from_dict(gs.cv_results)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator'

In [None]:
results_df.to_csv('results_000.csv')

In [8]:
?GridSearchCV

[0;31mInit signature:[0m
[0mGridSearchCV[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0malgo_class[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparam_grid[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmeasures[0m[0;34m=[0m[0;34m[[0m[0;34m'rmse'[0m[0;34m,[0m [0;34m'mae'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrefit[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreturn_train_measures[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpre_dispatch[0m[0;34m=[0m[0;34m'2*n_jobs'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoblib_verbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
The :class:`GridSearchCV` class computes accuracy metrics for an
algorithm on various combinati