<a href="https://colab.research.google.com/github/tianygoulart/Machine-Leaning/blob/main/desafioM3ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from scipy.spatial.distance import pdist, hamming, cosine


In [2]:
def cosine_similarity(x: np.array, y: np.array):
  cosine_sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))

  return cosine_sim

In [3]:
def array_centering(v):
  """Subtraindo dos elementos não nulos pela media."""
  # copia para evitar sobreescrita
  v = v.copy()
  # idexação para extrair elementos não nulos
  non_zeros = v>0
  # substituição pela media
  v[non_zeros]=v[non_zeros]-np.mean(v[non_zeros]) + 1e-6
  return v 

In [4]:
def centered_cosine_similarity(x,y):
  """Calcula a similaridade de cossenos centralizados entre arrays x e y."""
  # subtarindo a media
  x = array_centering(x)
  y = array_centering(y)
  # similaridade por cossenos
  centered_cosine_sim = cosine_similarity(x,y)
  return centered_cosine_sim

In [5]:
ratings = np.array([[4.0,0.0,0.0,4.7,1.0,0.0,0.0],
                   [5.0,4.5,4.0,0.0,0.0,0.0,0.0],
                   [0.0,0.0,0.0,1.5,5.0,4.0,0.0],
                   [4.1,3.0,0.0,4.9,0.0,0.0,3.0],
                   [1.0,4.0,0.0,2.5,3.8,1.0,5.0]])
columns = ['I'+str(i) for i in range(ratings.shape[1])]
index = ['U'+str(i) for i in range(ratings.shape[0])]

In [6]:
ratings = pd.DataFrame(data=ratings, columns=columns, index=index, dtype=float)

In [7]:
ratings

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6
U0,4.0,0.0,0.0,4.7,1.0,0.0,0.0
U1,5.0,4.5,4.0,0.0,0.0,0.0,0.0
U2,0.0,0.0,0.0,1.5,5.0,4.0,0.0
U3,4.1,3.0,0.0,4.9,0.0,0.0,3.0
U4,1.0,4.0,0.0,2.5,3.8,1.0,5.0


In [8]:
x = ratings.loc['U0'].values
y = ratings.loc['U1'].values

In [9]:
#Qual o valor da similaridade de cossenos entre os usuários U1 e U2?
cos_sim_U0U1 = cosine_similarity(x,y)
print(f"A similaridade de cossenos entre U0 e U1 é {cos_sim_U0U1:.2f}.")

A similaridade de cossenos entre U0 e U1 é 0.41.


In [10]:
x = ratings.loc['U0'].values
y = ratings.loc['U2'].values

In [11]:
#Qual o valor da similaridade de cossenos entre os usuários U1 e U3?
cos_sim_U0U2 = cosine_similarity(x,y)
print(f"A similaridade de cossenos entre U0 e U2 é {cos_sim_U0U2:.2f}.")


A similaridade de cossenos entre U0 e U2 é 0.29.


In [12]:
x = ratings.loc['U0'].values
y = ratings.loc['U3'].values

In [13]:
#Qual o valor da similaridade de cossenos entre os usuários U1 e U4?
cos_sim_U0U3 = cosine_similarity(x,y)
print(f"A similaridade de cossenos entre U0 e U3 é {cos_sim_U0U3:.2f}.")


A similaridade de cossenos entre U0 e U3 é 0.82.


In [14]:
x = ratings.loc['U0'].values
y = ratings.loc['U4'].values

In [15]:
#Qual o valor da similaridade de cossenos entre os usuários U1 e U5?
cos_sim_U0U4 = cosine_similarity(x,y)
print(f"A similaridade de cossenos entre U0 e U4 é {cos_sim_U0U4:.2f}.")


A similaridade de cossenos entre U0 e U4 é 0.39.


In [16]:
x = ratings.loc['U0'].values
y = ratings.loc['U1'].values

In [17]:
#Qual o valor da similaridade de cossenos centralizada entre os usuários U1 e U2?
centered_cos_sim_U0U1 = centered_cosine_similarity(x, y)
print(f'A similaridade de cossenos centralizada entre U0 e U1 é {centered_cos_sim_U0U1:.2f}.')


A similaridade de cossenos centralizada entre U0 e U1 é 0.20.


In [18]:
x = ratings.loc['U0'].values
y = ratings.loc['U2'].values

In [19]:
#Qual o valor da similaridade de cossenos centralizada entre os usuários U1 e U3?
centered_cos_sim_U0U2 = centered_cosine_similarity(x, y)
print(f'A similaridade de cossenos centralizada entre U0 e U2 é {centered_cos_sim_U0U2:.2f}.')

A similaridade de cossenos centralizada entre U0 e U2 é -0.89.


In [20]:
x = ratings.loc['U0'].values
y = ratings.loc['U3'].values

In [21]:
#Qual o valor da similaridade de cossenos centralizada entre os usuários U1 e U4?
centered_cos_sim_U0U3 = centered_cosine_similarity(x, y)
print(f'A similaridade de cossenos centralizada entre U0 e U3 é {centered_cos_sim_U0U3:.2f}.')

A similaridade de cossenos centralizada entre U0 e U3 é 0.44.


In [22]:
x = ratings.loc['U0'].values
y = ratings.loc['U4'].values

In [23]:
#Qual o valor da similaridade de cossenos centralizada entre os usuários U1 e U5?
centered_cos_sim_U0U4 = centered_cosine_similarity(x, y)
print(f'A similaridade de cossenos centralizada entre U0 e U4 é {centered_cos_sim_U0U4:.2f}.')

A similaridade de cossenos centralizada entre U0 e U4 é -0.39.


In [None]:
#Considerando a similaridade de cossenos centralizada, qual o usuário é mais similar ao usuário U1? U2

In [None]:
#Considerando a similaridade de cossenos centralizada, qual o segundo usuário mais similar ao usuário U1? U3


In [None]:
#Considerando uma filtragem colaborativa User-User, com agregação pela média simples e número de vizinhos igual a 2, qual a predição para a avaliação do usuário U1 ao item I2?

In [24]:
def estimate_rating(ratings,
                    user_index=0,
                    item_index=0,
                    k=2,
                    similarity=centered_cosine_similarity,
                    aggregation='mean'):
  """
  Estima a avaliação de um par (usuario, item) pela filtragem colaborativa
  Item-Item com a vizinhança de tamanho k.
  """
  # vetor de avaliaçãoes do item
  item_rating = ratings.iloc[item_index].values

  # calcula a similaridade entre item_index e os demais indices
  items_indexes = np.array([i for i in range(ratings.shape[0]) if i!=item_index])
  similarities = np.array([similarity(x=item_rating, y=ratings.iloc[i].values) for i in items_indexes])

  # acha os k itens mais semelhantes e que possuem avaliação
  items_rated = np.where(ratings.iloc[items_indexes, user_index]>0)[0]
  sorted_similarities = np.argsort(-1*similarities[items_rated])
  k_closest = items_indexes[items_rated[sorted_similarities]][:k]

  # agregação
  ratings_k_closest = ratings.iloc[k_closest, user_index]
  if aggregation=='mean':
    prediction = np.mean(ratings_k_closest)
  elif aggregation=='wmean':
    similarities_k_closest = similarities[items_rated][sorted_similarities][:k]
    prediction = np.dot(np.abs(similarities_k_closest),
                        ratings_k_closest)/np.sum(np.abs(similarities_k_closest))
  else:
    raise ValueError(f'{aggregation} is an invalid value for aggregation!')

  # trata singularidades
  prediction = prediction if not np.isnan(prediction) else 3.

  # verbose
  print(f"Predição para user_index={user_index}, item_index={item_index}," \
        f"k={k}, aggregation={aggregation} é: {prediction:.2f}")
  return prediction


In [42]:
# previsão para usuario = U1, item =2, k=2, aggregation=mean
prediction = estimate_rating(ratings,
                             user_index=1,
                             item_index=0,
                             aggregation='mean')

Predição para user_index=1, item_index=0,k=2, aggregation=mean é: 3.75


In [41]:
prediction = estimate_rating(ratings,
                             user_index=0,
                             item_index=1,
                             aggregation='mean')

Predição para user_index=0, item_index=1,k=2, aggregation=mean é: 4.05


In [26]:
#Considerando uma filtragem colaborativa User-User, com agregação pela média simples
#e número de vizinhos igual a 2, qual a predição para a avaliação do usuário U1 ao item I1?
#Suponha que a avaliação (U1, I1) seja desconhecida.
prediction = estimate_rating(ratings,
                             user_index=0,
                             item_index=0,
                             aggregation='mean')


Predição para user_index=0, item_index=0,k=2, aggregation=mean é: 4.55


In [None]:
#Qual o erro absoluto (|real-previsto|) da predição da tupla (U1, I1), considerando uma
#filtragem colaborativa User-User, com agregação pela média simples e número de vizinhos igual a 2?

In [28]:
i,j = np.where(ratings > 0)

In [29]:
n=len(i)
p=.1
np.random.seed(25)
idx_test=np.random.choice(n, size=int(p*n))

In [30]:
ratings_train = ratings.copy()
ratings_train.iloc[i[idx_test], j[idx_test]]=0.

In [31]:
i[idx_test]

array([1])

In [32]:
j[idx_test]

array([1])

In [33]:
ratings

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6
U0,4.0,0.0,0.0,4.7,1.0,0.0,0.0
U1,5.0,4.5,4.0,0.0,0.0,0.0,0.0
U2,0.0,0.0,0.0,1.5,5.0,4.0,0.0
U3,4.1,3.0,0.0,4.9,0.0,0.0,3.0
U4,1.0,4.0,0.0,2.5,3.8,1.0,5.0


In [34]:
ratings_train

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6
U0,4.0,0.0,0.0,4.7,1.0,0.0,0.0
U1,5.0,0.0,4.0,0.0,0.0,0.0,0.0
U2,0.0,0.0,0.0,1.5,5.0,4.0,0.0
U3,4.1,3.0,0.0,4.9,0.0,0.0,3.0
U4,1.0,4.0,0.0,2.5,3.8,1.0,5.0


In [35]:
y_true = ratings.values[i[idx_test], j[idx_test]]

In [36]:
y_true

array([4.5])

In [37]:
y_pred=[]
for k in range(len(idx_test)):
  y_pred.append(estimate_rating(ratings_train,
                user_index=j[idx_test][k],
                item_index=i[idx_test][k],
                aggregation='mean'))

Predição para user_index=1, item_index=1,k=2, aggregation=mean é: 3.50


In [38]:
y_pred

[3.5]

In [39]:
rmse = np.sqrt(np.sum((y_true-y_pred)**2))
print(f"RMSE de teste FC agregando pela média simples: {rmse: .2f}")

RMSE de teste FC agregando pela média simples:  1.00


In [52]:
ratings = np.array([[4.0,3.59,0.0,4.7,1.0,0.0,0.0],
                   [5.0,4.5,4.0,0.0,0.0,0.0,0.0],
                   [0.0,0.0,0.0,1.5,5.0,4.0,0.0],
                   [4.1,3.0,0.0,4.9,0.0,0.0,3.0],
                   [1.0,4.0,0.0,2.5,3.8,1.0,5.0]])
columns = ['I'+str(i) for i in range(ratings.shape[1])]
index = ['U'+str(i) for i in range(ratings.shape[0])]

In [53]:
ratings = pd.DataFrame(data=ratings, columns=columns, index=index, dtype=float)

In [54]:
ratings

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6
U0,4.0,3.59,0.0,4.7,1.0,0.0,0.0
U1,5.0,4.5,4.0,0.0,0.0,0.0,0.0
U2,0.0,0.0,0.0,1.5,5.0,4.0,0.0
U3,4.1,3.0,0.0,4.9,0.0,0.0,3.0
U4,1.0,4.0,0.0,2.5,3.8,1.0,5.0


In [55]:
x = ratings.loc['U0'].values
y = ratings.loc['U1'].values

In [56]:
cos_sim_U0U1 = cosine_similarity(x,y)
print(f"A similaridade de cossenos entre U0 e U1 é {cos_sim_U0U1:.2f}.")

A similaridade de cossenos entre U0 e U1 é 0.64.


In [57]:
def estimate_rating(ratings,
                    user_index=0,
                    item_index=0,
                    k=2,
                    similarity=centered_cosine_similarity,
                    aggregation='mean'):
  """
  Estima a avaliação de um par (usuario, item) pela filtragem colaborativa
  Item-Item com a vizinhança de tamanho k.
  """
  # vetor de avaliaçãoes do item
  item_rating = ratings.iloc[item_index].values

  # calcula a similaridade entre item_index e os demais indices
  items_indexes = np.array([i for i in range(ratings.shape[0]) if i!=item_index])
  similarities = np.array([similarity(x=item_rating, y=ratings.iloc[i].values) for i in items_indexes])

  # acha os k itens mais semelhantes e que possuem avaliação
  items_rated = np.where(ratings.iloc[items_indexes, user_index]>0)[0]
  sorted_similarities = np.argsort(-1*similarities[items_rated])
  k_closest = items_indexes[items_rated[sorted_similarities]][:k]

  # agregação
  ratings_k_closest = ratings.iloc[k_closest, user_index]
  if aggregation=='mean':
    prediction = np.mean(ratings_k_closest)
  elif aggregation=='wmean':
    similarities_k_closest = similarities[items_rated][sorted_similarities][:k]
    prediction = np.dot(np.abs(similarities_k_closest),
                        ratings_k_closest)/np.sum(np.abs(similarities_k_closest))
  else:
    raise ValueError(f'{aggregation} is an invalid value for aggregation!')

  # trata singularidades
  prediction = prediction if not np.isnan(prediction) else 3.

  # verbose
  print(f"Predição para user_index={user_index}, item_index={item_index}," \
        f"k={k}, aggregation={aggregation} é: {prediction:.2f}")
  return prediction


In [59]:
prediction = estimate_rating(ratings,
                             user_index=0,
                             item_index=1,
                             aggregation='mean')

Predição para user_index=0, item_index=1,k=2, aggregation=mean é: 4.05
