In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
#las que voy a usar
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Remote CSV files of the Book-Crossing data: book metadata and user ratings.
books_filename = 'https://github.com/valemicolgarcia/TensorFlow/raw/main/Book%20Recomendation/book-crossings/BX-Books.csv'
ratings_filename = 'https://github.com/valemicolgarcia/TensorFlow/raw/main/Book%20Recomendation/book-crossings/BX-Book-Ratings.csv'

In [3]:
# import csv data into dataframes
# Both files are parsed with ';' as the separator and ISO-8859-1 as the encoding.
# header=0 combined with `names` replaces the file's own header row with the
# short column names listed below; `usecols` keeps only those columns.

# Book metadata: every column is read as a string.
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

# Ratings: one row per (user, isbn) rating. 'isbn' stays a string so values
# such as '034545104X' are kept as-is.
df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

----------------------------------------------------------------------------------------------

In [4]:
# Preview the first rows of the books table.
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


Unión de los datasets

Tratamiento de nulos

In [6]:
# Count missing values per column in the books table.
df_books.isnull().sum()

isbn      0
title     0
author    1
dtype: int64

In [10]:
# Count missing values per column in the ratings table.
df_ratings.isnull().sum()

user      0
isbn      0
rating    0
dtype: int64

In [7]:
# Drop every books row that has a missing value in any column.
# NOTE(review): this mutates df_books in place, so re-running earlier display
# cells afterwards shows the already-cleaned frame.
# NOTE(review): 'author' is not used by any later cell visible here, so rows are
# dropped for a column the model may not need -- confirm this is intended.
df_books.dropna(inplace=True)


In [8]:
# Re-count missing values in the books table after dropping incomplete rows.
df_books.isnull().sum()

isbn      0
title     0
author    0
dtype: int64

In [11]:
# Show column dtypes, non-null counts and memory usage of the ratings table.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [12]:
# Sanity check: warn if the title queried later is absent from the books table.
book_title = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
if not df_books['title'].eq(book_title).any():
    print("Book title not found in the dataset.")

Eliminación de datos

In [13]:
# Ratings per user: a Series whose index holds the user ids and whose values are
# the number of times each user appears in the 'user' column.
user_counts = df_ratings['user'].value_counts()

In [14]:

# Ids of the users with 200 or more ratings (the index of the thresholded counts).
users_to_keep = user_counts.loc[user_counts.ge(200)].index
# Keep only the rating rows whose 'user' value is one of those ids.
df_filtered_users = df_ratings.loc[df_ratings['user'].isin(users_to_keep)]
df_filtered_users.head()

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0


El libro buscado sigue estando en los datos filtrados.

In [28]:
# Ratings per ISBN, counted over the full ratings table (not the user-filtered one).
books_counts = df_ratings['isbn'].value_counts()
# ISBNs that received 100 or more ratings.
books_to_keep = books_counts.loc[books_counts.ge(100)].index
# From the retained users' ratings, keep only the rows for those ISBNs.
df_filtered = df_filtered_users.loc[df_filtered_users['isbn'].isin(books_to_keep)]

In [29]:
# Preview the ratings that survive both the user and the book filters.
df_filtered.head()

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0


In [30]:
#df_reset = df_filtered.reset_index(drop=True)


Encodeo el autor para poder usarlo en el entrenamiento

Preparo las variables que voy a usar para el entrenamiento

In [40]:
# Build the rating matrix: pivot to one row per user and one column per ISBN,
# put 0 where a user has no rating for a book, then transpose so that each row
# is a book (ISBN) and each column is a user.
df = (
    df_filtered
    .pivot_table(index=['user'], columns=['isbn'], values='rating')
    .fillna(0)
    .T
)
print(df.values)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [41]:
# Only the last expression of a cell is rendered, so a preceding bare
# `df.head()` would be computed and thrown away. Show the matrix shape:
# (number of book rows, number of user columns).
df.shape

(731, 888)

In [33]:
# Replace the ISBN index of the rating matrix with book titles:
#   - df_books.set_index('isbn') makes 'isbn' the index of df_books,
#   - df.join(...) joins the two frames on their indexes,
#   - the 'title' column of the joined frame becomes the new index of df.
# NOTE(review): presumably every ISBN occurs at most once in df_books; duplicates
# would change the row count of the join -- TODO confirm.
# NOTE(review): after this cell df is indexed by title, so running it a second
# time without rebuilding df joins titles against ISBNs.
df.index = df.join(df_books.set_index('isbn'))['title']

Entrenamiento con la matriz de calificaciones (sin escalar)

In [34]:
# Fit a nearest-neighbours model on the book rows of the rating matrix,
# using cosine distance between rating vectors.
model = NearestNeighbors(metric='cosine')
model.fit(df.to_numpy())

In [35]:
def get_recommends(book_title = ""):
    """Recommend books whose rating vectors are closest to ``book_title``.

    Uses two module-level objects: the rating matrix ``df`` (one row per book,
    indexed by title) and the fitted nearest-neighbours ``model``.

    Args:
        book_title: Title to look up; it must be a label of ``df.index``.

    Returns:
        A two-element list ``[book_title, neighbours]``. ``neighbours`` is an
        object array of ``[title, distance]`` rows holding 5 of the 6 nearest
        neighbours, ordered from the largest distance to the smallest. The
        single closest neighbour (normally the queried book itself) is the one
        left out.

    Raises:
        KeyError: if ``book_title`` is not a label of ``df.index``.
    """
    if book_title not in df.index:
        raise KeyError(f"Book title not found in the dataset: {book_title!r}")

    book = df.loc[book_title]
    if isinstance(book, pd.DataFrame):
        # The same title can label several rows (e.g. several ISBNs of one
        # book); .loc then returns a DataFrame and kneighbors would receive a
        # 3-D query. Use the first matching row as the query vector.
        book = book.iloc[0]

    # Ask for 6 neighbours so that 5 remain once the closest one is dropped.
    distances, indices = model.kneighbors([book.values], n_neighbors=6)

    # Pair each neighbour's title with its distance, sort farthest-first and
    # keep the first 5 rows, which discards the closest neighbour.
    recommended_books = (
        pd.DataFrame({
            'title'   : df.iloc[indices[0]].index.values,
            'distance': distances[0]
        })
        .sort_values(by='distance', ascending=False)
        .head(5)
        .values
    )

    return [book_title, recommended_books]

In [36]:
# Query the recommender for one title and print the raw [title, neighbours] result.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

["Where the Heart Is (Oprah's Book Club (Paperback))", array([["I'll Be Seeing You", 0.8016210794448853],
       ['The Weight of Water', 0.7708583474159241],
       ['The Surgeon', 0.7699410915374756],
       ['I Know This Much Is True', 0.7677075266838074],
       ['The Lovely Bones: A Novel', 0.7234864234924316]], dtype=object)]


In [37]:
# Second query: The Queen of the Damned (Vampire Chronicles (Paperback)).
books = get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")
print(books)

['The Queen of the Damned (Vampire Chronicles (Paperback))', array([['Catch 22', 0.793983519077301],
       ['The Witching Hour (Lives of the Mayfair Witches)',
        0.7448656558990479],
       ['Interview with the Vampire', 0.7345068454742432],
       ['The Tale of the Body Thief (Vampire Chronicles (Paperback))',
        0.5376338362693787],
       ['The Vampire Lestat (Vampire Chronicles, Book II)',
        0.5178411602973938]], dtype=object)]


------------------------------------------------------------------------------------------------------

In [38]:
def test_book_recommendation():
  """Check get_recommends against the expected answer and print the verdict.

  The returned title must equal the queried one. For the first two returned
  neighbours, the title must be one of the expected titles and the distance
  must differ from the expected distance at the same position by less than
  0.05. Only those first two neighbours are inspected.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query_title)
  test_pass = True
  if recommends[0] != query_title:
    test_pass = False

  for i in range(2):
    title_missing = recommends[1][i][0] not in recommended_books
    distance_off = abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05
    if title_missing or distance_off:
      test_pass = False

  if not test_pass:
    print("You haven't passed yet. Keep trying!")
  else:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")

# Run the challenge check; it prints whether the recommender passed.
test_book_recommendation()

You passed the challenge! 🎉🎉🎉🎉🎉
