In [30]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
#las que voy a usar
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [31]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

#books_filename = 'BX-Books.csv'
#ratings_filename = 'BX-Book-Ratings.csv'

books_filename = 'https://github.com/valemicolgarcia/TensorFlow/raw/main/Book%20Recomendation/book-crossings/BX-Books.csv'
ratings_filename = 'https://github.com/valemicolgarcia/TensorFlow/raw/main/Book%20Recomendation/book-crossings/BX-Book-Ratings.csv'

In [32]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [33]:
# add your code here - consider creating a new cell for each section of code

-------------------------------------------------------------------------------------------------------------------------------------------------------

# Cleaning the data

In [34]:
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [35]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


### Checking nan

In [36]:
df_ratings.isnull().sum()

user      0
isbn      0
rating    0
dtype: int64

In [37]:
df_books.isnull().sum()

isbn      0
title     0
author    1
dtype: int64

In [38]:
df_books.dropna(inplace=True)

In [39]:
df_books.isnull().sum()

isbn      0
title     0
author    0
dtype: int64

### Duplicates

In [40]:
num_duplicados = df_books.duplicated().sum()
num_duplicados


0

In [41]:
num_duplicados = df_ratings.duplicated().sum()
num_duplicados

0

--------------------------------------------------------------------------------------------------------------------------------------------------

### Deleting some data

In [42]:
#cuento la cantidad de ratings por usuario
user_counts = df_ratings['user'].value_counts()
print(user_counts)

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user, Length: 105283, dtype: int64


In [43]:
#devuelve los indices de los elementos que tienen mas de 200 ratings
users_to_keep = user_counts[user_counts >= 200].index
print(users_to_keep)

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
             28634,  59727, 268622, 188951, 225595,  83671, 252827,  99955,
             36554,  26883],
           dtype='int64', length=905)


In [44]:
#mantengo solo los usuarios con mas de 200 ratings
df_filtered_ratings = df_ratings[df_ratings['user'].isin(users_to_keep)]


In [45]:
books_counts = df_filtered_ratings['isbn'].value_counts()

In [46]:
#users_to_keep = user_counts[user_counts >= 200].index
books_to_keep = books_counts[books_counts >= 100].index

In [47]:
df_filtered_books = df_books[df_books['isbn'].isin(books_to_keep)]


In [48]:
df_filtered_books.head()

Unnamed: 0,isbn,title,author
18,440234743,The Testament,John Grisham
26,971880107,Wild Animus,Rich Shapero
37,446310786,To Kill a Mockingbird,Harper Lee
52,440225701,The Street Lawyer,JOHN GRISHAM
67,804106304,The Joy Luck Club,Amy Tan


In [49]:
df_filtered_ratings.head()

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1457,277427,0026217457,0.0
1458,277427,003008685X,8.0
1459,277427,0030615321,0.0
1460,277427,0060002050,0.0


In [50]:
df_filtered_ratings.index = df_filtered_ratings.join(df_filtered_books.set_index('isbn'))['title']

In [51]:
df_filtered_ratings.tail()

Unnamed: 0_level_0,user,isbn,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,275970,3829021860,0.0
,275970,4770019572,0.0
,275970,896086097,0.0
,275970,9626340762,8.0
,275970,9626344990,0.0


In [52]:
X_train = df_filtered_ratings.drop('isbn', axis=1)

Y_train = df_filtered_ratings['isbn']

In [53]:
X_train.head()

Unnamed: 0_level_0,user,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
,277427,10.0
,277427,0.0
,277427,8.0
,277427,0.0
,277427,0.0


### KNN

In [54]:
from sklearn.neighbors import NearestNeighbors

In [55]:
neigh = NearestNeighbors(n_neighbors=5, algorithm='auto')
neigh.fit(X_train)

In [56]:
#distances, indices = neigh.kneighbors(new data) --> asi busco los 5 vecinos mas cercanos de un nuevo punto

-------------------------------------------------------------------------------------

In [58]:
def get_recommends(book = ""):
    
    #busco el isbn del libro en el dataset df_books
    book_row = df_books[df_books['title'] == book] #me traigo la fila
    if book_row.empty: 
        return f"El libro '{book}' no se encontró en el dataset."
    
    isbn = book_row['isbn'].values[0] #busco el isbn
  
    #encuentro los 5 libros mas cercanos
    neigh = NearestNeighbors(n_neighbors=5, algorithm='auto')
    neigh.fit(X_train)



  return recommended_books

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 16)

--------------------------------------------------------------------------------------

# MODELO

# FUNCION RECOMENDATION

In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):

  isbn = df_books[df_books['title'] == book ]['isbn'].values #encuentro el isbn del titulo del libro en df_books
  if len(isbn) == 0:
        return "Book title not found."
  
  book_isbn = isbn[0] 
  book_index = df_books[df_books['isbn'] == book_isbn].index[0] #encuentro el indice del libro en la matriz
  
  
  #if book_index >= len(user_book_matrix_scaled):
   #     return "ISBN not found in the dataset."
  
  distances, indices = model_knn.kneighbors(user_book_matrix_scaled[book_index:book_index+1], n_neighbors=6)

  print("Distances:", distances)
  print("Indices:", indices)

  recommended_indices = indices[0] #indices de los libros recomendados
  recommended_distances = distances[0]

  recommended_books = []
  for idx, distance in zip(recommended_indices, recommended_distances):
      # Asegúrate de que el índice sea válido
      if 0 <= idx < len(df_books):
          title = df_books.iloc[idx]['title']
          recommended_books.append([title, distance])

  result = [book,recommended_books]


  return result

In [None]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

Distances: [[1. 1. 1. 1. 1. 1.]]
Indices: [[227039 227040 227037 227038 227035 227041]]
["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Colour : A Novel', 1.0], ['Swedenborg: Life and Teaching', 1.0], ['FranzÃ?Â¶sische Gedichte / Poemes francais.', 1.0], ['Alice at 80', 1.0], ['The Path: The Inner Life of Jesus Christ', 1.0], ["Woman's Book of Soul", 1.0]]]


In [None]:
books = get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")
print(books)

Distances: [[1. 1. 1. 1. 1. 1.]]
Indices: [[227039 227040 227037 227038 227035 227041]]
['The Queen of the Damned (Vampire Chronicles (Paperback))', [['The Colour : A Novel', 1.0], ['Swedenborg: Life and Teaching', 1.0], ['FranzÃ?Â¶sische Gedichte / Poemes francais.', 1.0], ['Alice at 80', 1.0], ['The Path: The Inner Life of Jesus Christ', 1.0], ["Woman's Book of Soul", 1.0]]]


In [None]:


def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

You haven't passed yet. Keep trying!
