In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2023-07-25 19:26:06--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.2’


2023-07-25 19:26:09 (8.40 MB/s) - ‘book-crossings.zip.2’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
# Book catalogue: keep only ISBN, title and author, all read as strings.
# header=0 together with names= discards the file's own header row and
# substitutes the short column names below.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict(isbn='str', title='str', author='str'),
)

# Ratings: user / ISBN / rating triples, with compact numeric dtypes.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype=dict(user='int32', isbn='str', rating='float32'),
)

In [4]:
# Show (rows, columns) of both raw frames, one per line.
print(df_books.shape, df_ratings.shape, sep="\n")

(271379, 3)
(1149780, 3)


In [5]:
# Number of ratings submitted per user and received per ISBN.
user_review_counts = df_ratings['user'].value_counts()
book_review_counts = df_ratings['isbn'].value_counts()

# Keep a rating only when its user has more than 100 ratings AND its book
# has more than 10 ratings.
filtered_ratings = df_ratings.loc[
    df_ratings['user'].isin(user_review_counts.loc[user_review_counts > 100].index)
    & df_ratings['isbn'].isin(book_review_counts.loc[book_review_counts > 10].index)
]

# Attach title/author to each surviving rating (inner join on ISBN).
final_data = pd.merge(filtered_ratings, df_books, on='isbn')

In [7]:
# Display the merged ratings + book metadata frame.
final_data

Unnamed: 0,user,isbn,rating,title,author
0,276925,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,10030,002542730X,7.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
...,...,...,...,...,...
273476,252695,0140445285,0.0,Crime and Punishment (Penguin Classics),Fyodor Dostoyevsky
273477,264903,3789129402,10.0,"Ronja, RÃ?Â¤ubertochter. ( Ab 10 J.).",Astrid Lindgren
273478,268030,0141182679,0.0,On the Road (Penguin Modern Classics),Jack Kerouac
273479,276018,3498044761,0.0,Der TÃ?Â¤nzer.,Colum McCann


In [8]:
# Title column of the merged frame, then the first occurrence of each
# distinct title (original row labels are preserved).
titles = final_data.loc[:, 'title']
tr = titles[~titles.duplicated()]
tr

0         Politically Correct Bedtime Stories: Modern Ta...
102           Sushi for Beginners : A Novel (Keyes, Marian)
118               Wasted : A Memoir of Anorexia and Bulimia
130                               La casa de los espÃ­ritus
138                                     The Music of Chance
                                ...                        
273474    Bias : A CBS Insider Exposes How the Media Dis...
273477                Ronja, RÃ?Â¤ubertochter. ( Ab 10 J.).
273478                On the Road (Penguin Modern Classics)
273479                                       Der TÃ?Â¤nzer.
273480                                            Mondlaub.
Name: title, Length: 13629, dtype: object

In [9]:
# Title x user matrix of ratings. pivot_table's default aggregation (mean)
# combines multiple ratings that land in the same title/user cell; cells with
# no rating are filled with 0.
table = final_data.pivot_table(values='rating', index='title', columns='user').fillna(0)
table

user,254,507,882,1424,1435,1733,1903,2033,2110,2276,...,276463,276538,276680,276925,277427,277478,277639,278137,278188,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback)),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Q-Space (Star Trek The Next Generation, Book 47)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Q-Zone (Star Trek The Next Generation, Book 48)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
!Yo!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iI Paradiso Degli Orchi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
one hundred years of solitude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stardust,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# (number of titles, number of users) in the pivot table.
table.shape

(28681, 1822)

In [10]:
# Summarise the pivot table: index range, column count, dtypes and memory usage.
table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28681 entries,  Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth to Â¿QuÃ© me quieres, amor?
Columns: 1822 entries, 183 to 278418
dtypes: float32(1822)
memory usage: 199.6+ MB


In [10]:
import sqlite3


In [11]:
# Persist the de-duplicated titles into the `table_name` table of
# my_database.db, replacing any existing table of that name.
conn = sqlite3.connect('my_database.db')
try:
    tr.to_sql('table_name', conn, if_exists='replace', index=False, dtype={'title': 'TEXT'})
finally:
    # Release the database handle even when the write raises.
    conn.close()

In [12]:
def get_db_connection(db_path='my_database.db'):
    """Open a SQLite connection.

    Args:
        db_path: Database to open. Defaults to 'my_database.db', so existing
            zero-argument calls behave as before. Any value accepted by
            ``sqlite3.connect`` (e.g. ':memory:') may be passed.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.
    """
    return sqlite3.connect(db_path)

def get_book_titles():
    """Return the contents of the ``title`` column of ``table_name``.

    Opens a connection via ``get_db_connection()``, runs the query, and
    always closes the connection, including when the query raises.

    Returns:
        A list holding the first column of every row returned by the query,
        in the order the database returned them.
    """
    conn = get_db_connection()
    try:
        rows = conn.execute('SELECT title FROM table_name').fetchall()
    finally:
        # Close on both success and failure so the handle never leaks.
        conn.close()
    return [row[0] for row in rows]

# Read the stored titles back from SQLite as a plain Python list.
titles2 = get_book_titles()

In [85]:
from fuzzywuzzy import fuzz, process




In [13]:
# Brute-force-or-better (algorithm='auto') k-NN over the rows of `table`,
# using cosine distance and returning 6 neighbours per row.
nbrs = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=6).fit(table)

# kneighbors yields (distances, row positions). NOTE: despite its name,
# `similarities` holds cosine *distances*; it is converted just below.
similarities, indices = nbrs.kneighbors(table)

# Cosine similarity = 1 - cosine distance.
closeness = 1 - similarities

print(closeness)
print(indices)

[[1.         0.76450956 0.71736455 0.68552434 0.64358056 0.63303417]
 [0.9999999  0.6154574  0.6154574  0.6154574  0.6154574  0.57143784]
 [1.         1.         1.         0.80582297 0.7432942  0.6902685 ]
 ...
 [0.99999994 0.5763904  0.5763904  0.5763904  0.5763904  0.5763904 ]
 [0.99999994 0.45145118 0.42161417 0.42062485 0.41401422 0.39517713]
 [1.         0.7624929  0.7624929  0.5674536  0.50210506 0.48074287]]
[[    0 13029  8497  6629  4070  6670]
 [    1   802  6666 11644 13065  2482]
 [ 7929 13521     2  4232  7263  9227]
 ...
 [13626 12895 11658  5890   511  8617]
 [13627  6430  8046  2541  5714  5235]
 [13628  8698  2975  2604  6253  5338]]


In [14]:
def get_recommends(book = ""):
  """Look up the precomputed nearest neighbours of a book title.

  Reads the module-level ``table`` (rows indexed by title) and ``indices``
  (for each row of ``table``, the row positions of its neighbours).

  Args:
    book: Title to look up; must match an entry of ``table.index`` exactly.

  Returns:
    A list wrapping a single inner list of titles, in the order stored in
    ``indices``. Nothing filters out the queried title itself, so it
    usually appears in the result.

  Raises:
    IndexError: If ``book`` does not occur in ``table.index``.
  """
  matches = np.where(table.index == book)[0]
  if matches.size == 0:
    # Same exception type the bare [0][0] lookup produced, but with a
    # message that names the missing title.
    raise IndexError("book title not found in table: {!r}".format(book))

  # First match wins when a title occurs more than once.
  neighbour_rows = indices[matches[0]]
  return [[table.index[row] for row in neighbour_rows]]

In [15]:
# Try the recommender on one title (it must exist in `table.index`).
books = get_recommends("Anna Karenina")
print(books)

[['Anna Karenina', "Cleveland Amory's Compleat Cat: The Cat Who Came for Christmas : The Cat and the Curmudgeon, the Best Cat Ever", 'Cosmos', 'The Red Badge of Courage', 'Tunnel Vision (V.I. Warshawski Novels (Hardcover))', 'The Lost Continent: Travels in Small Town America']]


In [16]:
import pickle
# Serialise the pivot table and neighbour arrays. Each file is opened in a
# `with` block so it is flushed and closed as soon as the dump finishes.
with open('table.pkl', 'wb') as fh:
    pickle.dump(table, fh)
with open('indices.pkl', 'wb') as fh:
    pickle.dump(indices, fh)
with open('closeness.pkl', 'wb') as fh:
    pickle.dump(closeness, fh)

In [18]:
# Second read of the books file, this time keeping seven columns, all as
# strings. header=0 with names= replaces the file's header row with the
# short names below.
booksdb = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['isbn', 'title', 'author', 'y', 'p', 'img2', 'img'],
    usecols=['isbn', 'title', 'author', 'y', 'p', 'img2', 'img'],
    dtype=dict(isbn='str', title='str', author='str', y='str', p='str', img2='str', img='str'),
)

In [19]:
# Serialise the book metadata; `with` closes the file once the dump completes.
with open('booksdb.pkl', 'wb') as fh:
    pickle.dump(booksdb, fh)

In [46]:
# Serialise the title list; `with` closes the file once the dump completes.
with open('titles.pkl', 'wb') as fh:
    pickle.dump(titles2, fh)

In [None]:
# Display the book metadata frame.
booksdb