<a href="https://colab.research.google.com/github/virendrasinh734/book_recommendation/blob/main/recommend_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [None]:
# Load the book catalogue and the ratings from the semicolon-separated,
# ISO-8859-1 encoded CSV files. header=0 together with `names` replaces the
# header row of each file with the short column names used in this notebook.
# The paths come from books_filename / ratings_filename so the file names are
# defined in exactly one place.
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [None]:
# Report (rows, columns) of the books frame, then of the ratings frame.
for loaded_frame in (df_books, df_ratings):
    print(loaded_frame.shape)

In [None]:
# Activity thresholds: a user / book is kept only when its number of ratings
# is strictly greater than the corresponding value.
MIN_RATINGS_PER_USER = 100
MIN_RATINGS_PER_BOOK = 10

# Number of ratings per user and per ISBN (each computed once).
user_review_counts = df_ratings['user'].value_counts()
book_review_counts = df_ratings['isbn'].value_counts()

popular_users = user_review_counts[user_review_counts > MIN_RATINGS_PER_USER].index
popular_books = book_review_counts[book_review_counts > MIN_RATINGS_PER_BOOK].index

# Keep only ratings where both the user and the book pass their threshold.
filtered_ratings = df_ratings[
    (df_ratings['user'].isin(popular_users)) &
    (df_ratings['isbn'].isin(popular_books))
]

# Attach title/author; the default inner merge drops ratings whose ISBN is
# not present in df_books.
final_data = filtered_ratings.merge(df_books, on='isbn')
final_data


In [None]:
# Title column of the merged ratings, and the same column with repeated
# titles removed (first occurrence kept).
titles = final_data['title']
tr = titles[~titles.duplicated()]
tr

In [None]:
# Title x user matrix of ratings (pivot_table averages duplicate
# title/user pairs by default); missing combinations are filled with 0.
table = final_data.pivot_table(
    values='rating',
    index='title',
    columns='user',
).fillna(0)
table

In [None]:
# Map every title in the pivot table to the list of ISBNs that df_books
# records under exactly that title.
book_titles = table.index.tolist()
print(len(book_titles))

# A single filtered + grouped pass over df_books replaces one full-frame scan
# per title. Row order inside each group follows df_books, so each ISBN list
# keeps the catalogue order.
wanted_rows = df_books[df_books['title'].isin(book_titles)]
isbns_by_title = wanted_rows.groupby('title', sort=False)['isbn'].apply(list).to_dict()

# Titles without any matching catalogue row map to an empty list.
title_to_isbn = {title: isbns_by_title.get(title, []) for title in book_titles}
# isbn_list holds the same list objects, in book_titles order.
isbn_list = list(title_to_isbn.values())

print(len(isbn_list))
print(len(list(title_to_isbn.keys())))
# with open("isbn_names.txt","w") as file:
#     for item in isbn_list:
#         file.write(f"{item}\n")

In [None]:
import json

# Serialise the title -> ISBN-list mapping and write it out as JSON.
with open('title_to_isbn_mapping.json', mode='w') as json_file:
    json_file.write(json.dumps(title_to_isbn))


In [None]:
# A bare expression is only displayed when it is the last statement of a
# cell, so the shape has to be printed explicitly to be visible here.
print(table.shape)
# Column dtypes and memory usage of the pivot table.
table.info()

In [None]:
import sqlite3

# Connect to the SQLite database (create a new database if it doesn't exist)
conn = sqlite3.connect('my_database.db')
try:
    # Write the de-duplicated titles into the table 'titles' (single TEXT
    # column 'title'), replacing any table of that name left by a previous run.
    tr.to_sql('titles', conn, if_exists='replace', index=False, dtype={'title': 'TEXT'})
finally:
    # Close the connection even when the write fails.
    conn.close()

In [None]:
def get_db_connection():
    """Open and return a new sqlite3 connection to 'my_database.db'.

    The caller is responsible for closing the returned connection.
    """
    conn = sqlite3.connect('my_database.db')
    return conn

def get_book_titles():
    """Return every value of the 'title' column of the 'titles' table.

    'titles' is the table this notebook writes with ``to_sql('titles', ...)``.

    Returns:
        list: one entry per row, in the order the rows are returned by SQLite.

    Raises:
        sqlite3.OperationalError: if the 'titles' table does not exist.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT title FROM titles')
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Release the connection even when the query fails.
        conn.close()
# Store the list returned by get_book_titles() (titles read from the SQLite database).
titles2 = get_book_titles()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


# Pairwise cosine similarity between the rows (books) of the rating table.
cosine_sim_matrix = cosine_similarity(table, table)

# Row position of the reference book within `table`.
book_A_index = 2330
similarities_to_book_A = cosine_sim_matrix[book_A_index]

# Row positions ordered from most to least similar.
sorted_indices = similarities_to_book_A.argsort()[::-1]

# Drop book A itself with a boolean mask, then keep the first five positions.
not_book_A = sorted_indices != book_A_index
top_similar_books = list(sorted_indices[not_book_A][:5])

print(top_similar_books)
print(cosine_sim_matrix.shape)


In [None]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import json

# Directory containing the per-book description text files.
data_dir = "./ddeess"

# Mapping parsed from ind_to_book.json; it is indexed below by the numeric
# file-name stem (as a string) to obtain a book title.
with open("ind_to_book.json", mode="r", encoding="utf-8") as json_file:
    ind_to_book = json.loads(json_file.read())

def preprocess_text(text):
    """Lower-case ``text`` and split it on whitespace into a list of tokens."""
    return text.lower().split()

# Number of description files read from data_dir: "0.txt" .. "13628.txt".
# Defined once so the file list and the vector lookup below cannot drift apart.
num_descriptions = 13629

documents = []
file_paths = [os.path.join(data_dir, f"{i}.txt") for i in range(num_descriptions)]

for file_path in file_paths:
    # Read the description and release the file handle before tokenising.
    with open(file_path, "r", encoding="utf-8") as file:
        description = file.read()
    tokens = preprocess_text(description)
    # The file-name stem ("0", "1", ...) is both the document tag and the key
    # into ind_to_book.
    tag = os.path.basename(file_path).split(".")[0]
    book_title = ind_to_book[tag]
    # The whole title is appended as one single token (not lower-cased or split).
    # NOTE(review): confirm this is intended rather than adding the title's words.
    tokens.append(book_title)
    documents.append(TaggedDocument(words=tokens, tags=[tag]))

# Train a Doc2Vec model on the tagged descriptions.
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=30)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# One learned vector per document, in numeric tag order.
book_vectors = [model.dv[f"{i}"] for i in range(num_descriptions)]

# Pairwise cosine similarity between the document vectors.
cosine_mat = cosine_similarity(book_vectors, book_vectors)

print(cosine_mat.shape)


In [None]:
import numpy as np

# np.asarray returns the input unchanged when it is already an ndarray, which
# avoids duplicating two large similarity matrices in memory.
cosine_sim_matrix_user = np.asarray(cosine_sim_matrix)
cosine_sim_matrix_description = np.asarray(cosine_mat)

# Fail early with a clear message instead of relying on NumPy broadcasting.
if cosine_sim_matrix_user.shape != cosine_sim_matrix_description.shape:
    raise ValueError(
        "Similarity matrices must have the same shape to be combined: "
        f"{cosine_sim_matrix_user.shape} (user ratings) vs "
        f"{cosine_sim_matrix_description.shape} (descriptions)"
    )

# Relative contribution of each similarity source to the blended score.
weight_user_reviews = 0.7
weight_book_description = 0.3

# Element-wise weighted sum of the two matrices.
# NOTE(review): this assumes position i refers to the same book in both
# matrices - confirm that the ind_to_book numbering matches the row order of
# `table`.
combined_cosine_sim_matrix = (weight_user_reviews * cosine_sim_matrix_user) + (weight_book_description * cosine_sim_matrix_description)


In [None]:
import pickle
# Persist the blended similarity matrix to disk with pickle.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('combined_cosine_sim_matrix.pkl', mode='wb') as file:
    pickle.Pickler(file).dump(combined_cosine_sim_matrix)


In [None]:
# import pickle
# pickle.dump(table,open('./pickle_files/table.pkl','wb'))
# pickle.dump(cosine_sim_matrix,open('./pickle_files/cosine_sim.pkl','wb'))
# def get_recommends(book):
#   ind=np.where(table.index==book)[0][0]
#   temp=indices[ind]
#   rc=[]
#   for i in range(len(temp)):

#     b=table.iloc[temp[i]].name
#     # temp_df=df_books[df_books['title']==b]
#     # temp_df.drop_duplicates('title')['title']
#     s=closeness[ind][i]
#     t2=[b]
#     rc.append(b)
#   recommended_books=[rc]
#   return recommended_books
# books = get_recommends("Anna Karenina")
# print(books)
# import pickle
# pickle.dump(table,open('./pickle_files/table.pkl','wb'))
# pickle.dump(indices,open('./pickle_files/indices.pkl','wb'))
# pickle.dump(closeness,open('./pickle_files/closeness.pkl','wb'))
# booksdb = pd.read_csv(
#     books_filename,
#     encoding = "ISO-8859-1",
#     sep=";",
#     header=0,
#     names=['isbn', 'title', 'author','y','p','img2','img'],
#     usecols=['isbn', 'title', 'author','y','p','img2','img'],
#     dtype={'isbn': 'str', 'title': 'str', 'author': 'str','y':'str','p':'str','img2':'str','img':'str'})
# # pickle.dump(booksdb,open('booksdb.pkl','wb'))
# # pickle.dump(titles2,open('titles.pkl','wb'))
# # booksdb

In [None]:
# import requests
# from bs4 import BeautifulSoup
# import os
# import time


# output_directory = "filess"

# if not os.path.exists(output_directory):
#     os.makedirs(output_directory)

# def fetch_book_description(isbn):
#     url = f"https://openlibrary.org/isbn/{isbn}"

#     response = requests.get(url)

#     # Check if the request was successful (status code 200)
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')

#         description_element = soup.find('div', {'class': 'book-description'})

#         if description_element:
#             description = description_element.get_text(strip=True)
#             return description

#     return None
# def descextract(isbn_list):
#     for isbn in isbn_list:
#         description = fetch_book_description(isbn)
#         if description:
#             filename = f"{isbn}.txt"
#             with open(os.path.join(output_directory, filename), "w", encoding="utf-8") as file:
#                 file.write(description)
#                 print(f"Saved description for ISBN {isbn} to {filename}")
#         else:
#             print(f"Description not found for ISBN {isbn}")
#         time.sleep(0.7)

#     print("Done")
