In [56]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import random

In [57]:
df = pd.read_csv("Books.csv")
df.head()

  df = pd.read_csv("Books.csv")


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [59]:
# Check for missing values in the dataframe
print(df.isnull().sum())

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


In [60]:
# Remove null and duplicate values
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [61]:
# Check data types for column 3
print(df["Year-Of-Publication"].apply(type).value_counts())

Year-Of-Publication
<class 'int'>    205820
<class 'str'>     65533
Name: count, dtype: int64


In [62]:
string_years = df[df["Year-Of-Publication"].apply(lambda x: isinstance(x, str))]

# Display only the 'Year-Of-Publication' column for string values
print(string_years["Year-Of-Publication"])

196608    2000
196609    1995
196610    1999
196611    2004
196612    2003
          ... 
262139    1986
262140    1987
262141    1994
262142    1993
262143    1991
Name: Year-Of-Publication, Length: 65533, dtype: object


In [63]:
# Convert 'Year-Of-Publication' from string to int
df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(int)

In [64]:
print(df["Year-Of-Publication"].apply(type).value_counts())

Year-Of-Publication
<class 'int'>    271353
Name: count, dtype: int64


In [65]:
# Discard the ISBN, publisher and medium/large image-URL columns.
df = df.drop(columns=['ISBN', 'Image-URL-M', 'Image-URL-L', 'Publisher'])

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271353 entries, 0 to 271359
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Book-Title           271353 non-null  object
 1   Book-Author          271353 non-null  object
 2   Year-Of-Publication  271353 non-null  int64 
 3   Image-URL-S          271353 non-null  object
dtypes: int64(1), object(3)
memory usage: 10.4+ MB


In [67]:
# Clean the titles to lowercase and remove duplicates
def clean_and_remove_duplicates(df):
    """Return a new frame with one row per case-insensitive book title.

    A ``Cleaned-Title`` column holding the lower-cased ``Book-Title`` is added
    to the returned frame, and only the first row for each cleaned title is
    kept. The frame passed in is left untouched.

    Args:
        df: DataFrame with a ``Book-Title`` column of strings.

    Returns:
        A new DataFrame carrying the extra ``Cleaned-Title`` column, with rows
        whose cleaned title repeats an earlier row removed. Index labels of
        the surviving rows are preserved (the index is not reset).

    Raises:
        KeyError: If ``df`` has no ``Book-Title`` column.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a column as a side effect of calling this function.
    with_cleaned_title = df.assign(**{'Cleaned-Title': df['Book-Title'].str.lower()})

    # drop_duplicates keeps the first occurrence of every cleaned title.
    return with_cleaned_title.drop_duplicates(subset='Cleaned-Title')

# Apply this function to clean the dataset
df = clean_and_remove_duplicates(df)

In [68]:
def clean_title(title):
    """Normalise a title for lookup.

    The title is lower-cased and every character that is neither alphanumeric
    nor whitespace is dropped; whitespace itself is kept exactly as given.

    Args:
        title: The title string to normalise.

    Returns:
        The normalised string.
    """
    def _is_kept(ch):
        # Letters, digits and any whitespace survive; punctuation does not.
        return ch.isalnum() or ch.isspace()

    return ''.join(filter(_is_kept, title.lower()))

In [69]:
# Combine Book-Title and Book-Author
df['Combined'] = df['Book-Title'] + ' ' + df['Book-Author']

In [70]:
def default_case(df):
    """Normalise the ``Combined`` text column of ``df`` in place.

    Each value is lower-cased and stripped of every character that is neither
    alphanumeric nor whitespace. The check is Unicode-aware, so accented
    letters such as "í" in "garcía" are kept instead of being deleted; this is
    the same rule ``clean_title`` applies to a single string.

    Args:
        df: DataFrame with a ``Combined`` column of strings. The column is
            overwritten on this very object.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If ``df`` has no ``Combined`` column.
    """
    def _keep_alnum_and_space(text):
        # str.isalnum()/str.isspace() accept non-ASCII letters and digits,
        # unlike an a-z0-9 character class.
        return ''.join(ch for ch in text if ch.isalnum() or ch.isspace())

    lowered = df['Combined'].str.lower()
    # na_action='ignore' passes missing values through untouched, matching
    # how the .str accessor treats them.
    df['Combined'] = lowered.map(_keep_alnum_and_space, na_action='ignore')
    return df

df = default_case(df)
df.head()

Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Image-URL-S,Cleaned-Title,Combined
0,Classical Mythology,Mark P. O. Morford,2002,http://images.amazon.com/images/P/0195153448.0...,classical mythology,classical mythology mark p o morford
1,Clara Callan,Richard Bruce Wright,2001,http://images.amazon.com/images/P/0002005018.0...,clara callan,clara callan richard bruce wright
2,Decision in Normandy,Carlo D'Este,1991,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,decision in normandy carlo deste
3,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,http://images.amazon.com/images/P/0374157065.0...,flu: the story of the great influenza pandemic...,flu the story of the great influenza pandemic ...
4,The Mummies of Urumchi,E. J. W. Barber,1999,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,the mummies of urumchi e j w barber


In [71]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Combined'])

In [72]:
def get_top_n_recommendations(title, n):
    """Return up to ``n`` "Title - Author" strings for the books most similar to ``title``.

    Uses the notebook-level ``df`` (book metadata) and ``X`` (the TF-IDF matrix
    fitted on ``df['Combined']``). Row ``i`` of ``X`` is taken to describe
    ``df.iloc[i]``, so all look-ups below are positional, never by index label.

    Args:
        title: Title of the book to start from. Matching ignores case; when
            that finds nothing, punctuation is ignored on both sides as well.
        n: Maximum number of recommendations to return.

    Returns:
        A list of distinct "Title - Author" strings ordered from most to least
        similar, excluding the queried book itself. Empty when ``n <= 0``.

    Raises:
        KeyError: If ``df`` lacks the 'Book-Title' or 'Book-Author' column.
        ValueError: If no book in ``df`` matches ``title``.
    """
    # Validate the columns before anything touches them.
    if 'Book-Title' not in df.columns or 'Book-Author' not in df.columns:
        raise KeyError("Required columns 'Book-Title' and 'Book-Author' are missing from the DataFrame.")

    def _positions(mask):
        # Row positions (not index labels) where the boolean mask is True.
        return mask.to_numpy(dtype=bool, na_value=False).nonzero()[0]

    # First try a plain case-insensitive match on the stored titles.
    hits = _positions(df['Book-Title'].str.lower() == title.lower())
    if hits.size == 0:
        # Fall back to comparing punctuation-free forms. The same cleaning is
        # applied to both sides, so titles that contain punctuation (e.g. a
        # colon) can still be found.
        title_cleaned = clean_title(title)
        hits = _positions(df['Book-Title'].map(clean_title, na_action='ignore') == title_cleaned)

    if hits.size == 0:
        raise ValueError(f"Title '{title}' not found in the dataset.")

    if n <= 0:
        return []

    # Position of the first matching row. df's index has gaps after the earlier
    # dropna/drop_duplicates steps, so a label would address the wrong row of X.
    title_pos = int(hits[0])

    # Cosine similarity between the queried book and every book in the matrix.
    cosine_similarities = cosine_similarity(X[title_pos], X).flatten()

    # Most similar first; the stable sort makes the order of ties deterministic.
    ranked_positions = (-cosine_similarities).argsort(kind='stable')

    titles = df['Book-Title'].to_numpy()
    authors = df['Book-Author'].to_numpy()

    # A dict keeps insertion order, so duplicates are dropped without losing
    # the similarity ranking (a set would scramble it).
    recommendations = {}
    for pos in ranked_positions:
        if pos == title_pos:
            # Skip the queried book explicitly rather than assuming it ranks first.
            continue
        recommendations.setdefault(f"{titles[pos]} - {authors[pos]}", None)
        if len(recommendations) == n:
            break

    return list(recommendations)


In [73]:
# Pick how many recommendations to request: an integer from 3 to 8 inclusive.
random_number = random.randrange(3, 9)

get_top_n_recommendations(title='Classical Mythology', n=random_number)

['The Penguin Dictionary of Classical Mythology (Reference Books) - Pierre Grimal',
 "Mythology and You : Classical Mythology and its Relevance in Today's World - Donna Rosenberg",
 "Crowell's Handbook of Classical Mythology (A Crowell reference book) - Edward Tripp",
 'Dictionary of Mythology: Mainly Classical - Bergen Evans',
 'Dracula (Classical Literature with Classical Music) - Brian Cox',
 'The Dictionary of Classical Mythology - JOHN EDWARD ZIMMERMAN']