In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import random

In [2]:
df = pd.read_csv("Books.csv", low_memory=False)
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [4]:
# Check for missing values in the dataframe
print(df.isnull().sum())

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64


In [5]:
# Remove null and duplicate values
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [6]:
# Check data types for column 3
print(df["Year-Of-Publication"].apply(type).value_counts())

Year-Of-Publication
<class 'str'>    271353
Name: count, dtype: int64


In [7]:
string_years = df[df["Year-Of-Publication"].apply(lambda x: isinstance(x, str))]

# Display only the 'Year-Of-Publication' column for string values
print(string_years["Year-Of-Publication"])

0         2002
1         2001
2         1991
3         1999
4         1999
          ... 
271355    1988
271356    1991
271357    2004
271358    1996
271359    2000
Name: Year-Of-Publication, Length: 271353, dtype: object


In [8]:
# Convert 'Year-Of-Publication' from string to int
df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(int)

In [9]:
print(df["Year-Of-Publication"].apply(type).value_counts())

Year-Of-Publication
<class 'int'>    271353
Name: count, dtype: int64


In [10]:
df = df.drop(['ISBN', 'Image-URL-M', 'Image-URL-L', 'Publisher'], axis=1)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271353 entries, 0 to 271359
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Book-Title           271353 non-null  object
 1   Book-Author          271353 non-null  object
 2   Year-Of-Publication  271353 non-null  int64 
 3   Image-URL-S          271353 non-null  object
dtypes: int64(1), object(3)
memory usage: 10.4+ MB


In [12]:
def clean_and_remove_duplicates(df):
    df['Cleaned-Title'] = df['Book-Title'].str.lower()
    df = df.drop_duplicates(subset='Cleaned-Title')
    return df

df = clean_and_remove_duplicates(df)

In [13]:
def clean_title(title):
    title = title.lower()
    title = ''.join(e for e in title if e.isalnum() or e.isspace())
    return title

In [14]:
df['Combined'] = df['Book-Title'] + ' ' + df['Book-Author']

In [15]:
def default_case(df):
    df['Combined'] = df['Combined'].str.lower()
    df['Combined'] = df['Combined'].str.replace(r'[^a-z0-9\s]', '', regex=True)
    return df

df = default_case(df)
df.head()

Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Image-URL-S,Cleaned-Title,Combined
0,Classical Mythology,Mark P. O. Morford,2002,http://images.amazon.com/images/P/0195153448.0...,classical mythology,classical mythology mark p o morford
1,Clara Callan,Richard Bruce Wright,2001,http://images.amazon.com/images/P/0002005018.0...,clara callan,clara callan richard bruce wright
2,Decision in Normandy,Carlo D'Este,1991,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,decision in normandy carlo deste
3,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,http://images.amazon.com/images/P/0374157065.0...,flu: the story of the great influenza pandemic...,flu the story of the great influenza pandemic ...
4,The Mummies of Urumchi,E. J. W. Barber,1999,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,the mummies of urumchi e j w barber


In [16]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Combined'])

In [17]:
def get_top_n_recommendations(title, n):
    title_cleaned = clean_title(title)
    
    matching_titles = df[df['Book-Title'].str.lower() == title_cleaned]
    
    if matching_titles.empty:
        raise ValueError(f"Title '{title}' not found in the dataset.")
    
    title_index = matching_titles.index[0]
    
    title_tfidf_vector = X[title_index]
    
    cosine_similarities = cosine_similarity(title_tfidf_vector, X).flatten()
    
    print(cosine_similarities)
    
    similar_indices = cosine_similarities.argsort()[-(n+2):-1][::-1]
    
    if 'Book-Title' not in df.columns or 'Book-Author' not in df.columns:
        raise KeyError("Required columns 'Book-Title' and 'Book-Author' are missing from the DataFrame.")
    
    # Fetch top n book titles and authors from similarity scores
    top_n_titles_authors = df[['Book-Title', 'Book-Author']].iloc[similar_indices]
    
    top_n_titles_authors = top_n_titles_authors.apply(lambda row: f"{row['Book-Title']} - {row['Book-Author']}", axis=1)
    
    top_n_titles_authors_list = top_n_titles_authors.tolist()
    
    top_n_titles_authors_list = list(set(top_n_titles_authors_list))[:n]
    
    return top_n_titles_authors_list


In [18]:
random_number = random.randint(3, 8)

get_top_n_recommendations('Classical Mythology', random_number)

[1. 0. 0. ... 0. 0. 0.]


["Crowell's Handbook of Classical Mythology (A Crowell reference book) - Edward Tripp",
 'Dictionary of Mythology: Mainly Classical - Bergen Evans',
 'Dictionary of mythology, mainly classical - Bergen Evans',
 'The Penguin Dictionary of Classical Mythology (Reference Books) - Pierre Grimal',
 "Mythology and You : Classical Mythology and its Relevance in Today's World - Donna Rosenberg",
 'Dracula (Classical Literature with Classical Music) - Brian Cox']

In [19]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [20]:
df.to_csv('Books_Data.csv', index=False)