In [1]:
cd /content/drive/MyDrive/AAA_MLIP

/content/drive/MyDrive/AAA_MLIP


# Import necessary libraries

In [13]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [11]:
# Load the book catalogue from the CSV in the current working directory.
df = pd.read_csv(
    'books_1.csv',
)

In [12]:
# Peek at the first record to check which columns were loaded.
df.head(n=1)

Unnamed: 0,Title,Author,Year,Genres,Summary,cover_image
0,The Plague,Albert Camus,1947,"Existentialism, Fiction, Absurdist fiction, Novel",The text of The Plague is divided into five p...,http://books.google.com/books/content?id=KVGd-...


# Functions

In [19]:
def initialize_tfidf_vectorizer(corpus):
    """
    Initialize the TF-IDF Vectorizer and fit it to the corpus.

    Missing documents (None / NaN / pd.NA) are replaced by empty strings
    before fitting, so one missing entry does not make the fit fail and the
    rows of the returned matrix stay aligned one-to-one with the corpus.

    Parameters:
        corpus (iterable): The text documents, e.g. a list of strings or a
            pandas Series such as df['Summary'].

    Returns:
        tuple: (tfidf_vectorizer, tfidf_matrix) where tfidf_vectorizer is the
            fitted TfidfVectorizer and tfidf_matrix is the result of
            fit_transform on the corpus (one row per document).
    """
    # Keep every position (do not drop missing rows): callers index the
    # source DataFrame positionally with row numbers from this matrix.
    documents = ["" if pd.isna(doc) else doc for doc in corpus]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    return tfidf_vectorizer, tfidf_matrix


In [23]:
def find_similar_books(user_input, tfidf_vectorizer, tfidf_matrix, df, top_n=5):
    """
    Find the top N similar books based on the user input and TF-IDF matrix.

    Parameters:
        user_input (str): The user's query.
        tfidf_vectorizer (TfidfVectorizer): The fitted TF-IDF Vectorizer.
        tfidf_matrix (sparse matrix): The TF-IDF matrix; row i must correspond
            to the i-th row (by position) of df.
        df (DataFrame): The DataFrame containing book details.
        top_n (int): The number of similar books to return. Default is 5.
            Values <= 0 yield an empty result; values larger than the number
            of books return all books.

    Returns:
        DataFrame: The top N similar books (columns Title, Author, Year,
            cover_image, Summary), most similar first. Books with equal
            similarity keep their original relative order.
    """
    result_columns = ['Title', 'Author', 'Year', 'cover_image', 'Summary']

    # Guard against non-positive top_n: a slice like [-0:] would select
    # *every* row and a negative value would select an arbitrary tail.
    if top_n <= 0:
        return df.iloc[0:0][result_columns]

    # Transform the user input to match the learned TF-IDF vocabulary
    user_input_tfidf = tfidf_vectorizer.transform([user_input])

    # Calculate cosine similarity between the user input and all book summaries
    cosine_similarities = cosine_similarity(user_input_tfidf, tfidf_matrix).flatten()

    # Sort descending with a stable sort so ties are ordered deterministically
    # (lower row position first), then keep the first top_n positions.
    top_indices = np.argsort(-cosine_similarities, kind='stable')[:top_n]

    # Fetch the details of the top N similar books (positional lookup)
    similar_books = df.iloc[top_indices]

    return similar_books[result_columns]

# Main

In [24]:
# Demo: recommend books whose summaries best match a free-text query.

# Fit the TF-IDF model on the summary of every book in the catalogue
tfidf_vectorizer, tfidf_matrix = initialize_tfidf_vectorizer(corpus=df['Summary'])

# Free-text description of what the reader is looking for
user_input = "A story about the impact of war on soldiers, highlighting their struggles."

# Rank the catalogue against the query (default top_n of 5) and show the result
similar_books = find_similar_books(
    user_input=user_input,
    tfidf_vectorizer=tfidf_vectorizer,
    tfidf_matrix=tfidf_matrix,
    df=df,
)
similar_books

Unnamed: 0,Title,Author,Year,cover_image,Summary
1064,The March,E. L. Doctorow,2005,http://books.google.com/books/content?id=bKC5i...,"Published in 2005 by E.L. Doctorow, The March..."
279,Diary of an Ordinary Woman,Margaret Forster,2003-03-06,http://books.google.com/books/content?id=TKZaA...,"From the age of thirteen, on the eve of the G..."
1,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,http://books.google.com/books/content?id=pgvqs...,"The book tells the story of Paul Bäumer, a Ge..."
1060,Waiting for the Barbarians,John Maxwell Coetzee,1980-10-27,http://books.google.com/books/content?id=x09pD...,The story is set in a small frontier town und...
243,Lies and the Lying Liars Who Tell Them,Al Franken,2003,http://books.google.com/books/content?id=xqFPE...,Lies and the Lying Liars Who Tell Them largel...


In [5]:
def convert_genres_to_one_hot(df, genre_column='Genres'):
    """
    Convert the specified genre column into one-hot encoding and update the DataFrame.

    The genre column is expected to hold comma-separated genre names. Any
    whitespace around the commas (or around the whole string) is ignored, so
    "Fiction, Novel", "Fiction,Novel" and " Fiction ,  Novel " all produce the
    same two indicator columns. Rows with a missing genre get 0 in every
    indicator column. The input DataFrame is not modified.

    Parameters:
        df (DataFrame): The DataFrame containing the genre column.
        genre_column (str): The name of the genre column to convert. Default is 'Genres'.

    Returns:
        DataFrame: A new DataFrame with the genre column removed and one 0/1
            column per distinct genre appended (genre columns sorted by name).
    """
    # Normalise the separator first: get_dummies matches the separator
    # literally, so "A,B" would otherwise become one fused "A,B" genre.
    normalized_genres = (
        df[genre_column]
        .str.strip()
        .str.replace(r'\s*,\s*', ',', regex=True)
    )

    # Split genres into separate columns using one-hot encoding
    genres_one_hot = normalized_genres.str.get_dummies(sep=',')

    # Append the indicator columns and drop the original genre column,
    # returning a new frame instead of mutating in place.
    return pd.concat([df, genres_one_hot], axis=1).drop(columns=[genre_column])

In [6]:
# Replace the 'Genres' text column with one-hot genre columns.
# The guard keeps this cell safe to re-run: after the first run 'Genres' no
# longer exists, and converting again would raise a KeyError.
if 'Genres' in df.columns:
    df = convert_genres_to_one_hot(df)

In [10]:
# Number of columns after one-hot encoding (second entry of the frame's shape).
df.shape[1]

153