# Step 1: Load and Explore the Data

In [1]:
import pandas as pd

# Source dataset: a CSV of book metadata hosted as a GitHub gist
url = "https://gist.githubusercontent.com/jaidevd/23aef12e9bf56c618c41/raw/c05e98672b8d52fa0cb94aad80f75eb78342e5d4/books_new.csv"

# Local copy of the dataset. The name is defined once so the file that is
# written and the file named in the status message can never drift apart.
LOCAL_CSV_PATH = 'books.csv'

# Download and parse the CSV straight from the URL (requires network access)
books_df = pd.read_csv(url)

# Save the dataframe locally; index=False keeps the RangeIndex out of the file
books_df.to_csv(LOCAL_CSV_PATH, index=False)

print(f"Data has been ingested and saved locally as '{LOCAL_CSV_PATH}'")


Data has been ingested and saved locally as 'books.csv'


In [2]:
# Summarise column names, dtypes and non-null counts to spot missing data
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     187 non-null    object
 2   Genre      211 non-null    object
 3   SubGenre   211 non-null    object
 4   Height     211 non-null    int64 
 5   Publisher  115 non-null    object
dtypes: int64(1), object(5)
memory usage: 10.0+ KB


In [3]:
# Peek at a few random rows; a fixed random_state makes the preview
# reproducible across "Restart Kernel -> Run All"
books_df.sample(5, random_state=42)

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
152,"Information, The","Gleick, James",science,mathematics,233,
64,Crime and Punishment,"Dostoevsky, Fyodor",fiction,classic,180,Penguin
117,One,"Bach, Richard",nonfiction,autobiography,172,Dell
210,"Christmas Carol, A","Dickens, Charles",fiction,classic,196,
81,Manasa,"Kale, V P",nonfiction,misc,213,Mauj


# Step 2: Data Preprocessing

In [4]:
# Fill missing values with empty strings so the string concatenation below
# never encounters NaN. Reassignment is used instead of inplace=True.
books_df = books_df.fillna('')

# Combine relevant features into a single space-separated string per book.
# Series.str.cat does the join column-wise instead of calling a Python
# lambda once per row; the resulting strings are the same.
books_df['combined_features'] = books_df['Title'].str.cat(
    books_df[['Author', 'Genre', 'SubGenre']], sep=' '
)

# Display the combined features
print(books_df['combined_features'].head())


0    Fundamentals of Wavelets Goswami, Jaideva tech...
1           Data Smart Foreman, John tech data_science
2    God Created the Integers Hawking, Stephen tech...
3    Superfreakonomics Dubner, Stephen science econ...
4          Orientalism Said, Edward nonfiction history
Name: combined_features, dtype: object


# Step 3: Content-Based Filtering using TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build a TF-IDF representation of each book's combined text, ignoring
# common English stop words
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(books_df['combined_features'])

# Pairwise cosine similarity between all books; when only one matrix is
# given, it is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)



# Step 4: Saving the Model

In [7]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; exist_ok=True keeps the cell re-runnable
os.makedirs('model', exist_ok=True)

# Save the vectorizer and cosine similarity matrix
joblib.dump(vectorizer, 'model/vectorizer.pkl')
joblib.dump(cosine_sim, 'model/cosine_sim.pkl')


['model/cosine_sim.pkl']

# Step 5: Loading the Models and Getting Recommendations

In [10]:
import joblib
import pandas as pd

# Load the vectorizer and cosine similarity matrix.
# NOTE: joblib files are pickle-based, so only load artifacts you produced
# yourself; unpickling untrusted files can execute arbitrary code.
vectorizer = joblib.load('model/vectorizer.pkl')
cosine_sim = joblib.load('model/cosine_sim.pkl')

# Load your dataset
df = pd.read_csv('books.csv')

# Row i of the similarity matrix must describe row i of the dataset; fail
# fast if the saved artifacts and the CSV have drifted apart.
assert cosine_sim.shape == (len(df), len(df)), (
    f"cosine_sim has shape {cosine_sim.shape} but the dataset has {len(df)} rows; "
    "re-run the earlier steps to regenerate the saved artifacts"
)

# Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value from the ``Title`` column of ``df``. If several rows
        share the title, the first matching row is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column ``i`` corresponds to row
        ``i`` of ``df``. The default is bound when the function is defined.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar books, most similar first.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``df['Title']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Get the index of the book that matches the title
    matches = df.index[df['Title'] == title]
    if len(matches) == 0:
        # IndexError matches what the bare `.index[0]` lookup used to raise,
        # so existing callers keep working, but now with a readable message.
        raise IndexError(f"No book titled {title!r} found in the dataset")
    idx = matches[0]

    # Pair every *other* book with its similarity to the queried book. The
    # queried book is excluded by index rather than by assuming it sorts
    # first, which is not guaranteed when another row ties for the top score.
    scored = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the books by similarity, highest first (stable, so ties keep
    # dataset order)
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Get the positions of the top_n most similar books
    book_indices = [i for i, _ in scored[:top_n]]

    # Return the titles of the most similar books
    return df['Title'].iloc[book_indices]

# Demo: list the books closest to a title from the dataset
print(get_recommendations(title='Fundamentals of Wavelets'))


58                                Learning OpenCV
154                Elements of Information Theory
207                  Image Processing with MATLAB
61            Principles of Communication Systems
60             Computer Vision, A Modern Approach
8      Image Processing & Mathematical Morphology
57                           Econometric Analysis
62                                       Let Us C
172                                 Pointers in C
21                                Analysis, Vol I
Name: Title, dtype: object
