Problem Statement:

"Build a book recommendation system that suggests books based on user preferences, such as favorite authors, genres, or book content. The system aims to help readers discover books with similar themes, styles, or tones, making it easier to find books they'll enjoy."

                 (or)

Problem Statement:

"Develop a content-based book recommendation system leveraging metadata such as author, genre, description, and user interactions. The system aims to compute similarities between books to suggest personalized recommendations that align with user preferences, facilitating the discovery of books with matching themes, styles, and tones."








In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [1]:
import os
import pandas as pd

# Define the path to your main directory
folder = "/content/drive/MyDrive/RECOMENDATION_SYSTEM_FILES"

# Create an empty list to store DataFrames
dataframes = []

# Walk through all subdirectories and files
for root, dirs, files in os.walk(folder):
    for filename in files:
        # Check if the file is a CSV file
        if filename.endswith(".csv"):
            filepath = os.path.join(root, filename)
            print(f"Reading file: {filepath}")

            # Read the CSV file and append it to the list
            df = pd.read_csv(filepath)
            dataframes.append(df)

# After the loop, concatenate all dataframes into one final dataframe
if dataframes:
    final_dataframe = pd.concat(dataframes, ignore_index=True)
    print("Data concatenated successfully!")
else:
    print("No CSV files found.")

# save the final dataframe to a new CSV file
final_dataframe.to_csv("/content/drive/MyDrive/final_dataframe_Recommendation_system.csv", index=False)


Reading file: /content/drive/MyDrive/RECOMENDATION_SYSTEM_FILES/book1100k-1200k.csv/book1100k-1200k.csv.csv
Reading file: /content/drive/MyDrive/RECOMENDATION_SYSTEM_FILES/book1200k-1300k.csv/book1200k-1300k.csv.csv
Reading file: /content/drive/MyDrive/RECOMENDATION_SYSTEM_FILES/book1000k-1100k.csv/book1000k-1100k.csv.csv
Data concatenated successfully!


In [3]:
final_dataframe.head(3)

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,Count of text reviews
0,1100003,The Prince,Niccolò Machiavelli,226500438,3.82,1998,1,9,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,,151,"The most famous book on politics ever written,...",11
1,1100004,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,822334836,4.29,2005,2,3,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,,384,Many of America’s greatest Protestant preacher...,1
2,1100007,The Last Sorcerer,Ethan Russo,789012707,4.0,2001,1,1,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,,368,,0


In [4]:
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125219 entries, 0 to 125218
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Id                     125219 non-null  int64  
 1   Name                   125219 non-null  object 
 2   Authors                125219 non-null  object 
 3   ISBN                   124857 non-null  object 
 4   Rating                 125219 non-null  float64
 5   PublishYear            125219 non-null  int64  
 6   PublishMonth           125219 non-null  int64  
 7   PublishDay             125219 non-null  int64  
 8   Publisher              124088 non-null  object 
 9   RatingDist5            125219 non-null  object 
 10  RatingDist4            125219 non-null  object 
 11  RatingDist3            125219 non-null  object 
 12  RatingDist2            125219 non-null  object 
 13  RatingDist1            125219 non-null  object 
 14  RatingDistTotal        125219 non-nu

In [5]:
final_dataframe.describe()

Unnamed: 0,Id,Rating,PublishYear,PublishMonth,PublishDay,CountsOfReview,pagesNumber,Count of text reviews
count,125219.0,125219.0,125219.0,125219.0,125219.0,125219.0,125219.0,125219.0
mean,1153445.0,3.401355,1998.823174,9.731239,6.334718,6.486715,266.847994,6.486715
std,86744.25,1.314769,52.011681,10.337081,3.58842,145.948794,301.112726,145.948794
min,1000000.0,0.0,208.0,1.0,1.0,0.0,0.0,0.0
25%,1078134.0,3.39,1995.0,1.0,3.0,0.0,144.0,0.0
50%,1156435.0,3.81,2001.0,4.0,6.0,1.0,240.0,1.0
75%,1228080.0,4.08,2005.0,18.0,9.0,3.0,336.0,3.0
max,1299999.0,5.0,20067.0,31.0,12.0,43652.0,51294.0,43652.0


In [6]:
#droping duplicates
final_dataframe.drop_duplicates() #No dupliactes

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,Count of text reviews
0,1100003,The Prince,Niccolò Machiavelli,0226500438,3.82,1998,1,9,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,,151,"The most famous book on politics ever written,...",11
1,1100004,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,0822334836,4.29,2005,2,3,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,,384,Many of America’s greatest Protestant preacher...,1
2,1100007,The Last Sorcerer,Ethan Russo,0789012707,4.00,2001,1,1,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,,368,,0
3,1100009,The Idea of a University,John Henry Newman,0300064055,4.12,1996,29,5,Yale University Press,5:286,4:207,3:96,2:38,1:12,total:639,6,,400,"Since its publication almost 150 years ago, <i...",6
4,1100010,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,0801857961,3.00,1997,18,11,Johns Hopkins University Press,5:0,4:1,3:2,2:1,1:0,total:4,0,,622,"Most religious traditions have a rich, if larg...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,1099974,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,0394910095,4.39,2007,23,1,Random House Books for Young Readers,5:745,4:201,3:145,2:39,1:23,total:1153,0,,144,A silly book with a serious purpose—to help ch...,0
125215,1099979,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,0375922768,4.19,2004,25,5,Random House Books for Young Readers,5:153,4:107,3:59,2:13,1:2,total:334,3,,48,The Cat and company travel by hot air balloon ...,3
125216,1099987,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,189020644X,5.00,2008,1,1,Mapin Publishing Pvt,5:3,4:0,3:0,2:0,1:0,total:3,0,,356,Situated on the famous Crescent Site near the ...,0
125217,1099989,Goodnight Moon,Margaret Wise Brown,0064430170,4.28,2007,23,1,HarperCollins,5:167387,4:67858,3:42052,2:11835,1:6265,total:295397,40,,32,<strong>In this classic of children's literatu...,40


# Data Mining

In [7]:
# Based on my problem statement removing useless features in dataframe .
#Id
#ISBN
# Language(language feature is >80% missing data so we directly removed. )

# Drop the columns 'Id', 'ISBN', and 'Language' from the dataframe
final_dataframe = final_dataframe.drop(['Id', 'ISBN', 'Language'], axis=1)

# Display the first few rows of the updated dataframe to verify the changes
display(final_dataframe.head())

# Optionally, display the remaining columns to ensure the specified columns are removed
display(final_dataframe.columns)


Unnamed: 0,Name,Authors,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews
0,The Prince,Niccolò Machiavelli,3.82,1998,1,9,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,151,"The most famous book on politics ever written,...",11
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,2005,2,3,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,384,Many of America’s greatest Protestant preacher...,1
2,The Last Sorcerer,Ethan Russo,4.0,2001,1,1,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,368,,0
3,The Idea of a University,John Henry Newman,4.12,1996,29,5,Yale University Press,5:286,4:207,3:96,2:38,1:12,total:639,6,400,"Since its publication almost 150 years ago, <i...",6
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.0,1997,18,11,Johns Hopkins University Press,5:0,4:1,3:2,2:1,1:0,total:4,0,622,"Most religious traditions have a rich, if larg...",0


Index(['Name', 'Authors', 'Rating', 'PublishYear', 'PublishMonth',
       'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4', 'RatingDist3',
       'RatingDist2', 'RatingDist1', 'RatingDistTotal', 'CountsOfReview',
       'pagesNumber', 'Description', 'Count of text reviews'],
      dtype='object')

In [8]:
print(final_dataframe.columns)


Index(['Name', 'Authors', 'Rating', 'PublishYear', 'PublishMonth',
       'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4', 'RatingDist3',
       'RatingDist2', 'RatingDist1', 'RatingDistTotal', 'CountsOfReview',
       'pagesNumber', 'Description', 'Count of text reviews'],
      dtype='object')


In [9]:
# # Combine into a single column as a datetime object
# final_dataframe['PublishDate'] = pd.to_datetime(final_dataframe[['PublishYear', 'PublishMonth', 'PublishDay']])

# # Drop the original columns if needed
# final_dataframe = final_dataframe.drop(['PublishYear', 'PublishMonth', 'PublishDay'], axis=1)

# # Display the resulting dataframe
# print(final_dataframe)

In [10]:
final_dataframe.PublishMonth.unique()

array([ 1,  2, 29, 18, 13, 15, 20, 12, 17, 22, 28,  7, 14, 24, 26,  9,  4,
        6,  5,  8, 21, 27, 10, 31, 23, 16, 30,  3, 11, 25, 19])

In [11]:
final_dataframe.PublishYear.unique()

array([ 1998,  2005,  2001,  1996,  1997,  2007,  1995,  2004,  2002,
        2003,  1994,  1964,  1988,  1990,  1992,  1962,  2010,  2008,
        1993,  1999,  1986,  1989,  2006,  1979,  2000,  1991,  2014,
        1984,  1982,  1981,  1987,  1973,  2012,  1975,  1977,  2013,
        2016,  1976,  2011,  1985,  1980,  1957,  1959,  1978,  1972,
        1983,  1971,  1965,  1966,  2017,  1974,  1969,  2009,  1950,
        2015,  1970,  1958,  1968,  1953,  2019,  1956,  2018,  1948,
        1960,  1928,  1951,  1914,  1925,  1955,  1944,  1963,  1967,
        1900,  1961,  1923,  1940,  2021,  1916,  1927,  1952,  1941,
        1905,  1949,   208,  1932,  1922,  1939,  1931,  1945,  1929,
        1943,  2020,  2030,  1899,  1936,  1942,  1901,  1954,  1947,
        1920,  1911,  1924, 20067,  1915,  1938,  1898,  1935,  1921,
        1930,  1946,  1937,  1907])

In [12]:
final_dataframe["PublishYear"]=final_dataframe["PublishYear"].replace({208:2008,20067:2006})

In [13]:
final_dataframe.PublishYear.unique()

array([1998, 2005, 2001, 1996, 1997, 2007, 1995, 2004, 2002, 2003, 1994,
       1964, 1988, 1990, 1992, 1962, 2010, 2008, 1993, 1999, 1986, 1989,
       2006, 1979, 2000, 1991, 2014, 1984, 1982, 1981, 1987, 1973, 2012,
       1975, 1977, 2013, 2016, 1976, 2011, 1985, 1980, 1957, 1959, 1978,
       1972, 1983, 1971, 1965, 1966, 2017, 1974, 1969, 2009, 1950, 2015,
       1970, 1958, 1968, 1953, 2019, 1956, 2018, 1948, 1960, 1928, 1951,
       1914, 1925, 1955, 1944, 1963, 1967, 1900, 1961, 1923, 1940, 2021,
       1916, 1927, 1952, 1941, 1905, 1949, 1932, 1922, 1939, 1931, 1945,
       1929, 1943, 2020, 2030, 1899, 1936, 1942, 1901, 1954, 1947, 1920,
       1911, 1924, 1915, 1938, 1898, 1935, 1921, 1930, 1946, 1937, 1907])

In [14]:
# Manually correct month values if needed (e.g., 13 becomes 3, 18 becomes 8, etc.)
final_dataframe['PublishMonth'] = final_dataframe['PublishMonth'].replace({
    13: 3, 14: 4, 15: 5, 16: 6, 17: 7, 18: 8, 19: 9, 20: 10,
    21: 11, 22: 12, 23: 12, 24: 12, 25: 12, 26: 12, 27: 12,
    28: 12, 29: 12, 30: 12, 31: 12
})

In [15]:
final_dataframe.PublishMonth.unique()

array([ 1,  2, 12,  8,  3,  5, 10,  7,  4,  9,  6, 11])

In [16]:
final_dataframe.PublishDay.unique()

array([ 9,  3,  1,  5, 11, 12,  7,  6,  4, 10,  8,  2])

In [17]:
import pandas as pd
import numpy as np

# Function to check for leap year
def is_leap_year(year):
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))

# Function to handle invalid days in leap years and month-day mismatch
def correct_publish_date(row):
    year = row['PublishYear']
    month = row['PublishMonth']
    day = row['PublishDay']

    # Handle invalid February 29 in non-leap years
    if month == 2 and day == 29 and not is_leap_year(year):
        return pd.NaT  # Invalid date, returning NaT (Not a Time)

    # Handle days greater than valid days for each month
    if month == 2 and day > 29:
        return pd.NaT
    elif month in [4, 6, 9, 11] and day > 30:
        return pd.NaT
    elif month in [1, 3, 5, 7, 8, 10, 12] and day > 31:
        return pd.NaT
    return pd.to_datetime(f"{year}-{month}-{day}", errors='coerce')  # Return valid date

# Apply the correction function to each row in the dataframe
final_dataframe['CorrectedPublishDate'] = final_dataframe.apply(correct_publish_date, axis=1)

# Optionally, drop rows with NaT values in the 'CorrectedPublishDate' column
final_dataframe = final_dataframe.dropna(subset=['CorrectedPublishDate'])

# Display the updated dataframe
print(final_dataframe[['PublishYear', 'PublishMonth', 'PublishDay', 'CorrectedPublishDate']].head())

# Optionally, save the updated dataframe to a new CSV file
# final_dataframe.to_csv('corrected_books_data.csv', index=False)


   PublishYear  PublishMonth  PublishDay CorrectedPublishDate
0         1998             1           9           1998-01-09
1         2005             2           3           2005-02-03
2         2001             1           1           2001-01-01
3         1996            12           5           1996-12-05
4         1997             8          11           1997-08-11


In [18]:
final_dataframe

Unnamed: 0,Name,Authors,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate
0,The Prince,Niccolò Machiavelli,3.82,1998,1,9,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,151,"The most famous book on politics ever written,...",11,1998-01-09
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,2005,2,3,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,384,Many of America’s greatest Protestant preacher...,1,2005-02-03
2,The Last Sorcerer,Ethan Russo,4.00,2001,1,1,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,368,,0,2001-01-01
3,The Idea of a University,John Henry Newman,4.12,1996,12,5,Yale University Press,5:286,4:207,3:96,2:38,1:12,total:639,6,400,"Since its publication almost 150 years ago, <i...",6,1996-12-05
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.00,1997,8,11,Johns Hopkins University Press,5:0,4:1,3:2,2:1,1:0,total:4,0,622,"Most religious traditions have a rich, if larg...",0,1997-08-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,4.39,2007,12,1,Random House Books for Young Readers,5:745,4:201,3:145,2:39,1:23,total:1153,0,144,A silly book with a serious purpose—to help ch...,0,2007-12-01
125215,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,4.19,2004,12,5,Random House Books for Young Readers,5:153,4:107,3:59,2:13,1:2,total:334,3,48,The Cat and company travel by hot air balloon ...,3,2004-12-05
125216,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,5.00,2008,1,1,Mapin Publishing Pvt,5:3,4:0,3:0,2:0,1:0,total:3,0,356,Situated on the famous Crescent Site near the ...,0,2008-01-01
125217,Goodnight Moon,Margaret Wise Brown,4.28,2007,12,1,HarperCollins,5:167387,4:67858,3:42052,2:11835,1:6265,total:295397,40,32,<strong>In this classic of children's literatu...,40,2007-12-01


In [19]:
# Drop the 'PublishYear', 'PublishMonth', and 'PublishDay' columns
final_dataframe = final_dataframe.drop(['PublishYear', 'PublishMonth', 'PublishDay'], axis=1)

# Display the updated dataframe
display(final_dataframe.head())


Unnamed: 0,Name,Authors,Rating,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate
0,The Prince,Niccolò Machiavelli,3.82,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,151,"The most famous book on politics ever written,...",11,1998-01-09
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,384,Many of America’s greatest Protestant preacher...,1,2005-02-03
2,The Last Sorcerer,Ethan Russo,4.0,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,368,,0,2001-01-01
3,The Idea of a University,John Henry Newman,4.12,Yale University Press,5:286,4:207,3:96,2:38,1:12,total:639,6,400,"Since its publication almost 150 years ago, <i...",6,1996-12-05
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.0,Johns Hopkins University Press,5:0,4:1,3:2,2:1,1:0,total:4,0,622,"Most religious traditions have a rich, if larg...",0,1997-08-11


In [20]:
final_dataframe["Description"].isnull().sum()

17204

In [21]:
final_dataframe.fillna("Unknown", inplace=True)

In [22]:
final_dataframe["Description"].isnull().sum()

0

In [23]:
final_dataframe

Unnamed: 0,Name,Authors,Rating,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate
0,The Prince,Niccolò Machiavelli,3.82,University of Chicago Press,5:72775,4:87000,3:65783,2:17926,1:5717,total:249201,11,151,"The most famous book on politics ever written,...",11,1998-01-09
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,Duke University Press Books,5:5,4:8,3:1,2:0,1:0,total:14,1,384,Many of America’s greatest Protestant preacher...,1,2005-02-03
2,The Last Sorcerer,Ethan Russo,4.00,Haworth Integrative Healing Press,5:1,4:0,3:1,2:0,1:0,total:2,0,368,Unknown,0,2001-01-01
3,The Idea of a University,John Henry Newman,4.12,Yale University Press,5:286,4:207,3:96,2:38,1:12,total:639,6,400,"Since its publication almost 150 years ago, <i...",6,1996-12-05
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.00,Johns Hopkins University Press,5:0,4:1,3:2,2:1,1:0,total:4,0,622,"Most religious traditions have a rich, if larg...",0,1997-08-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,4.39,Random House Books for Young Readers,5:745,4:201,3:145,2:39,1:23,total:1153,0,144,A silly book with a serious purpose—to help ch...,0,2007-12-01
125215,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,4.19,Random House Books for Young Readers,5:153,4:107,3:59,2:13,1:2,total:334,3,48,The Cat and company travel by hot air balloon ...,3,2004-12-05
125216,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,5.00,Mapin Publishing Pvt,5:3,4:0,3:0,2:0,1:0,total:3,0,356,Situated on the famous Crescent Site near the ...,0,2008-01-01
125217,Goodnight Moon,Margaret Wise Brown,4.28,HarperCollins,5:167387,4:67858,3:42052,2:11835,1:6265,total:295397,40,32,<strong>In this classic of children's literatu...,40,2007-12-01


In [24]:
# List of rating columns
rating_columns = ['Rating', 'RatingDist5', 'RatingDist4', 'RatingDist3', 'RatingDist2', 'RatingDist1']

# Create a new dataframe with only the rating columns
ratings_data = final_dataframe[rating_columns]

# Display the separated ratings data
print("Ratings data:")
display(ratings_data)

Ratings data:


Unnamed: 0,Rating,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1
0,3.82,5:72775,4:87000,3:65783,2:17926,1:5717
1,4.29,5:5,4:8,3:1,2:0,1:0
2,4.00,5:1,4:0,3:1,2:0,1:0
3,4.12,5:286,4:207,3:96,2:38,1:12
4,3.00,5:0,4:1,3:2,2:1,1:0
...,...,...,...,...,...,...
125214,4.39,5:745,4:201,3:145,2:39,1:23
125215,4.19,5:153,4:107,3:59,2:13,1:2
125216,5.00,5:3,4:0,3:0,2:0,1:0
125217,4.28,5:167387,4:67858,3:42052,2:11835,1:6265


In [25]:
# List of columns to process
rating_columns = ['RatingDist5', 'RatingDist4', 'RatingDist3', 'RatingDist2', 'RatingDist1']

# Replace ':' with '.' and convert to float
for col in rating_columns:
    final_dataframe[col] = final_dataframe[col].str.replace(':', '.', regex=False)  # Replace ':' with '.'
    final_dataframe[col] = pd.to_numeric(final_dataframe[col], errors='coerce')  # Convert to float

# Check the updated data types
print(final_dataframe[rating_columns].dtypes)

# Display the cleaned columns
print(final_dataframe[rating_columns].head())


RatingDist5    float64
RatingDist4    float64
RatingDist3    float64
RatingDist2    float64
RatingDist1    float64
dtype: object
   RatingDist5  RatingDist4  RatingDist3  RatingDist2  RatingDist1
0      5.72775        4.870      3.65783      2.17926       1.5717
1      5.50000        4.800      3.10000      2.00000       1.0000
2      5.10000        4.000      3.10000      2.00000       1.0000
3      5.28600        4.207      3.96000      2.38000       1.1200
4      5.00000        4.100      3.20000      2.10000       1.0000


In [26]:
# Step 1: Replace "total:" with an empty string
final_dataframe['RatingDistTotal'] = final_dataframe['RatingDistTotal'].str.replace('total:', '', regex=False)

# Step 2: Convert the column to numeric (using errors='coerce' in case of any invalid entries)
final_dataframe['RatingDistTotal'] = pd.to_numeric(final_dataframe['RatingDistTotal'], errors='coerce')


In [27]:
print(final_dataframe['RatingDistTotal'].dtypes)

int64


In [28]:
final_dataframe

Unnamed: 0,Name,Authors,Rating,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate
0,The Prince,Niccolò Machiavelli,3.82,University of Chicago Press,5.727750,4.87000,3.65783,2.17926,1.5717,249201,11,151,"The most famous book on politics ever written,...",11,1998-01-09
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,Duke University Press Books,5.500000,4.80000,3.10000,2.00000,1.0000,14,1,384,Many of America’s greatest Protestant preacher...,1,2005-02-03
2,The Last Sorcerer,Ethan Russo,4.00,Haworth Integrative Healing Press,5.100000,4.00000,3.10000,2.00000,1.0000,2,0,368,Unknown,0,2001-01-01
3,The Idea of a University,John Henry Newman,4.12,Yale University Press,5.286000,4.20700,3.96000,2.38000,1.1200,639,6,400,"Since its publication almost 150 years ago, <i...",6,1996-12-05
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.00,Johns Hopkins University Press,5.000000,4.10000,3.20000,2.10000,1.0000,4,0,622,"Most religious traditions have a rich, if larg...",0,1997-08-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,4.39,Random House Books for Young Readers,5.745000,4.20100,3.14500,2.39000,1.2300,1153,0,144,A silly book with a serious purpose—to help ch...,0,2007-12-01
125215,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,4.19,Random House Books for Young Readers,5.153000,4.10700,3.59000,2.13000,1.2000,334,3,48,The Cat and company travel by hot air balloon ...,3,2004-12-05
125216,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,5.00,Mapin Publishing Pvt,5.300000,4.00000,3.00000,2.00000,1.0000,3,0,356,Situated on the famous Crescent Site near the ...,0,2008-01-01
125217,Goodnight Moon,Margaret Wise Brown,4.28,HarperCollins,5.167387,4.67858,3.42052,2.11835,1.6265,295397,40,32,<strong>In this classic of children's literatu...,40,2007-12-01


In [29]:
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125219 entries, 0 to 125218
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Name                   125219 non-null  object        
 1   Authors                125219 non-null  object        
 2   Rating                 125219 non-null  float64       
 3   Publisher              125219 non-null  object        
 4   RatingDist5            125219 non-null  float64       
 5   RatingDist4            125219 non-null  float64       
 6   RatingDist3            125219 non-null  float64       
 7   RatingDist2            125219 non-null  float64       
 8   RatingDist1            125219 non-null  float64       
 9   RatingDistTotal        125219 non-null  int64         
 10  CountsOfReview         125219 non-null  int64         
 11  pagesNumber            125219 non-null  int64         
 12  Description            125219 non-null  obje

In [30]:
final_dataframe["Description"]

Unnamed: 0,Description
0,"The most famous book on politics ever written,..."
1,Many of America’s greatest Protestant preacher...
2,Unknown
3,"Since its publication almost 150 years ago, <i..."
4,"Most religious traditions have a rich, if larg..."
...,...
125214,A silly book with a serious purpose—to help ch...
125215,The Cat and company travel by hot air balloon ...
125216,Situated on the famous Crescent Site near the ...
125217,<strong>In this classic of children's literatu...


In [31]:
import regex as re
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
from nltk.stem import SnowballStemmer
engSnowballStemmer=SnowballStemmer('english')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()
#dir(WordNetLemmatizer)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Convert Uniform Case

In [33]:
def convertUniform(x):
  return x.apply(lambda x: x.casefold())

In [34]:
final_dataframe["Description"]=convertUniform(final_dataframe["Description"])
final_dataframe["Description"]

Unnamed: 0,Description
0,"the most famous book on politics ever written,..."
1,many of america’s greatest protestant preacher...
2,unknown
3,"since its publication almost 150 years ago, <i..."
4,"most religious traditions have a rich, if larg..."
...,...
125214,a silly book with a serious purpose—to help ch...
125215,the cat and company travel by hot air balloon ...
125216,situated on the famous crescent site near the ...
125217,<strong>in this classic of children's literatu...


# remove HTML Tags

In [35]:
def removeHtmlTags(x):
  return x.apply(lambda x : re.sub(r'<.*?>','',x))

In [36]:
final_dataframe["Description"]=removeHtmlTags(final_dataframe["Description"])
final_dataframe["Description"]

Unnamed: 0,Description
0,"the most famous book on politics ever written,..."
1,many of america’s greatest protestant preacher...
2,unknown
3,"since its publication almost 150 years ago, th..."
4,"most religious traditions have a rich, if larg..."
...,...
125214,a silly book with a serious purpose—to help ch...
125215,the cat and company travel by hot air balloon ...
125216,situated on the famous crescent site near the ...
125217,"in this classic of children's literature, belo..."


# handling spe chars

In [37]:
def handleSpeChars(x):
  return  x.apply(lambda x: re.sub(r'[^0-9a-zA-z ]','',x))

In [38]:
final_dataframe['Description']=handleSpeChars(final_dataframe['Description'])
final_dataframe['Description']

Unnamed: 0,Description
0,the most famous book on politics ever written ...
1,many of americas greatest protestant preachers...
2,unknown
3,since its publication almost 150 years ago the...
4,most religious traditions have a rich if large...
...,...
125214,a silly book with a serious purposeto help chi...
125215,the cat and company travel by hot air balloon ...
125216,situated on the famous crescent site near the ...
125217,in this classic of childrens literature belove...


# removing stop words

In [39]:
def removeStopWords(x):
  return x.apply(lambda x: ' '.join([ele for ele in x.split() if ele not in stopwords]))

In [40]:
final_dataframe['Description']=removeStopWords(final_dataframe['Description'])
final_dataframe['Description']

Unnamed: 0,Description
0,famous book politics ever written prince remai...
1,many americas greatest protestant preacherspau...
2,unknown
3,since publication almost 150 years ago idea un...
4,religious traditions rich largely forgotten he...
...,...
125214,silly book serious purposeto help children rec...
125215,cat company travel hot air balloon various wea...
125216,situated famous crescent site near gateway ind...
125217,classic childrens literature beloved generatio...


# stemming

In [41]:
def applystemming(x):
  return x.apply(lambda x: ' '.join([engSnowballStemmer.stem(ele) for ele in x.split()]))

In [42]:
final_dataframe['Description']=applystemming(final_dataframe['Description'])
final_dataframe['Description']

Unnamed: 0,Description
0,famous book polit ever written princ remain li...
1,mani america greatest protest preacherspaul ti...
2,unknown
3,sinc public almost 150 year ago idea univers e...
4,religi tradit rich larg forgotten heritag invo...
...,...
125214,silli book serious purposeto help children rec...
125215,cat compani travel hot air balloon various wea...
125216,situat famous crescent site near gateway india...
125217,classic children literatur belov generat reade...


# lemmatization

In [43]:
def applylemma(x):
  return x.apply(lambda x: ' '.join([lemma.lemmatize(ele) for ele in x.split()]))

In [44]:
final_dataframe['Description']=applylemma(final_dataframe['Description'])
final_dataframe['Description']

Unnamed: 0,Description
0,famous book polit ever written princ remain li...
1,mani america greatest protest preacherspaul ti...
2,unknown
3,sinc public almost 150 year ago idea univers e...
4,religi tradit rich larg forgotten heritag invo...
...,...
125214,silli book serious purposeto help child recogn...
125215,cat compani travel hot air balloon various wea...
125216,situat famous crescent site near gateway india...
125217,classic child literatur belov generat reader l...


In [62]:
genre_keywords = {
    # Popular Fiction Genres
    'Thriller': ['thrilling', 'suspenseful', 'detective', 'mystery', 'crime', 'spy', 'psychological', 'intrigue', 'chilling', 'tense'],
    'Romance': ['romantic', 'love', 'heartbreak', 'relationship', 'affair', 'passion', 'wedding', 'valentine', 'desire', 'soulmate'],
    'Fantasy': ['fantasy', 'magic', 'wizard', 'dragon', 'myth', 'fairy', 'castle', 'sorcery', 'realm', 'unicorn', 'enchantment'],
    'Science Fiction (Sci-Fi)': ['scifi', 'dystopian', 'space', 'alien', 'robot', 'technology', 'cyberpunk', 'AI', 'galaxy', 'future', 'time travel'],
    'Horror': ['horror', 'ghost', 'paranormal', 'haunted', 'scary', 'zombie', 'vampire', 'evil', 'nightmare', 'creepy', 'fear', 'demonic'],

    # Non-Fiction Genres
    'Self-help': ['selfhelp', 'guide', 'improve', 'productivity', 'success', 'mindset', 'habits', 'wellness', 'motivation', 'happiness'],
    'Biography': ['biography', 'memoir', 'autobiography', 'life story', 'inspiring', 'personal journey', 'legacy', 'heroic', 'true story'],
    'Non-fiction': ['nonfiction', 'true', 'factual', 'documentary', 'educational', 'analysis', 'research', 'report', 'history'],
    'Health & Fitness': ['fitness', 'nutrition', 'diet', 'exercise', 'wellness', 'yoga', 'mental health', 'meditation', 'workout', 'healthy living'],
    'Business': ['business', 'finance', 'economics', 'startups', 'money', 'entrepreneurship', 'corporate', 'investing', 'marketing', 'strategy'],
    'Politics': ['politics', 'government', 'policy', 'democracy', 'leadership', 'elections', 'activism', 'campaign', 'law', 'reform'],
    'Religion & Spirituality': ['religion', 'faith', 'spirituality', 'prayer', 'bible', 'god', 'temple', 'holy', 'belief', 'soul'],
    'Science': ['science', 'biology', 'physics', 'chemistry', 'experiment', 'research', 'innovation', 'theory', 'discovery', 'space exploration'],

    # Creative and Arts Genres
    'Poetry': ['poetry', 'poem', 'verse', 'lyric', 'stanza', 'sonnet', 'haiku', 'rhyme', 'metaphor', 'ode'],
    'Art': ['art', 'painting', 'sculpture', 'gallery', 'aesthetic', 'visual', 'design', 'craft', 'creativity'],
    'Music': ['music', 'melody', 'concert', 'symphony', 'band', 'instrument', 'song', 'artist', 'notes', 'opera'],
    'Cookbooks': ['cookbook', 'recipe', 'cooking', 'kitchen', 'culinary', 'baking', 'food', 'dining', 'dessert', 'meals'],

    # Children's and Young Adult
    'Children': ['kids', 'storybook', 'bedtime', 'nursery', 'fairy tale', 'adventure', 'animal', 'imagination', 'learning', 'playful', 'magic'],
    'Young Adult': ['young', 'teen', 'adolescence', 'high school', 'coming-of-age', 'teenage', 'first love', 'youth'],

    # Adventure and Travel
    'Adventure': ['adventure', 'quest', 'journey', 'expedition', 'trek', 'voyage', 'exploration', 'mission', 'heroic', 'discovery'],
    'Travel': ['travel', 'destination', 'journey', 'tourism', 'wanderlust', 'vacation', 'explore', 'guidebook', 'trek'],

    # Additional Specialized Genres
    'Historical Fiction': ['historical', 'timeline', 'past', 'vintage', 'war', 'ancient', 'empire', 'revolution', 'medieval', 'renaissance'],
    'Graphic Novels': ['graphic', 'comic', 'illustrated', 'manga', 'cartoon', 'superhero', 'visual', 'art', 'novel'],
    'Drama': ['drama', 'emotional', 'intense', 'character-driven', 'realistic', 'melodrama', 'tragic', 'conflict', 'relationships'],
    'Comedy/Humor': ['comedy', 'humor', 'funny', 'hilarious', 'satire', 'jokes', 'parody', 'laugh', 'witty'],
    'Crime': ['crime', 'detective', 'murder', 'forensic', 'serial killer', 'investigation', 'justice', 'law', 'criminal'],
    'Psychology': ['psychology', 'mind', 'behavior', 'cognitive', 'therapy', 'emotions', 'mental health', 'subconscious', 'self-awareness'],
    'Classic': ['classic', 'timeless', 'literature', 'masterpiece', 'canon', 'iconic', 'vintage', 'immortal', 'renowned'],
    'LGBTQ+': ['lgbtq', 'queer', 'gay', 'lesbian', 'bisexual', 'transgender', 'identity', 'pride', 'love'],

    # Niche Genres
    'Sports': ['sports', 'athletics', 'soccer', 'football', 'basketball', 'competition', 'training', 'team', 'game', 'coach'],
    'Military': ['military', 'war', 'army', 'soldier', 'battle', 'combat', 'strategy', 'airforce', 'navy', 'history'],
    'Environmental': ['environment', 'nature', 'sustainability', 'wildlife', 'climate', 'ecology', 'conservation', 'planet', 'earth'],
    'Legal': ['legal', 'law', 'justice', 'trial', 'courtroom', 'lawyer', 'verdict', 'case', 'lawsuit'],
    'Technology': ['technology', 'innovation', 'computing', 'robotics', 'AI', 'data', 'future', 'digital', 'cyber', 'engineering'],
    'True Crime': ['true crime', 'criminal', 'real crime', 'justice', 'murder', 'investigation', 'forensics', 'detective'],
    'Medical': ['medical', 'healthcare', 'hospital', 'doctor', 'disease', 'treatment', 'surgery', 'medicine', 'nurse']
}




#  Add Genre column based on the cleaned description
def classify_genre(cleaned_text):
    for genre, keywords in genre_keywords.items():
        if any(keyword in cleaned_text for keyword in keywords):
            return genre
    return 'Unknown'

final_dataframe['Genre'] = final_dataframe['Description'].apply(classify_genre)

# Display the DataFrame
display(final_dataframe)

Unnamed: 0,Name,Authors,Rating,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate,DaysSincePublished,Genre
0,The Prince,Niccolò Machiavelli,3.82,University of Chicago Press,5.727750,4.87000,3.65783,2.17926,1.5717,249201,11,151,famous book polit ever written princ remain li...,11,1998-01-09,9880,Politics
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,Duke University Press Books,5.500000,4.80000,3.10000,2.00000,1.0000,14,1,384,mani america greatest protest preacherspaul ti...,1,2005-02-03,7298,Romance
2,The Last Sorcerer,Ethan Russo,4.00,Haworth Integrative Healing Press,5.100000,4.00000,3.10000,2.00000,1.0000,2,0,368,unknown,0,2001-01-01,8792,Unknown
3,The Idea of a University,John Henry Newman,4.12,Yale University Press,5.286000,4.20700,3.96000,2.38000,1.1200,639,6,400,sinc public almost 150 year ago idea univers e...,6,1996-12-05,10280,Non-fiction
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.00,Johns Hopkins University Press,5.000000,4.10000,3.20000,2.10000,1.0000,4,0,622,religi tradit rich larg forgotten heritag invo...,0,1997-08-11,10031,Religion & Spirituality
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,4.39,Random House Books for Young Readers,5.745000,4.20100,3.14500,2.39000,1.2300,1153,0,144,silli book serious purposeto help child recogn...,0,2007-12-01,6267,Unknown
125215,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,4.19,Random House Books for Young Readers,5.153000,4.10700,3.59000,2.13000,1.2000,334,3,48,cat compani travel hot air balloon various wea...,3,2004-12-05,7358,Travel
125216,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,5.00,Mapin Publishing Pvt,5.300000,4.00000,3.00000,2.00000,1.0000,3,0,356,situat famous crescent site near gateway india...,0,2008-01-01,6236,Unknown
125217,Goodnight Moon,Margaret Wise Brown,4.28,HarperCollins,5.167387,4.67858,3.42052,2.11835,1.6265,295397,40,32,classic child literatur belov generat reader l...,40,2007-12-01,6267,Classic


In [63]:
final_dataframe.Genre.unique()

array(['Politics', 'Romance', 'Unknown', 'Non-fiction',
       'Religion & Spirituality', 'Biography', 'Historical Fiction',
       'Science Fiction (Sci-Fi)', 'Poetry', 'Art', 'Fantasy', 'Thriller',
       'Music', 'Young Adult', 'Classic', 'Horror', 'Health & Fitness',
       'Adventure', 'Environmental', 'Business', 'Self-help', 'LGBTQ+',
       'Children', 'Cookbooks', 'Graphic Novels', 'Legal', 'Technology',
       'Comedy/Humor', 'Crime', 'Science', 'Sports', 'Psychology',
       'Drama', 'Medical', 'Travel', 'Military'], dtype=object)

In [65]:
final_dataframe.Genre.value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Unknown,31507
Romance,19335
Art,15173
Non-fiction,7703
Poetry,6973
Self-help,5174
Fantasy,4805
Religion & Spirituality,4134
Politics,3558
Horror,3494


# CLEANED AFTER FINAL DATAFRAME

In [83]:
final_dataframe

Unnamed: 0,Name,Authors,Rating,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,pagesNumber,Description,Count of text reviews,CorrectedPublishDate,DaysSincePublished,Genre
0,The Prince,Niccolò Machiavelli,3.82,University of Chicago Press,5.727750,4.87000,3.65783,2.17926,1.5717,249201,11,151,famous book polit ever written princ remain li...,11,1998-01-09,9880,Politics
1,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,4.29,Duke University Press Books,5.500000,4.80000,3.10000,2.00000,1.0000,14,1,384,mani america greatest protest preacherspaul ti...,1,2005-02-03,7298,Romance
2,The Last Sorcerer,Ethan Russo,4.00,Haworth Integrative Healing Press,5.100000,4.00000,3.10000,2.00000,1.0000,2,0,368,unknown,0,2001-01-01,8792,Unknown
3,The Idea of a University,John Henry Newman,4.12,Yale University Press,5.286000,4.20700,3.96000,2.38000,1.1200,639,6,400,sinc public almost 150 year ago idea univers e...,6,1996-12-05,10280,Non-fiction
4,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,3.00,Johns Hopkins University Press,5.000000,4.10000,3.20000,2.10000,1.0000,4,0,622,religi tradit rich larg forgotten heritag invo...,0,1997-08-11,10031,Religion & Spirituality
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125214,The Cat in the Hat Beginner Book Dictionary,P.D. Eastman,4.39,Random House Books for Young Readers,5.745000,4.20100,3.14500,2.39000,1.2300,1153,0,144,silli book serious purposeto help child recogn...,0,2007-12-01,6267,Unknown
125215,Oh Say Can You Say What's the Weather Today?: ...,Tish Rabe,4.19,Random House Books for Young Readers,5.153000,4.10700,3.59000,2.13000,1.2000,334,3,48,cat compani travel hot air balloon various wea...,3,2004-12-05,7358,Travel
125216,Jewels on the Crescent: Masterpieces of the Ch...,Kalpana Desai,5.00,Mapin Publishing Pvt,5.300000,4.00000,3.00000,2.00000,1.0000,3,0,356,situat famous crescent site near gateway india...,0,2008-01-01,6236,Unknown
125217,Goodnight Moon,Margaret Wise Brown,4.28,HarperCollins,5.167387,4.67858,3.42052,2.11835,1.6265,295397,40,32,classic child literatur belov generat reader l...,40,2007-12-01,6267,Classic


In [67]:
# Save the DataFrame to a CSV file
final_dataframe.to_csv('books_metadata.csv', index=False)  # Replace 'books_metadata.csv' with the desired file name

print("DataFrame saved as 'books_metadata.csv'")

DataFrame saved as 'books_metadata.csv'


# PREPROCESSING

In [84]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
encoded_metadata = encoder.fit_transform(final_dataframe[['Authors', 'Publisher','Genre']])

import datetime

# Calculate days since publication
final_dataframe['DaysSincePublished'] = (datetime.datetime.now() - final_dataframe['CorrectedPublishDate']).dt.days

# Include this in numeric features
numeric_features = final_dataframe[['Rating', 'RatingDist5', 'RatingDist4', 'RatingDist3',
                                    'RatingDist2', 'RatingDist1', 'RatingDistTotal',
                                    'CountsOfReview', 'pagesNumber', 'Count of text reviews',
                                    'DaysSincePublished']]

# Normalize numeric features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_features_normalized = scaler.fit_transform(numeric_features)



# 1. What is TF-IDF?

TF-IDF stands for:

Term Frequency (TF): Measures how often a word appears in a document.

Inverse Document Frequency (IDF): Measures the importance of a word by reducing the weight of commonly occurring words across the corpus.

The combination of TF and IDF gives a score that highlights words that are frequent in a document but rare across the corpus, making them more significant.



In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # Adjust max_features if needed

# Transform the Description column into TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(final_dataframe['Description'])

# Output: tfidf_matrix is a sparse matrix of shape (n_samples, n_features)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")
print(f"First 50 features: {tfidf_vectorizer.get_feature_names_out()[:50]}")


TF-IDF matrix shape: (125219, 5000)
Number of features: 5000
First 50 features: ['10' '100' '1000' '101' '11' '12' '13' '14' '15' '150' '16' '17' '18'
 '18th' '19' '1920s' '1930s' '1939' '1940' '1940s' '1941' '1942' '1944'
 '1945' '1947' '1948' '1950' '1950s' '1960' '1960s' '1965' '1967' '1968'
 '1970' '1970s' '1972' '1973' '1975' '1976' '1977' '1978' '1979' '1980'
 '1980s' '1982' '1983' '1984' '1985' '1986' '1987']


In [89]:
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Numeric features shape: {numeric_features_normalized.shape}")
print(f"Encoded metadata shape: {encoded_metadata.shape}")


TF-IDF matrix shape: (125219, 5000)
Numeric features shape: (125219, 11)
Encoded metadata shape: (125219, 93017)


In [90]:
from scipy.sparse import hstack

# Combine TF-IDF features, normalized numeric features, and encoded metadata
combined_features = hstack([tfidf_matrix, numeric_features_normalized, encoded_metadata])
print(f"Combined feature matrix shape: {combined_features.shape}")

Combined feature matrix shape: (125219, 98028)


Purpose: Combines all preprocessed feature matrices (TF-IDF matrix, normalized numeric features, and one-hot encoded metadata) into a single sparse matrix for similarity computation.

Steps:

hstack: Horizontally stacks the following matrices:

tfidf_matrix: Matrix representing text features from TF-IDF.

numeric_features_normalized: Scaled numeric features.

encoded_metadata: One-hot encoded metadata features.

csr_matrix: Converts the combined matrix to Compressed Sparse Row (CSR) format for efficient row slicing when computing cosine similarity.

Print Shape: Displays the shape of the combined matrix (rows = books, columns = all features).



In [77]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import hstack, csr_matrix

# Combine all preprocessed features into a single sparse matrix
combined_features = hstack([tfidf_matrix, numeric_features_normalized, encoded_metadata])
combined_features = csr_matrix(combined_features)  # Convert to CSR format for efficient row slicing
print(f"Combined feature matrix shape: {combined_features.shape}")

# Function to calculate top-N similar items for a specific index
def get_top_n_recommendations(index, feature_matrix, n=10):
    """
    Compute the top-N most similar items to the given index in the feature matrix.
    Args:
    - index (int): The row index for which to find similar items.
    - feature_matrix (csr_matrix): Combined feature matrix (TF-IDF + metadata).
    - n (int): Number of recommendations to return.
    Returns:
    - List of tuples (index, similarity_score).
    """
    # Compute cosine similarity for the given index
    sim_scores = cosine_similarity(feature_matrix[index], feature_matrix).flatten()

    # Get indices of top-N items, excluding the item itself
    top_indices = np.argsort(-sim_scores)[:n + 1]  # Add 1 to account for the item itself
    top_indices = [i for i in top_indices if i != index][:n]  # Exclude itself

    # Return the top-N indices and their similarity scores
    return [(i, sim_scores[i]) for i in top_indices]

# Example: Get recommendations for specific books
example_indices = [0, 10, 100]  # Replace with indices of books to get recommendations for

# Iterate over example books and display their recommendations
for idx in example_indices:
    top_recommendations = get_top_n_recommendations(idx, combined_features, n=10)

    print(f"Top Recommendations for Book Index {idx}:")
    for rec_idx, score in top_recommendations:
        print(f"  Book Index: {rec_idx}, Similarity Score: {score:.4f}")
    print("-" * 40)


Combined feature matrix shape: (125219, 97992)
Top Recommendations for Book Index 0:
  Book Index: 118618, Similarity Score: 0.9387
  Book Index: 77583, Similarity Score: 0.9369
  Book Index: 43771, Similarity Score: 0.9326
  Book Index: 58251, Similarity Score: 0.9298
  Book Index: 43768, Similarity Score: 0.9260
  Book Index: 124242, Similarity Score: 0.9202
  Book Index: 19540, Similarity Score: 0.9201
  Book Index: 81492, Similarity Score: 0.9135
  Book Index: 118248, Similarity Score: 0.9025
  Book Index: 12485, Similarity Score: 0.9005
----------------------------------------
Top Recommendations for Book Index 10:
  Book Index: 78750, Similarity Score: 0.5265
  Book Index: 42264, Similarity Score: 0.5199
  Book Index: 91982, Similarity Score: 0.5145
  Book Index: 12595, Similarity Score: 0.5118
  Book Index: 111817, Similarity Score: 0.4911
  Book Index: 35565, Similarity Score: 0.4804
  Book Index: 122452, Similarity Score: 0.4734
  Book Index: 111818, Similarity Score: 0.4718
 

Cosine Similarity Definition

Cosine Similarity is a metric used to measure how similar two vectors are, irrespective of their magnitude. It calculates the cosine of the angle between two non-zero vectors in a multi-dimensional space. The value of cosine similarity ranges from -1 to 1,

where:

1: The vectors are identical in direction (completely similar).

0: The vectors are orthogonal (no similarity).

-1: The vectors are diametrically opposite in direction (completely dissimilar).

In [72]:
def display_recommendations(index, feature_matrix, n, dataframe):
    """
    Display the top N recommendations for a given book index, including book titles and metadata.

    Parameters:
    - index: Index of the book for which recommendations are to be generated.
    - feature_matrix: Combined feature matrix (sparse format).
    - n: Number of recommendations to display.
    - dataframe: Original dataframe containing book titles and metadata.

    Returns:
    - None
    """
    # Compute cosine similarity for the given index
    sim_scores = cosine_similarity(feature_matrix[index], feature_matrix).flatten()

    # Get indices of top-N items, excluding the item itself
    top_indices = sim_scores.argsort()[::-1][1:n+1]

    # Display the recommendations with details
    print(f"Top Recommendations for Book Index {index}:")
    for idx in top_indices:
        name = dataframe.iloc[idx]['Name']
        author = dataframe.iloc[idx]['Authors']
        publisher = dataframe.iloc[idx]['Publisher']
        sim_score = sim_scores[idx]

        display(f"  Title: {name}, Author: {author}, Publisher: {publisher}, Similarity Score: {sim_score:.2f}")
    print("-" * 200)

# Example usage
example_indices = [0, 10, 100]  # Indices of books to get recommendations for
for idx in example_indices:
    display_recommendations(idx, combined_features, n=10, dataframe=final_dataframe)


Top Recommendations for Book Index 0:


'  Title: The Prince, Author: Niccolò Machiavelli, Publisher: Yale University Press, Similarity Score: 0.94'

'  Title: The Prince and Other Political Writings, Author: Niccolò Machiavelli, Publisher: Tuttle Publishing, Similarity Score: 0.94'

'  Title: Le prince, Author: Niccolò Machiavelli, Publisher: Nathan, Similarity Score: 0.93'

'  Title: The Prince, Author: Niccolò Machiavelli, Publisher: IndyPublish.com, Similarity Score: 0.93'

'  Title: Le Prince : Texte intégral, analyse, Author: Niccolò Machiavelli, Publisher: Hatier, Similarity Score: 0.93'

'  Title: Der Fürst, Author: Niccolò Machiavelli, Publisher: Insel, Frankfurt, Similarity Score: 0.92'

'  Title: El Principe, Author: Niccolò Machiavelli, Publisher: Alianza, Similarity Score: 0.92'

'  Title: El Principe/ The Prince: Comentado Por Napoleon Bonaparte, Author: Niccolò Machiavelli, Publisher: Andromeda Publications, Similarity Score: 0.91'

'  Title: Il Principe / Der Fürst, Author: Niccolò Machiavelli, Publisher: Reclam, Similarity Score: 0.90'

"  Title: Island of the Blue Dolphins, Author: Scott O'Dell, Publisher: Laurel Leaf, Similarity Score: 0.90"

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Top Recommendations for Book Index 10:


'  Title: The Baku Documents: A Complete Catalogue of Persian, Azeri, Ottoman and Arabic Newspapers and Journals in Libraries of Azerbaijan, Author: Touraj Atabaki, Publisher: I. B. Tauris, Similarity Score: 0.53'

'  Title: Kilimanjaro Tales: The Saga of A Medical Family in Africa, Author: Gwynneth Latham, Publisher: I. B. Tauris, Similarity Score: 0.52'

'  Title: Chateaux of the Loire, Author: Thorsten Droste, Publisher: I. B. Tauris, Similarity Score: 0.51'

'  Title: Islam and Romantic Orientalism: Literary Encounters with the Orient, Author: Mohammed Sharafuddin, Publisher: I. B. Tauris, Similarity Score: 0.51'

'  Title: Sir Garfield Todd and the Making of Zimbabwe, Author: Ruth Weiss, Publisher: I. B. Tauris, Similarity Score: 0.49'

'  Title: Politics in An Arabian Oasis: The Rashidis of Saudi Arabia, Author: Madawi Al Rasheed, Publisher: I. B. Tauris, Similarity Score: 0.48'

"  Title: Ba'th Versus Ba'th: The Conflict Between Syria and Iraq, 1968-1989, Author: Eberhard Kienle, Publisher: I. B. Tauris, Similarity Score: 0.47"

'  Title: Peace in Their Time, Author: Ruth Weiss, Publisher: I. B. Tauris, Similarity Score: 0.47'

'  Title: The Jews of Europe and the Inquisition of Venice: 1550-1620, Author: Brian S. Pullan, Publisher: I. B. Tauris, Similarity Score: 0.45'

'  Title: Reaching for the Stars: A New History of Bomber Command in World War II, Author: Mark Connelly, Publisher: I. B. Tauris, Similarity Score: 0.45'

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Top Recommendations for Book Index 100:


'  Title: The Semblance of Peace: The Political Settlement After the Second World War, Author: John Wheeler-Bennett, Publisher: W. W. Norton  Company, Similarity Score: 0.75'

'  Title: Carbonate Sediments and Their Diagenesis, Author: R.G.C. Bathurst, Publisher: Elsevier Science & Technology, Similarity Score: 0.75'

'  Title: Practical Scanning Electron Microscopy: Electron and Ion Microprobe Analysis, Author: Joseph I. Goldstein, Publisher: Springer, Similarity Score: 0.75'

'  Title: The Papers of Ulysses S. Grant, Volume 3: October 1, 1861-January 7, 1862, Author: Ulysses S. Grant, Publisher: Southern Illinois University Press, Similarity Score: 0.75'

'  Title: The Restoration Mode From Milton To Dryden, Author: Earl Roy Miner, Publisher: Princeton University Press, Similarity Score: 0.74'

'  Title: A Treatise on Limnology, Volume 1, Author: G. Evelyn Hutchinson, Publisher: Wiley, Similarity Score: 0.74'

'  Title: Journals and Miscellaneous Notebooks of Ralph Waldo Emerson, Volume V: 1835-1838, Author: Ralph Waldo Emerson, Publisher: Belknap Press, Similarity Score: 0.74'

'  Title: Word-By-Word Translations of Songs and Arias, Part I: German and French, Author: Berton Coffin, Publisher: Scarecrow Press, Similarity Score: 0.74'

'  Title: Catalogue Of The Byzantine Coins In The Dumbarton Oaks Collection And In The Whittemore Collection, Author: Alfred Raymond Bellinger, Publisher: Dumbarton Oaks Research Library & Collection, Similarity Score: 0.74'

'  Title: Nikolai Leskov: The Man and His Art, Author: Hugh McLean, Publisher: Harvard University Press, Similarity Score: 0.74'

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Evaluate the recommendations

In [73]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Function to evaluate recommendations
def evaluate_recommendations(true_recommendations, predicted_recommendations):
    """
    Evaluate the recommendations using precision, recall, and F1-score.

    Parameters:
    - true_recommendations (list): List of true recommended books (ground truth).
    - predicted_recommendations (list): List of predicted recommended books.
    """

    # Convert to binary (1 if recommended, 0 if not) to calculate precision, recall, and F1-score
    true_set = set(true_recommendations)  # Convert ground truth to set for easy comparison
    predicted_set = set(predicted_recommendations)  # Convert predicted to set

    # Compute Precision: Proportion of relevant items among recommended
    precision = len(true_set.intersection(predicted_set)) / len(predicted_set) if len(predicted_set) > 0 else 0

    # Compute Recall: Proportion of relevant items recommended out of all relevant items
    recall = len(true_set.intersection(predicted_set)) / len(true_set) if len(true_set) > 0 else 0

    # Compute F1-Score: Harmonic mean of precision and recall
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# Example usage
# Suppose we have some ground truth (actual relevant books for the user)
true_recommendations = [118618, 77583, 43771, 58251, 43768]  # Example ground truth (book indices)
# And the system has predicted these books as recommendations
predicted_recommendations = [118618, 77583, 43771, 58251, 43768]  # Example predictions (book indices)

# Evaluate the recommendations
evaluate_recommendations(true_recommendations, predicted_recommendations)



Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
