In [1]:
import pandas as pd
import numpy as np

import re
import string
# BERT-Embeddings
from keybert import KeyBERT
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
#import altair as alt
#alt.renderers.enable('mimetype')
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessing Data

In [31]:
books_data = pd.read_csv("data/book1100k-1200k.csv", usecols=['Id', 'Name', 'Authors', 'ISBN', 'PublishYear', 'Publisher', 'Language', 'Description', 'Rating','pagesNumber'])
display(books_data.shape)


books_data.head(20)

(41892, 10)

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,Publisher,Language,pagesNumber,Description
0,1100003,The Prince,Niccolò Machiavelli,226500438.0,3.82,1998,University of Chicago Press,,151,"The most famous book on politics ever written,..."
1,1100004,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,822334836.0,4.29,2005,Duke University Press Books,,384,Many of America’s greatest Protestant preacher...
2,1100007,The Last Sorcerer,Ethan Russo,789012707.0,4.0,2001,Haworth Integrative Healing Press,,368,
3,1100009,The Idea of a University,John Henry Newman,300064055.0,4.12,1996,Yale University Press,,400,"Since its publication almost 150 years ago, <i..."
4,1100010,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,801857961.0,3.0,1997,Johns Hopkins University Press,,622,"Most religious traditions have a rich, if larg..."
5,1100012,Inequality Reexamined [Electronic Resource],Amartya Sen,198289286.0,4.11,2007,Russell Sage Foundation; Clarendon Press,,222,
6,1100013,The Alamo Remembered: Tejano Accounts and Pers...,Timothy Matovina,292751850.0,3.77,1995,University of Texas Press,,146,As Mexican soldiers fought the mostly Anglo-Am...
7,1100020,Rousseau: 'the Discourses' and Other Early Pol...,Jean-Jacques Rousseau,521413818.0,4.05,1997,Cambridge University Press,,437,The work of Jean-Jacques Rousseau is presented...
8,1100021,"How Not to Study Judaism, Examples and Counter...",Jacob Neusner,761827838.0,1.0,2004,University Press of America,,176,"In How Not to Study Judaism, Examples and Coun..."
9,1100025,Basic Research and Technologies for Two-Stage-...,Dieter Jacob,3527277358.0,5.0,2005,Wiley-Vch,,683,Focusing on basic aspects of future reusable s...


# Feature Selection
1. ISBN is not useable
2. Language have too many NaN

In [32]:
books_data = books_data.loc[:, ['Id', 'Name', 'Authors', 'PublishYear', 'Publisher', 'Description', 'Rating','pagesNumber']]
books_data

Unnamed: 0,Id,Name,Authors,PublishYear,Publisher,Description,Rating,pagesNumber
0,1100003,The Prince,Niccolò Machiavelli,1998,University of Chicago Press,"The most famous book on politics ever written,...",3.82,151
1,1100004,"Sermons from Duke Chapel: Voices from ""A Great...",William H. Willimon,2005,Duke University Press Books,Many of America’s greatest Protestant preacher...,4.29,384
2,1100007,The Last Sorcerer,Ethan Russo,2001,Haworth Integrative Healing Press,,4.00,368
3,1100009,The Idea of a University,John Henry Newman,1996,Yale University Press,"Since its publication almost 150 years ago, <i...",4.12,400
4,1100010,Caring and Curing: Health and Medicine in the ...,Ronald L. Numbers,1997,Johns Hopkins University Press,"Most religious traditions have a rich, if larg...",3.00,622
...,...,...,...,...,...,...,...,...
41887,1199988,Family: Everyday Stories About the Miracle of ...,Mary Pesaresi,1996,Prima Lifestyles,Family is a beautiful gift collection of 60 re...,5.00,288
41888,1199990,Snail Eggs & Samphire: Dispatches from the Foo...,Derek Cooper,2001,Pan Books,Derek Cooper's career in journalism has focuse...,4.17,422
41889,1199992,Snail Eggs And Samphire: Dispatches From The F...,Derek Cooper,2000,MacMillan,Derek Cooper's career in journalism has focuse...,4.17,422
41890,1199993,The Confession (The Yalta Boulevard Sequence #2),Olen Steinhauer,2005,Minotaur Books,"Eastern Europe, 1956: Comrade Inspector Ferenc...",3.87,336


In [33]:
## Remove URLs and HTML Tags and Punctuations from the Description.
# **Description** feature contains URLs, HTML tags and punctuations.
# - Before changing the letter case, assign missing **Publisher** some temporary string `unknown` to retain these missing values during string transformation. 
books_data.dropna(subset=["Description"], inplace=True)
url_pattern = re.compile(r'https?://\S+|www\.\S+')
def remove_url(text):   
    return re.sub(url_pattern, r'', text)

html_pattern = re.compile('<[^>]*>')
def clean_html_tags(text):
    return re.sub(html_pattern, r'', text)

punctuations = string.punctuation
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', punctuations))

books_data.Description = books_data.Description.apply(remove_url)
books_data.Description = books_data.Description.apply(clean_html_tags)
books_data.Description = books_data.Description.apply(remove_punctuations)
books_data[["Publisher"]] = books_data[["Publisher"]].fillna("unknown")
books_data[["Name", "Authors", "Publisher", "Description"]] = pd.concat([books_data[col].astype(str).str.lower().str.strip() 
                                                                             for col in ["Name", "Authors", "Publisher", "Description"]], 
                                                                            axis=1)
books_data


Unnamed: 0,Id,Name,Authors,PublishYear,Publisher,Description,Rating,pagesNumber
0,1100003,the prince,niccolò machiavelli,1998,university of chicago press,the most famous book on politics ever written ...,3.82,151
1,1100004,"sermons from duke chapel: voices from ""a great...",william h. willimon,2005,duke university press books,many of america’s greatest protestant preacher...,4.29,384
3,1100009,the idea of a university,john henry newman,1996,yale university press,since its publication almost 150 years ago the...,4.12,400
4,1100010,caring and curing: health and medicine in the ...,ronald l. numbers,1997,johns hopkins university press,most religious traditions have a rich if large...,3.00,622
6,1100013,the alamo remembered: tejano accounts and pers...,timothy matovina,1995,university of texas press,as mexican soldiers fought the mostly angloame...,3.77,146
...,...,...,...,...,...,...,...,...
41887,1199988,family: everyday stories about the miracle of ...,mary pesaresi,1996,prima lifestyles,family is a beautiful gift collection of 60 re...,5.00,288
41888,1199990,snail eggs & samphire: dispatches from the foo...,derek cooper,2001,pan books,derek coopers career in journalism has focused...,4.17,422
41889,1199992,snail eggs and samphire: dispatches from the f...,derek cooper,2000,macmillan,derek coopers career in journalism has focused...,4.17,422
41890,1199993,the confession (the yalta boulevard sequence #2),olen steinhauer,2005,minotaur books,eastern europe 1956 comrade inspector ferenc k...,3.87,336


In [34]:
## Transform Book and Author Names into Single Token
# - Merge first and last name of authors (with `_`) so that two authors with same first or last name are not considered same when the tokenization happens.
# - Also merge name of the book so that it is considered as single token during the processing.
# Strip whitespace and replace spaces with underscores using .loc
books_data.loc[:, "Authors"] = books_data["Authors"].str.strip().str.replace(' ', '_')
books_data.loc[:, "Publisher"] = books_data["Publisher"].str.strip().str.replace(' ', '_')
# Limit to the first 100 rows
books_data = books_data[:200]

len(books_data)


200

In [35]:
# Display the first 5 rows
books_data.head(5)

Unnamed: 0,Id,Name,Authors,PublishYear,Publisher,Description,Rating,pagesNumber
0,1100003,the prince,niccolò_machiavelli,1998,university_of_chicago_press,the most famous book on politics ever written ...,3.82,151
1,1100004,"sermons from duke chapel: voices from ""a great...",william_h._willimon,2005,duke_university_press_books,many of america’s greatest protestant preacher...,4.29,384
3,1100009,the idea of a university,john_henry_newman,1996,yale_university_press,since its publication almost 150 years ago the...,4.12,400
4,1100010,caring and curing: health and medicine in the ...,ronald_l._numbers,1997,johns_hopkins_university_press,most religious traditions have a rich if large...,3.0,622
6,1100013,the alamo remembered: tejano accounts and pers...,timothy_matovina,1995,university_of_texas_press,as mexican soldiers fought the mostly angloame...,3.77,146


# Similarity between two books by TF-IDF
1. USE keyBERT to extract the relevant keywords from the Description of the book.
2. Use TF-IDF to calculate the similarity between the keywords

In [36]:
kw_model = KeyBERT()

def get_keywords(text):
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words="english")
    keywords = " ".join([k[0] for k in keywords])
    return keywords

books_data["keywords"] = books_data.Description.apply(get_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_data["keywords"] = books_data.Description.apply(get_keywords)


In [37]:
len(books_data)

200

In [20]:
books_data['keywords'].head(10)

0     machiavellis machiavelli prince tyranny biblio...
1              sermons sermon preacher pulpit preachers
3      scholars universities university academic newman
4     judeochristian medicine religious religion med...
6                   alamo tejanos 1836 tejano antonians
7     rousseau rousseaus writings jeanjacques revolu...
8               judaism jewish neusner literature study
9     aerodynamics aerodynamic propulsion hypersonic...
10            holocaust bergenbelsen herzberg nazi jews
12        spiritual seminary clergy congregation church
Name: keywords, dtype: object

In [38]:
books_data["features"] = books_data[['Authors', 'PublishYear', 'Publisher', 'Rating', 'pagesNumber']] \
    .fillna('') \
    .astype(str) \
    .agg(' '.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_data["features"] = books_data[['Authors', 'PublishYear', 'Publisher', 'Rating', 'pagesNumber']] \


In [39]:
books_data.head(10)

Unnamed: 0,Id,Name,Authors,PublishYear,Publisher,Description,Rating,pagesNumber,keywords,features
0,1100003,the prince,niccolò_machiavelli,1998,university_of_chicago_press,the most famous book on politics ever written ...,3.82,151,machiavellis machiavelli prince tyranny biblio...,niccolò_machiavelli 1998 university_of_chicago...
1,1100004,"sermons from duke chapel: voices from ""a great...",william_h._willimon,2005,duke_university_press_books,many of america’s greatest protestant preacher...,4.29,384,sermons sermon preacher pulpit preachers,william_h._willimon 2005 duke_university_press...
3,1100009,the idea of a university,john_henry_newman,1996,yale_university_press,since its publication almost 150 years ago the...,4.12,400,scholars universities university academic newman,john_henry_newman 1996 yale_university_press 4...
4,1100010,caring and curing: health and medicine in the ...,ronald_l._numbers,1997,johns_hopkins_university_press,most religious traditions have a rich if large...,3.0,622,judeochristian medicine religious religion med...,ronald_l._numbers 1997 johns_hopkins_universit...
6,1100013,the alamo remembered: tejano accounts and pers...,timothy_matovina,1995,university_of_texas_press,as mexican soldiers fought the mostly angloame...,3.77,146,alamo tejanos 1836 tejano antonians,timothy_matovina 1995 university_of_texas_pres...
7,1100020,rousseau: 'the discourses' and other early pol...,jean-jacques_rousseau,1997,cambridge_university_press,the work of jeanjacques rousseau is presented ...,4.05,437,rousseau rousseaus writings jeanjacques revolu...,jean-jacques_rousseau 1997 cambridge_universit...
8,1100021,"how not to study judaism, examples and counter...",jacob_neusner,2004,university_press_of_america,in how not to study judaism examples and count...,1.0,176,judaism jewish neusner literature study,jacob_neusner 2004 university_press_of_america...
9,1100025,basic research and technologies for two-stage-...,dieter_jacob,2005,wiley-vch,focusing on basic aspects of future reusable s...,5.0,683,aerodynamics aerodynamic propulsion hypersonic...,dieter_jacob 2005 wiley-vch 5.0 683
10,1100026,between two streams: a diary from bergen-belsen,abel_j._herzberg,1997,i._b._tauris,at the height of the holocaust it was nazi pol...,3.95,232,holocaust bergenbelsen herzberg nazi jews,abel_j._herzberg 1997 i._b._tauris 3.95 232
12,1100031,the lived experience of group spiritual direction,monica_maxon,2003,paulist_press,these moving positive essays witness the exper...,3.5,304,spiritual seminary clergy congregation church,monica_maxon 2003 paulist_press 3.5 304


In [40]:
books_data.to_csv("data/keywords.csv", sep=",", index=False)