In [114]:
import pandas as pd
import numpy as np

## Load Data

In [115]:
data = pd.read_csv("Best_Books_Ever.csv")
print(data.columns)
# data

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')


## Drop Unnecesary Columns And Change NaN to self-concluding when a book doesnt belong to a serie 

In [116]:
data_col_dropped = data.drop("characters", axis=1).drop("numRatings", axis=1).drop("publisher", axis=1).drop("bbeScore", axis=1).drop("bbeVotes", axis=1).drop("bookFormat", axis=1).drop("firstPublishDate", axis=1).drop("description", axis=1).drop("bookId", axis=1).drop("isbn", axis=1).drop("edition", axis=1).drop("awards", axis=1).drop("ratingsByStars", axis=1).drop("setting", axis=1).drop("coverImg", axis=1)
data_col_dropped["series"] = data_col_dropped["series"].fillna('self-concluding')
data_col_dropped

Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,09/14/08,96.0,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.50,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,09/28/04,98.0,7.38
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,English,"['Classics', 'Fiction', 'Historical Fiction', ...",324,05/23/06,95.0,
3,Pride and Prejudice,self-concluding,"Jane Austen, Anna Quindlen (Introduction)",4.26,English,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/00,94.0,
4,Twilight,The Twilight Saga #1,Stephenie Meyer,3.60,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,09/06/06,78.0,2.1
...,...,...,...,...,...,...,...,...,...,...
52473,Fractured,Fateful #2,Cheri Schmidt (Goodreads Author),4.00,English,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,May 28th 2011,94.0,
52474,Anasazi,Sense of Truth #2,Emma Michaels,4.19,English,"['Mystery', 'Young Adult']",190,August 5th 2011,95.0,
52475,Marked,Soul Guardians #1,Kim Richardson (Goodreads Author),3.70,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,84.0,7.37
52476,Wayward Son,self-concluding,"Tom Pollack (Goodreads Author), John Loftus (G...",3.85,English,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,90.0,2.86


## Simple Query to search by author

In [117]:
data_col_dropped[data_col_dropped["author"].str.contains("Patrick Rothfuss", case=False, na=False)].sort_values("likedPercent", ascending=False)

Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
308,The Wise Man's Fear,The Kingkiller Chronicle #2,Patrick Rothfuss (Goodreads Author),4.56,English,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",994.0,03/01/11,98.0,16.37
15033,O Medo do Homem Sábio - Parte 1,"The Kingkiller Chronicle #2, Part 1 of 2","Patrick Rothfuss (Goodreads Author), Renato Ca...",4.52,Portuguese,"['Fantasy', 'High Fantasy', 'Epic Fantasy', 'F...",703.0,September 21st 2011,98.0,254.45
15429,O Medo do Homem Sábio - Parte 2,"The Kingkiller Chronicle #2, Part 2 of 2","Patrick Rothfuss (Goodreads Author), Renato Ca...",4.48,Portuguese,"['Fantasy', 'High Fantasy', 'Epic Fantasy', 'F...",684.0,November 28th 2011,97.0,254.45
110,The Name of the Wind,The Kingkiller Chronicle #1,Patrick Rothfuss (Goodreads Author),4.53,English,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",662.0,04/28/07,96.0,9.36
21087,Tuule nimi I raamat,"The Kingkiller Chronicle #1, Part 1 of 2","Patrick Rothfuss (Goodreads Author), Juhan Hab...",4.21,Estonian,"['Fantasy', 'Adventure', 'Fiction', 'Young Adu...",367.0,2008,93.0,
2385,The Slow Regard of Silent Things,The Kingkiller Chronicle #2.5,"Patrick Rothfuss (Goodreads Author), Nate Tayl...",3.9,English,"['Fantasy', 'Fiction', 'Short Stories', 'High ...",159.0,October 28th 2014,89.0,9.58
16385,Doors of Stone,The Kingkiller Chronicle #3,Patrick Rothfuss (Goodreads Author),3.72,English,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",,Published,73.0,


#### Simple method to extract author

In [118]:
def extract_author(authors: str) -> str:
  author = authors.split(', ')[0]
  if "(" not in author:
    return author
  else:
    return author.split(' (')[0]
    

## Simplify authors

In [119]:
data_col_dropped["author"] = data_col_dropped["author"].apply(extract_author)
data_col_dropped

Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
0,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,09/14/08,96.0,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter #5,J.K. Rowling,4.50,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,09/28/04,98.0,7.38
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,English,"['Classics', 'Fiction', 'Historical Fiction', ...",324,05/23/06,95.0,
3,Pride and Prejudice,self-concluding,Jane Austen,4.26,English,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/00,94.0,
4,Twilight,The Twilight Saga #1,Stephenie Meyer,3.60,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,09/06/06,78.0,2.1
...,...,...,...,...,...,...,...,...,...,...
52473,Fractured,Fateful #2,Cheri Schmidt,4.00,English,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,May 28th 2011,94.0,
52474,Anasazi,Sense of Truth #2,Emma Michaels,4.19,English,"['Mystery', 'Young Adult']",190,August 5th 2011,95.0,
52475,Marked,Soul Guardians #1,Kim Richardson,3.70,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,84.0,7.37
52476,Wayward Son,self-concluding,Tom Pollack,3.85,English,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,90.0,2.86


#### Simple method to clean series

In [120]:
def clean_series(series: str) -> str:
  if "#" in series:
    return series.split(' #')[0]
  return series

## Clean series

In [121]:
data_col_dropped["series"] = data_col_dropped["series"].apply(clean_series)
data_col_dropped

Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
0,The Hunger Games,The Hunger Games,Suzanne Collins,4.33,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,09/14/08,96.0,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling,4.50,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,09/28/04,98.0,7.38
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,English,"['Classics', 'Fiction', 'Historical Fiction', ...",324,05/23/06,95.0,
3,Pride and Prejudice,self-concluding,Jane Austen,4.26,English,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/00,94.0,
4,Twilight,The Twilight Saga,Stephenie Meyer,3.60,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,09/06/06,78.0,2.1
...,...,...,...,...,...,...,...,...,...,...
52473,Fractured,Fateful,Cheri Schmidt,4.00,English,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,May 28th 2011,94.0,
52474,Anasazi,Sense of Truth,Emma Michaels,4.19,English,"['Mystery', 'Young Adult']",190,August 5th 2011,95.0,
52475,Marked,Soul Guardians,Kim Richardson,3.70,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,84.0,7.37
52476,Wayward Son,self-concluding,Tom Pollack,3.85,English,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,90.0,2.86


## Extract every author

In [122]:
authors = set()
for book in data_col_dropped.values:
  book_authors : str = book[2]
  for author in book_authors.split(', '):
    if "(" in author and "(Goodreads Author)" not in author:
      continue
    authors.add(author.removesuffix(" (Goodreads Author)"))
print(f'Total Authors: {len(authors)}')

Total Authors: 23195


## Simple example that show every book from a random author

In [123]:
from random import choice as rdc
author = rdc(list(authors))
print(author)
data_col_dropped[data_col_dropped["author"].str.contains(author, case=False, na=False)]

Pavol Rankov


Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
35319,Zdarzyło się pierwszego września (albo kiedy i...,self-concluding,Pavol Rankov,4.21,Polish,"['Historical Fiction', 'Fiction', 'War', 'Czec...",468,2013,97.0,


### Drop every row that has some NaN or null features and filter books that are in English and from them, remove books that doesnt have any genre

In [124]:
data_NaN_dropped = data_col_dropped.dropna()
data_english_sorted = data_NaN_dropped[(data_NaN_dropped["language"] == "English")]
data_genres_not_empty = data_english_sorted[data_english_sorted["genres"].apply(lambda x: x.removeprefix('[').removesuffix(']') != '')]
data_genres_not_empty

Unnamed: 0,title,series,author,rating,language,genres,pages,publishDate,likedPercent,price
0,The Hunger Games,The Hunger Games,Suzanne Collins,4.33,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,09/14/08,96.0,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling,4.50,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,09/28/04,98.0,7.38
4,Twilight,The Twilight Saga,Stephenie Meyer,3.60,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,09/06/06,78.0,2.1
5,The Book Thief,self-concluding,Markus Zusak,4.37,English,"['Historical Fiction', 'Fiction', 'Young Adult...",552,03/14/06,96.0,3.8
6,Animal Farm,self-concluding,George Orwell,3.95,English,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",141,04/28/96,91.0,4.42
...,...,...,...,...,...,...,...,...,...,...
52470,Attracted to Fire,self-concluding,DiAnn Mills,4.14,English,"['Christian Fiction', 'Christian', 'Suspense',...",416,October 1st 2011,95.0,5.55
52472,Unbelievable,Port Fare,Sherry Gammon,4.16,English,"['Romance', 'Young Adult', 'Contemporary', 'Co...",360,April 11th 2013,94.0,19.18
52475,Marked,Soul Guardians,Kim Richardson,3.70,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,84.0,7.37
52476,Wayward Son,self-concluding,Tom Pollack,3.85,English,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,90.0,2.86


#### Simple function made to extract the year from a date

In [125]:
def extract_year_from_date(date: str) -> str:
  if "/" in date:
    year = date[-2:]
    return f"20{year}" if int(year) < 24 else f"19{year}"
  else:
    return date[-4:]

## Create the column Publish Year and extract the year from Publish Date

In [126]:
data_genres_not_empty['publishYear'] = data_genres_not_empty['publishDate'].apply(extract_year_from_date)
data_genres_not_empty = data_genres_not_empty[['title', 'series', 'author', 'language', 'genres', 'pages', 'publishDate', 'publishYear', 'rating', 'likedPercent', 'price']]
data_genres_not_empty

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_genres_not_empty['publishYear'] = data_genres_not_empty['publishDate'].apply(extract_year_from_date)


Unnamed: 0,title,series,author,language,genres,pages,publishDate,publishYear,rating,likedPercent,price
0,The Hunger Games,The Hunger Games,Suzanne Collins,English,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,09/14/08,2008,4.33,96.0,5.09
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling,English,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,09/28/04,2004,4.50,98.0,7.38
4,Twilight,The Twilight Saga,Stephenie Meyer,English,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,09/06/06,2006,3.60,78.0,2.1
5,The Book Thief,self-concluding,Markus Zusak,English,"['Historical Fiction', 'Fiction', 'Young Adult...",552,03/14/06,2006,4.37,96.0,3.8
6,Animal Farm,self-concluding,George Orwell,English,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",141,04/28/96,1996,3.95,91.0,4.42
...,...,...,...,...,...,...,...,...,...,...,...
52470,Attracted to Fire,self-concluding,DiAnn Mills,English,"['Christian Fiction', 'Christian', 'Suspense',...",416,October 1st 2011,2011,4.14,95.0,5.55
52472,Unbelievable,Port Fare,Sherry Gammon,English,"['Romance', 'Young Adult', 'Contemporary', 'Co...",360,April 11th 2013,2013,4.16,94.0,19.18
52475,Marked,Soul Guardians,Kim Richardson,English,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,2011,3.70,84.0,7.37
52476,Wayward Son,self-concluding,Tom Pollack,English,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,2011,3.85,90.0,2.86


### To process words in ML they have to be remplaced by numbers, so to reach that, we need to associate a word to a number

#### Genre to Number and vice versa

In [127]:
genre_to_num = {}
num_to_genre = {}

total_genres = 0
visited = set()

for genres in data_genres_not_empty["genres"]:
  list_of_genres = genres.removeprefix('[').removesuffix(']').split(', ')
  for genre in list_of_genres:
    if genre not in visited:
      genre_cleanned = genre[1:][:-1]
      num_to_genre[total_genres] = genre_cleanned
      genre_to_num[genre_cleanned] = total_genres
      total_genres += 1
      visited.add(genre)

print(f'Total Genres: {total_genres}')
num_to_genre

Total Genres: 944


{0: 'Young Adult',
 1: 'Fiction',
 2: 'Dystopia',
 3: 'Fantasy',
 4: 'Science Fiction',
 5: 'Romance',
 6: 'Adventure',
 7: 'Teen',
 8: 'Post Apocalyptic',
 9: 'Action',
 10: 'Magic',
 11: 'Childrens',
 12: 'Audiobook',
 13: 'Middle Grade',
 14: 'Classics',
 15: 'Science Fiction Fantasy',
 16: 'Vampires',
 17: 'Paranormal',
 18: 'Paranormal Romance',
 19: 'Supernatural',
 20: 'Urban Fantasy',
 21: 'Historical Fiction',
 22: 'Historical',
 23: 'War',
 24: 'Holocaust',
 25: 'World War II',
 26: 'Books About Books',
 27: 'Literature',
 28: 'Politics',
 29: 'School',
 30: 'Novels',
 31: 'Read For School',
 32: 'Epic Fantasy',
 33: 'High Fantasy',
 34: 'Civil War',
 35: 'Historical Romance',
 36: 'Picture Books',
 37: 'Poetry',
 38: 'Juvenile',
 39: 'Kids',
 40: 'Short Stories',
 41: 'Gothic',
 42: '19th Century',
 43: 'Classic Literature',
 44: 'Japan',
 45: 'Adult',
 46: 'Asia',
 47: 'Adult Fiction',
 48: 'British Literature',
 49: 'High School',
 50: 'Plays',
 51: 'Drama',
 52: 'Philosop

In [134]:
data_clean = data_genres_not_empty.head(1000)
data_clean = data_clean.sample(frac=1).reset_index(drop=True)
data_clean = data_clean.drop("language", axis=1)
data_clean.to_csv('BestBooksEverClean.csv', index=None)

In [135]:
data_to_train = data_clean.drop("publishDate", axis=1)
data_to_train.to_csv('BestBooksEverClean_train_dataset.csv', index=None)