# Get Final Dataset

## Join dataset with ratings and dataset with book descriptions

Source: https://github.com/zygmuntz/goodbooks-10k/releases/tag/v1.0

In [1]:
import gensim
import logging
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import random

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
ratings_df = pd.read_csv('ratings.csv')
books_df = pd.read_csv('books.csv')

In [4]:
ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
ratings_df.shape

(5976479, 3)

In [6]:
#This code cell adds two columns of random numbers mapped from the user id. This is so the
#order of users can be shuffled. Shuffling data can help prevent the model from overfitting. 
# Idea noted from https://github.com/Santosh-Gupta/Lit2Vec/blob/master/GoodReadsDataClean.ipynb

mydict={n : n for n in random.sample(range(1, ratings_df['user_id'].max()+1), ratings_df['user_id'].max())}

randomColumn1 = dict()
i=1
for n in random.sample(range(1, ratings_df['user_id'].max()+1), ratings_df['user_id'].max()):
    randomColumn1.update({i:n})
    i=i+1
ratings_df['SortIndex1'] = ratings_df['user_id'].map(randomColumn1)
ratings_df['SortIndex2'] = np.random.randint(1, 500, ratings_df.shape[0])

In [7]:
ratings_df.head()

Unnamed: 0,user_id,book_id,rating,SortIndex1,SortIndex2
0,1,258,5,4101,451
1,2,4081,4,41929,262
2,2,260,5,41929,477
3,2,9296,5,41929,392
4,2,2318,3,41929,398


In [8]:
books_df.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [9]:
books_df = books_df.iloc[:,[0,1,5,6,9,11,12, 13]]

In [10]:
books_df.head()

Unnamed: 0,book_id,goodreads_book_id,isbn,isbn13,original_title,language_code,average_rating,ratings_count
0,1,2767052,439023483,9780439000000.0,The Hunger Games,eng,4.34,4780653
1,2,3,439554934,9780440000000.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479
2,3,41865,316015849,9780316000000.0,Twilight,en-US,3.57,3866839
3,4,2657,61120081,9780061000000.0,To Kill a Mockingbird,eng,4.25,3198671
4,5,4671,743273567,9780743000000.0,The Great Gatsby,eng,3.89,2683664


In [11]:
#cleaning: language code

As the name suggests ratings.csv contains all users’s ratings of the books (a total of 980k ratings, for 10,000 books, from 53,424 users), while books.csv contains more information on the books such as author, year, etc. book_tags contains all tag_ids users have assigned to that books and corresponding tag_counts, while tags.csv contains the tag_names corresponding to the tag_ids.
books_count is the number of editions for a given work.

In [12]:
books_df['language_code'].unique()

array(['eng', 'en-US', 'en-CA', nan, 'spa', 'en-GB', 'fre', 'nl', 'ara',
       'por', 'ger', 'nor', 'jpn', 'en', 'vie', 'ind', 'pol', 'tur',
       'dan', 'fil', 'ita', 'per', 'swe', 'rum', 'mul', 'rus'],
      dtype=object)

In [13]:
# replace 'en-US', 'en-CA', 'en-GB', 'en' with 'eng'
books_df['language_code'] = books_df['language_code'].replace(['en-US', 'en-CA', 'en-GB', 'en'],'eng')


In [14]:
books_df.head()

Unnamed: 0,book_id,goodreads_book_id,isbn,isbn13,original_title,language_code,average_rating,ratings_count
0,1,2767052,439023483,9780439000000.0,The Hunger Games,eng,4.34,4780653
1,2,3,439554934,9780440000000.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479
2,3,41865,316015849,9780316000000.0,Twilight,eng,3.57,3866839
3,4,2657,61120081,9780061000000.0,To Kill a Mockingbird,eng,4.25,3198671
4,5,4671,743273567,9780743000000.0,The Great Gatsby,eng,3.89,2683664


In [15]:
books_df = books_df[books_df['language_code']=='eng']

In [16]:
books_df = books_df.drop('language_code', axis=1)

In [17]:
ratingsbooks_df = pd.merge(ratings_df, books_df, on="book_id", how="left", sort=False)


In [18]:
ratingsbooks_df.shape

(5976479, 11)

In [19]:
ratingsbooks_df.head()

Unnamed: 0,user_id,book_id,rating,SortIndex1,SortIndex2,goodreads_book_id,isbn,isbn13,original_title,average_rating,ratings_count
0,1,258,5,4101,451,1232.0,143034901.0,9780143000000.0,La sombra del viento,4.24,263685.0
1,2,4081,4,41929,262,231.0,312424442.0,9780312000000.0,,3.4,19293.0
2,2,260,5,41929,477,4865.0,,9780672000000.0,How to Win Friends and Influence People,4.13,282623.0
3,2,9296,5,41929,392,4887.0,465016901.0,9780465000000.0,Das Drama des begabten Kindes und die Suche na...,4.09,9563.0
4,2,2318,3,41929,398,998.0,671015206.0,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0


In [20]:
ratingsbooks_df['isbn'].isna().sum()

586521

In [21]:
#how many unique books?
len(ratingsbooks_df['book_id'].unique())


10000

In [22]:
#upload book data with desc

book_desc_df = pd.read_csv('book_data_w_desc.csv')

In [32]:
book_desc_df.head()

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,book_desc_lang
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,sw,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780440000000.0,870 pages,4.48,2041594,en,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,
2,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9780060000000.0,324 pages,4.27,3745197,no,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,"Modern Library Classics, USA / CAN",Paperback,9780680000000.0,279 pages,4.25,2453620,it,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,
4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9780320000000.0,498 pages,3.58,4281268,en,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,


In [23]:
import spacy
from spacy_langdetect import LanguageDetector

2022-03-27 15:50:59.352736: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-27 15:50:59.352795: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [26]:
#pip install spacy-transformers

In [25]:
#! python -m spacy download en

In [27]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)
text = 'This is an english text.'
doc = nlp(text)
print(doc._.language)

{'language': 'en', 'score': 0.9999976278118194}


In [36]:
#assign new column stating the language of the book desciption:
book_desc_df['book_desc_lang'] = ''
book_desc_df.iloc[:,1] = book_desc_df.iloc[:,1].astype(str) #making sure the descriptions are strings
for i in range(book_desc_df.shape[0]):
    book_desc_df.iloc[i,-1] =  nlp(book_desc_df.iloc[i,1][:20])._.language['language'] 
    print(i/book_desc_df.shape[0])

0.0
1.8415867111102927e-05
3.6831734222205854e-05
5.524760133330878e-05
7.366346844441171e-05
9.207933555551463e-05
0.00011049520266661756
0.0001289110697777205
0.00014732693688882342
0.00016574280399992634
0.00018415867111102926
0.0002025745382221322
0.0002209904053332351
0.00023940627244433804
0.000257822139555441
0.0002762380066665439
0.00029465387377764683
0.00031306974088874973
0.0003314856079998527
0.0003499014751109556
0.00036831734222205853
0.0003867332093331614
0.0004051490764442644
0.00042356494355536733
0.0004419808106664702
0.0004603966777775732
0.00047881254488867607
0.000497228411999779
0.000515644279110882
0.0005340601462219849
0.0005524760133330878
0.0005708918804441907
0.0005893077475552937
0.0006077236146663966
0.0006261394817774995
0.0006445553488886025
0.0006629712159997054
0.0006813870831108083
0.0006998029502219112
0.0007182188173330142
0.0007366346844441171
0.00075505055155522
0.0007734664186663229
0.0007918822857774259
0.0008102981528885288
0.0008287140199996317

In [37]:
book_desc_df.head()

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,book_desc_lang
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,sw,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,sw
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780440000000.0,870 pages,4.48,2041594,en,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,en
2,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9780060000000.0,324 pages,4.27,3745197,no,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,en
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,"Modern Library Classics, USA / CAN",Paperback,9780680000000.0,279 pages,4.25,2453620,it,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,it
4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9780320000000.0,498 pages,3.58,4281268,en,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,en


In [38]:
#correcting a mistake
#temp_df = pd.read_csv('book_data_w_desc.csv')

In [40]:
#book_desc_df['book_review_count'] = temp_df['book_review_count']

In [41]:
book_desc_df

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,book_desc_lang
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9.78044E+12,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,sw
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9.78044E+12,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,en
2,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9.78006E+12,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,en
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,"Modern Library Classics, USA / CAN",Paperback,9.78068E+12,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,it
4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9.78032E+12,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54296,Howard Megdal,"In this fearless and half-crazy story, Howard ...",,Hardcover,9.78161E+12,256 pages,3.37,27,9,Taking the Field: A Fan's Quest to Run the Tea...,Sports|Baseball|Sports and Games|Sports|Nonfic...,https://images.gr-assets.com/books/1312074392l...,en
54297,Howard Megdal,From the icons of the game to the players who ...,,Hardcover,9.78006E+12,256 pages,3.97,34,5,"The Baseball Talmud: Koufax, Greenberg, and th...",Nonfiction|Sports and Games|Sports,https://images.gr-assets.com/books/1348841629l...,en
54298,Howard Megdal,,,Kindle Edition,,,3.66,32,3,"Wilpon's Folly - The Story of a Man, His Fortu...",Sports|Baseball|Abandoned,https://images.gr-assets.com/books/1394277097l...,tl
54299,Mimi Baird|Eve Claxton,"Soon to be a major motion picture, from Brad P...",,Hardcover,9.7808E+12,272 pages,3.82,867,187,He Wanted the Moon: The Madness and Medical Ge...,Nonfiction|Autobiography|Memoir|Biography|Psyc...,https://images.gr-assets.com/books/1403192135l...,en


In [42]:
#remove rows that have the value of book_desc_lang != 'en'
book_desc_df = book_desc_df[book_desc_df['book_desc_lang']=='en']

In [43]:
book_desc_df.shape

(37675, 13)

In [44]:
# split genres into a list
book_desc_df['genres'] = book_desc_df['genres'].str.split('|')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_desc_df['genres'] = book_desc_df['genres'].str.split('|')


In [45]:
#split book authors into a list
book_desc_df['book_authors'] = book_desc_df['book_authors'].str.split('|')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_desc_df['book_authors'] = book_desc_df['book_authors'].str.split('|')


In [46]:
book_desc_df = book_desc_df.iloc[:,[0,1,4,9,10]]

In [47]:
book_desc_df.head()

Unnamed: 0,book_authors,book_desc,book_isbn,book_title,genres
1,"[J.K. Rowling, Mary GrandPré]",There is a door at the end of a silent corrido...,9780440000000.0,Harry Potter and the Order of the Phoenix,"[Fantasy, Young Adult, Fiction]"
2,[Harper Lee],The unforgettable novel of a childhood in a sl...,9780060000000.0,To Kill a Mockingbird,"[Classics, Fiction, Historical, Historical Fic..."
4,[Stephenie Meyer],About three things I was absolutely positive.F...,9780320000000.0,Twilight,"[Young Adult, Fantasy, Romance, Paranormal, Va..."
6,"[C.S. Lewis, Pauline Baynes]","Journeys to the end of the world, fantastic cr...",9780070000000.0,The Chronicles of Narnia,"[Fantasy, Classics, Fiction, Young Adult, Chil..."
8,[Margaret Mitchell],Gone with the Wind is a novel written by Marga...,9780450000000.0,Gone with the Wind,"[Classics, Historical, Historical Fiction, Fic..."


In [48]:
#remove rows with any na values
book_desc_df = book_desc_df.dropna(subset = ['book_isbn'],axis='index') 

In [49]:
book_desc_df.shape

(30484, 5)

In [50]:
# Preprocessing of book descriptions

In [51]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vkanoria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## tokenize the words

In [52]:
book_desc_df['book_desc'] = book_desc_df['book_desc'].astype(str)

In [53]:
book_desc_df['book_desc_tok'] = book_desc_df['book_desc'].progress_apply(nltk.word_tokenize)

  0%|          | 0/30484 [00:00<?, ?it/s]

In [54]:
book_desc_df.head()

Unnamed: 0,book_authors,book_desc,book_isbn,book_title,genres,book_desc_tok
1,"[J.K. Rowling, Mary GrandPré]",There is a door at the end of a silent corrido...,9780440000000.0,Harry Potter and the Order of the Phoenix,"[Fantasy, Young Adult, Fiction]","[There, is, a, door, at, the, end, of, a, sile..."
2,[Harper Lee],The unforgettable novel of a childhood in a sl...,9780060000000.0,To Kill a Mockingbird,"[Classics, Fiction, Historical, Historical Fic...","[The, unforgettable, novel, of, a, childhood, ..."
4,[Stephenie Meyer],About three things I was absolutely positive.F...,9780320000000.0,Twilight,"[Young Adult, Fantasy, Romance, Paranormal, Va...","[About, three, things, I, was, absolutely, pos..."
6,"[C.S. Lewis, Pauline Baynes]","Journeys to the end of the world, fantastic cr...",9780070000000.0,The Chronicles of Narnia,"[Fantasy, Classics, Fiction, Young Adult, Chil...","[Journeys, to, the, end, of, the, world, ,, fa..."
8,[Margaret Mitchell],Gone with the Wind is a novel written by Marga...,9780450000000.0,Gone with the Wind,"[Classics, Historical, Historical Fiction, Fic...","[Gone, with, the, Wind, is, a, novel, written,..."


## Functions to clean the text:

In [55]:
# Remove non-ascii characters
def _removeNonAscii(s):
    s = str(s)
    return "".join(i for i in s if ord(i)<128)
 #ord() removes the Unicode code point for a one-character string

In [56]:
def make_lower_case(text):
  return text.lower()

In [57]:
def remove_punctuation(text):
  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  for element in text:
    if element in punc:
      text = text.replace(element, " ")
  return text

In [58]:
book_desc_df['book_desc_tok'] = book_desc_df['book_desc_tok'].apply(_removeNonAscii)
book_desc_df['book_desc_tok'] = book_desc_df['book_desc_tok'].apply(func = make_lower_case)
book_desc_df['book_desc_tok'] = book_desc_df['book_desc_tok'].apply(func=remove_punctuation)

In [59]:
book_desc_df.head()

Unnamed: 0,book_authors,book_desc,book_isbn,book_title,genres,book_desc_tok
1,"[J.K. Rowling, Mary GrandPré]",There is a door at the end of a silent corrido...,9780440000000.0,Harry Potter and the Order of the Phoenix,"[Fantasy, Young Adult, Fiction]",there is a door at the end...
2,[Harper Lee],The unforgettable novel of a childhood in a sl...,9780060000000.0,To Kill a Mockingbird,"[Classics, Fiction, Historical, Historical Fic...",the unforgettable novel of a ...
4,[Stephenie Meyer],About three things I was absolutely positive.F...,9780320000000.0,Twilight,"[Young Adult, Fantasy, Romance, Paranormal, Va...",about three things i was abso...
6,"[C.S. Lewis, Pauline Baynes]","Journeys to the end of the world, fantastic cr...",9780070000000.0,The Chronicles of Narnia,"[Fantasy, Classics, Fiction, Young Adult, Chil...",journeys to the end of the ...
8,[Margaret Mitchell],Gone with the Wind is a novel written by Marga...,9780450000000.0,Gone with the Wind,"[Classics, Historical, Historical Fiction, Fic...",gone with the wind is a no...


In [60]:
#join ratingsbooks_df and book_desc_df

final_df = pd.merge(ratingsbooks_df , book_desc_df, left_on='original_title', right_on = "book_title", how="inner", sort=False)


In [61]:
final_df.head()

Unnamed: 0,user_id,book_id,rating,SortIndex1,SortIndex2,goodreads_book_id,isbn,isbn13,original_title,average_rating,ratings_count,book_authors,book_desc,book_isbn,book_title,genres,book_desc_tok
0,2,2318,3,41929,398,998.0,671015206,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0,"[Thomas J. Stanley, William D. Danko]",The incredible national bestseller that is cha...,9780670000000.0,The Millionaire Next Door: The Surprising Secr...,"[Economics, Finance, Nonfiction, Business, Fin...",the incredible national bestseller ...
1,953,2318,5,31784,108,998.0,671015206,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0,"[Thomas J. Stanley, William D. Danko]",The incredible national bestseller that is cha...,9780670000000.0,The Millionaire Next Door: The Surprising Secr...,"[Economics, Finance, Nonfiction, Business, Fin...",the incredible national bestseller ...
2,979,2318,2,47245,187,998.0,671015206,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0,"[Thomas J. Stanley, William D. Danko]",The incredible national bestseller that is cha...,9780670000000.0,The Millionaire Next Door: The Surprising Secr...,"[Economics, Finance, Nonfiction, Business, Fin...",the incredible national bestseller ...
3,793,2318,3,27030,58,998.0,671015206,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0,"[Thomas J. Stanley, William D. Danko]",The incredible national bestseller that is cha...,9780670000000.0,The Millionaire Next Door: The Surprising Secr...,"[Economics, Finance, Nonfiction, Business, Fin...",the incredible national bestseller ...
4,190,2318,4,19863,454,998.0,671015206,9780671000000.0,The Millionaire Next Door: The Surprising Secr...,4.0,43937.0,"[Thomas J. Stanley, William D. Danko]",The incredible national bestseller that is cha...,9780670000000.0,The Millionaire Next Door: The Surprising Secr...,"[Economics, Finance, Nonfiction, Business, Fin...",the incredible national bestseller ...


In [62]:
final_df.shape

(6863793, 17)

In [63]:
#how many unique books?
len(final_df['book_id'].unique())

4439

In [64]:
final_df.to_csv('final_books_dataset_lang_clean.csv')

In [65]:
#How many observations (rows) without book descriptions?

final_df['book_desc'].isna().sum()


0

In [66]:
final_df['book_desc'].shape

(6863793,)